cui-llama.rn 1.1.2 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -2
- package/android/src/main/jni.cpp +26 -21
- package/cpp/common.cpp +2028 -1520
- package/cpp/common.h +134 -18
- package/cpp/ggml-aarch64.c +612 -0
- package/cpp/ggml-alloc.h +2 -2
- package/cpp/ggml-backend.c +33 -6
- package/cpp/ggml-backend.h +2 -0
- package/cpp/ggml-common.h +20 -0
- package/cpp/ggml-impl.h +4 -7
- package/cpp/ggml-metal.m +63 -2
- package/cpp/ggml-quants.c +690 -2
- package/cpp/ggml-quants.h +15 -0
- package/cpp/ggml.c +1650 -317
- package/cpp/ggml.h +155 -48
- package/cpp/llama-grammar.cpp +721 -122
- package/cpp/llama-grammar.h +120 -15
- package/cpp/llama-impl.h +132 -1
- package/cpp/llama-sampling.cpp +1361 -356
- package/cpp/llama-sampling.h +20 -48
- package/cpp/llama-vocab.cpp +140 -7
- package/cpp/llama-vocab.h +3 -2
- package/cpp/llama.cpp +810 -307
- package/cpp/llama.h +213 -259
- package/cpp/rn-llama.hpp +17 -14
- package/cpp/sampling.cpp +347 -355
- package/cpp/sampling.h +106 -135
- package/cpp/sgemm.cpp +153 -0
- package/package.json +1 -1
- package/cpp/grammar-parser.cpp +0 -539
- package/cpp/grammar-parser.h +0 -29
package/cpp/llama.cpp
CHANGED
@@ -1,6 +1,5 @@
 #include "llama-impl.h"
 #include "llama-vocab.h"
-#include "llama-grammar.h"
 #include "llama-sampling.h"
 
 #include "unicode.h"
@@ -223,6 +222,7 @@ enum llm_arch {
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
+    LLM_ARCH_RWKV6,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -270,6 +270,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_JAIS, "jais" },
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
+    { LLM_ARCH_RWKV6, "rwkv6" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -306,6 +307,9 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_RESCALE_EVERY_N_LAYERS,
+    LLM_KV_TIME_MIX_EXTRA_DIM,
+    LLM_KV_TIME_DECAY_EXTRA_DIM,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -341,6 +345,8 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,
     LLM_KV_SSM_DT_B_C_RMS,
 
+    LLM_KV_WKV_HEAD_SIZE,
+
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
@@ -400,11 +406,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
-    { LLM_KV_POOLING_TYPE
+    { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+    { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
+    { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
+    { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -440,6 +449,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
     { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
 
+    { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -529,6 +540,29 @@ enum llm_tensor {
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_TIME_MIX_W1,
+    LLM_TENSOR_TIME_MIX_W2,
+    LLM_TENSOR_TIME_MIX_LERP_X,
+    LLM_TENSOR_TIME_MIX_LERP_W,
+    LLM_TENSOR_TIME_MIX_LERP_K,
+    LLM_TENSOR_TIME_MIX_LERP_V,
+    LLM_TENSOR_TIME_MIX_LERP_R,
+    LLM_TENSOR_TIME_MIX_LERP_G,
+    LLM_TENSOR_TIME_MIX_FIRST,
+    LLM_TENSOR_TIME_MIX_DECAY,
+    LLM_TENSOR_TIME_MIX_DECAY_W1,
+    LLM_TENSOR_TIME_MIX_DECAY_W2,
+    LLM_TENSOR_TIME_MIX_KEY,
+    LLM_TENSOR_TIME_MIX_VALUE,
+    LLM_TENSOR_TIME_MIX_RECEPTANCE,
+    LLM_TENSOR_TIME_MIX_GATE,
+    LLM_TENSOR_TIME_MIX_LN,
+    LLM_TENSOR_TIME_MIX_OUTPUT,
+    LLM_TENSOR_CHANNEL_MIX_LERP_K,
+    LLM_TENSOR_CHANNEL_MIX_LERP_R,
+    LLM_TENSOR_CHANNEL_MIX_KEY,
+    LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
+    LLM_TENSOR_CHANNEL_MIX_VALUE,
     LLM_TENSOR_ATTN_Q_A,
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
@@ -1350,6 +1384,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_RWKV6,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
+            { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
+            { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" },
+            { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" },
+            { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
+            { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
+            { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
+            { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
+            { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
+            { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
+            { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+            { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
+            { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" },
+            { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
+            { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
+            { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2162,6 +2230,7 @@ enum e_model {
     MODEL_1B,
     MODEL_1_3B,
     MODEL_1_4B,
+    MODEL_1_6B,
     MODEL_2B,
     MODEL_2_8B,
     MODEL_3B,
@@ -2239,6 +2308,12 @@ struct llama_hparams {
     float f_attn_logit_softcapping = 50.0f;
     float f_final_logit_softcapping = 30.0f;
 
+    // for RWKV
+    uint32_t rescale_every_n_layers = 0;
+    uint32_t time_mix_extra_dim = 0;
+    uint32_t time_decay_extra_dim = 0;
+    uint32_t wkv_head_size = 0;
+
     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
@@ -2302,6 +2377,11 @@ struct llama_hparams {
         if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
         if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
 
+        if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
+        if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
+        if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
+        if (this->wkv_head_size != other.wkv_head_size) return true;
+
         if (this->dec_start_token_id != other.dec_start_token_id) return true;
 
         const float EPSILON = 1e-9f;
@@ -2365,15 +2445,25 @@ struct llama_hparams {
     }
 
     uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
-        // corresponds to Mamba's conv_states size
-
-
-
+        // corresponds to Mamba's conv_states size or RWKV's token_shift states size
+        if (wkv_head_size != 0) {
+            // for RWKV models
+            return 2 * n_embd;
+        } else {
+            // TODO: maybe support other convolution strides than 1
+            // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+            return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+        }
     }
 
     uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
-
-
+        if (wkv_head_size != 0) {
+            // corresponds to RWKV's wkv_states size
+            return n_embd * wkv_head_size;
+        } else {
+            // corresponds to Mamba's ssm_states size
+            return ssm_d_state * ssm_d_inner;
+        }
     }
 };
 
@@ -2384,8 +2474,8 @@ struct llama_cparams {
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
-
-
+    int n_threads; // number of threads to use for generation
+    int n_threads_batch; // number of threads to use for batch processing
 
     float rope_freq_base;
     float rope_freq_scale;
@@ -2512,6 +2602,36 @@ struct llama_layer {
     struct lm_ggml_tensor * ssm_conv1d_b;
     struct lm_ggml_tensor * ssm_dt_b;
 
+    // rwkv
+    struct lm_ggml_tensor * time_mix_w1;
+    struct lm_ggml_tensor * time_mix_w2;
+    struct lm_ggml_tensor * time_mix_lerp_x;
+    struct lm_ggml_tensor * time_mix_lerp_w;
+    struct lm_ggml_tensor * time_mix_lerp_k;
+    struct lm_ggml_tensor * time_mix_lerp_v;
+    struct lm_ggml_tensor * time_mix_lerp_r;
+    struct lm_ggml_tensor * time_mix_lerp_g;
+
+    struct lm_ggml_tensor * time_mix_first;
+    struct lm_ggml_tensor * time_mix_decay;
+    struct lm_ggml_tensor * time_mix_decay_w1;
+    struct lm_ggml_tensor * time_mix_decay_w2;
+    struct lm_ggml_tensor * time_mix_key;
+    struct lm_ggml_tensor * time_mix_value;
+    struct lm_ggml_tensor * time_mix_receptance;
+    struct lm_ggml_tensor * time_mix_gate;
+
+    struct lm_ggml_tensor * time_mix_ln;
+    struct lm_ggml_tensor * time_mix_ln_b;
+    struct lm_ggml_tensor * time_mix_output;
+
+    struct lm_ggml_tensor * channel_mix_lerp_k;
+    struct lm_ggml_tensor * channel_mix_lerp_r;
+
+    struct lm_ggml_tensor * channel_mix_key;
+    struct lm_ggml_tensor * channel_mix_receptance;
+    struct lm_ggml_tensor * channel_mix_value;
+
     // long rope factors
     struct lm_ggml_tensor * rope_long = nullptr;
     struct lm_ggml_tensor * rope_short = nullptr;
@@ -3069,7 +3189,6 @@ struct llama_sbatch {
 struct llama_context {
     llama_context(const llama_model & model)
         : model(model)
-        , sampling(llama_n_vocab(&model))
         , t_start_us(model.t_start_us)
         , t_load_us(model.t_load_us) {}
 
@@ -3086,7 +3205,6 @@ struct llama_context {
     const struct llama_model & model;
 
     struct llama_cparams cparams;
-    struct llama_sampling sampling;
     struct llama_sbatch sbatch;
     struct llama_kv_cache kv_self;
     struct llama_control_vector cvec;
@@ -3102,18 +3220,21 @@ struct llama_context {
 #endif
     lm_ggml_backend_t backend_cpu = nullptr;
 
+    lm_ggml_threadpool_t threadpool = nullptr;
+    lm_ggml_threadpool_t threadpool_batch = nullptr;
+
     bool has_evaluated_once = false;
 
-    int64_t t_start_us;
-    int64_t t_load_us;
-    int64_t t_p_eval_us = 0;
-    int64_t t_eval_us = 0;
+    mutable int64_t t_start_us;
+    mutable int64_t t_load_us;
+    mutable int64_t t_p_eval_us = 0;
+    mutable int64_t t_eval_us = 0;
 
-    int64_t t_compute_start_us = 0;
-    int64_t n_queued_tokens = 0;
+    mutable int64_t t_compute_start_us = 0;
+    mutable int64_t n_queued_tokens = 0;
 
-    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-    int32_t n_eval = 0; // number of eval calls
+    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+    mutable int32_t n_eval = 0; // number of eval calls
 
     // host buffer for the model output (logits and embeddings)
     lm_ggml_backend_buffer_t buf_output = nullptr;
@@ -3233,29 +3354,33 @@ static size_t llama_get_device_count(const llama_model & model) {
 static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     lm_ggml_backend_buffer_type_t buft = nullptr;
 
-#
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef LM_GGML_USE_RPC
     int rpc_count = (int)model.rpc_servers.size();
-
-
+#else
+    int rpc_count = 0;
+#endif
+    int local_gpu = gpu - rpc_count;
+#if defined(LM_GGML_USE_RPC)
+    if (gpu < rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu].c_str();
         return lm_ggml_backend_rpc_buffer_type(endpoint);
     }
 #endif
 #if defined(LM_GGML_USE_METAL)
     buft = lm_ggml_backend_metal_buffer_type();
 #elif defined(LM_GGML_USE_CUDA)
-    buft = lm_ggml_backend_cuda_buffer_type(
+    buft = lm_ggml_backend_cuda_buffer_type(local_gpu);
 #elif defined(LM_GGML_USE_VULKAN)
-    buft = lm_ggml_backend_vk_buffer_type(
+    buft = lm_ggml_backend_vk_buffer_type(local_gpu);
 #elif defined(LM_GGML_USE_SYCL)
-    buft = lm_ggml_backend_sycl_buffer_type(
+    buft = lm_ggml_backend_sycl_buffer_type(local_gpu);
 #elif defined(LM_GGML_USE_KOMPUTE)
-    buft = lm_ggml_backend_kompute_buffer_type(
+    buft = lm_ggml_backend_kompute_buffer_type(local_gpu);
     if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__,
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
     }
 #elif defined(LM_GGML_USE_CANN)
-    buft = lm_ggml_backend_cann_buffer_type(
+    buft = lm_ggml_backend_cann_buffer_type(local_gpu);
 #endif
 
     if (buft == nullptr) {
@@ -3263,7 +3388,7 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const lla
     }
     return buft;
     LM_GGML_UNUSED(model);
-    LM_GGML_UNUSED(
+    LM_GGML_UNUSED(local_gpu);
 }
 
 static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
@@ -3290,13 +3415,17 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
     }
 
 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef LM_GGML_USE_RPC
     int rpc_count = (int)model.rpc_servers.size();
-
+#else
+    int rpc_count = 0;
+#endif
+    int local_device = device - rpc_count;
+#if defined(LM_GGML_USE_RPC)
+    if (device < rpc_count) {
         size_t total;
         size_t free;
-        const char * endpoint = model.rpc_servers[device
+        const char * endpoint = model.rpc_servers[device].c_str();
         lm_ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
         return free;
     }
@@ -3304,28 +3433,28 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(LM_GGML_USE_CUDA)
     size_t total;
     size_t free;
-    lm_ggml_backend_cuda_get_device_memory(
+    lm_ggml_backend_cuda_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(LM_GGML_USE_SYCL)
     size_t total;
     size_t free;
-    lm_ggml_backend_sycl_get_device_memory(
+    lm_ggml_backend_sycl_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(LM_GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    lm_ggml_backend_vk_get_device_memory(
+    lm_ggml_backend_vk_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(LM_GGML_USE_CANN)
     size_t total;
    size_t free;
-    lm_ggml_backend_cann_get_device_memory(
+    lm_ggml_backend_cann_get_device_memory(local_device, &free, &total);
    return free;
 #else
    return 1;
 #endif
     LM_GGML_UNUSED(model);
-    LM_GGML_UNUSED(
+    LM_GGML_UNUSED(local_device);
 }
 
 //
@@ -3434,7 +3563,7 @@ static bool llama_kv_cache_find_slot(
     const uint32_t n_seq_tokens = batch.n_seq_tokens;
 
     if (cache.recurrent) {
-        // For recurrent state architectures (like Mamba),
+        // For recurrent state architectures (like Mamba or RWKV),
         // each cache cell can store the state for a whole sequence.
         // A slot should be always be contiguous.
 
@@ -3683,7 +3812,7 @@ static bool llama_kv_cache_seq_rm(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
-    // models like Mamba can't have a state partially erased
+    // models like Mamba or RWKV can't have a state partially erased
     if (cache.recurrent) {
         if (seq_id >= (int64_t) cache.size) {
             // could be fatal
@@ -3697,7 +3826,8 @@ static bool llama_kv_cache_seq_rm(
                 if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
                     return false;
                 }
-
+                // invalidate tails which will be cleared
+                if (p0 <= cell.pos && cell.pos < p1) {
                     tail_id = -1;
                 }
             }
@@ -3819,7 +3949,7 @@ static void llama_kv_cache_seq_add(
     if (p0 == p1) return;
 
     if (cache.recurrent) {
-        // for Mamba-like models, only the pos needs to be shifted
+        // for Mamba-like or RWKV models, only the pos needs to be shifted
         if (0 <= seq_id && seq_id < (int64_t) cache.size) {
             const int32_t tail_id = cache.cells[seq_id].tail;
             if (tail_id >= 0) {
@@ -3868,7 +3998,7 @@ static void llama_kv_cache_seq_div(
     if (p0 == p1) return;
 
     if (cache.recurrent) {
-        // for Mamba-like models, only the pos needs to be changed
+        // for Mamba-like or RWKV models, only the pos needs to be changed
         if (0 <= seq_id && seq_id < (int64_t) cache.size) {
             const int32_t tail_id = cache.cells[seq_id].tail;
             if (tail_id >= 0) {
@@ -4322,6 +4452,8 @@ struct llama_model_loader {
             case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
             case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+            case LM_GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
+            case LM_GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
             case LM_GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
             case LM_GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
             case LM_GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
@@ -5015,6 +5147,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
@@ -5059,6 +5193,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_1B: return "1B";
         case MODEL_1_3B: return "1.3B";
         case MODEL_1_4B: return "1.4B";
+        case MODEL_1_6B: return "1.6B";
         case MODEL_2B: return "2B";
         case MODEL_2_8B: return "2.8B";
         case MODEL_3B: return "3B";
@@ -5105,6 +5240,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
         case LLAMA_VOCAB_TYPE_BPE: return "BPE";
         case LLAMA_VOCAB_TYPE_WPM: return "WPM";
         case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
         default: return "unknown";
     }
 }
@@ -5801,6 +5937,26 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_RWKV6:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
+
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1_6B; break;
+                    case 32:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            case 4096: model.type = e_model::MODEL_7B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 61: model.type = e_model::MODEL_14B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -5930,6 +6086,15 @@ static void llm_load_vocab(
                 }
 #endif
             }
+        } else if (tokenizer_model == "rwkv") {
+            vocab.type = LLAMA_VOCAB_TYPE_RWKV;
+
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = -1;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
         } else {
             throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
         }
@@ -6061,6 +6226,12 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             vocab.tokenizer_add_bos = false;
             vocab.tokenizer_add_eos = true;
+        } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            vocab.tokenizer_add_space_prefix = false;
+            vocab.tokenizer_clean_spaces = false;
+            vocab.tokenizer_add_bos = false;
+            vocab.tokenizer_add_eos = false;
         } else {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }
@@ -6088,6 +6259,7 @@ static void llm_load_vocab(
 
     const uint32_t n_vocab = lm_gguf_get_arr_n(ctx, token_idx);
 
+    vocab.n_vocab = n_vocab;
     vocab.id_to_token.resize(n_vocab);
 
     for (uint32_t i = 0; i < n_vocab; i++) {
@@ -6165,6 +6337,10 @@ static void llm_load_vocab(
        }
     } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
         vocab.linefeed_id = vocab.special_pad_id;
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
+        const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
+        LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        vocab.linefeed_id = ids[0];
     } else {
         const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
         LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -6234,6 +6410,11 @@ static void llm_load_vocab(
                     )
                 ) {
                     vocab.special_eot_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                     break;
                 }
             }
@@ -6247,6 +6428,11 @@ static void llm_load_vocab(
             const auto & t = vocab.token_to_id.find("<|eom_id|>");
             if (t != vocab.token_to_id.end()) {
                 vocab.special_eom_id = t->second;
+                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                        __func__, t->first.c_str());
+                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
             }
         }
     }
@@ -7955,23 +8141,23 @@ static bool llm_load_tensors(
                     layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
 
                     layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                    layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
+                    layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                    layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
+                    layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                    layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
+                    layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                    layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
+                    layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                     layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
 
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-                    layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
+                    layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
+                    layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
+                    layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 }
             } break;
         case LLM_ARCH_T5:
@@ -8211,6 +8397,68 @@ static bool llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 }
             } break;
+        case LLM_ARCH_RWKV6:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // Block 0, LN0
+                model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+                // output
+                model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+
+                const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+                const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+                const int head_size = hparams.wkv_head_size;
+                const int attn_hidden_size = n_embd;
+                const int ffn_size = hparams.n_ff_arr[0];
+
+                for (int i = 0; i < n_layer; ++i) {
+                    lm_ggml_context * ctx_layer = ctx_for_layer(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                    layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
+
+                    layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5});
+                    layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5});
+
+                    layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1});
+                    layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1});
+                    layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
+                    layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1});
+                    layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
+                    layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});
+
+                    layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
+                    layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
+                    layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
+                    layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size});
+                    layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd});
+                    layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd});
+                    layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
+                    layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd});
+
+                    layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
+                    layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
+                    layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
+
+                    layer.channel_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
+                    layer.channel_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
+
+                    layer.channel_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size});
+                    layer.channel_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd});
+                    layer.channel_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd});
+                }
+
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -8495,8 +8743,7 @@ static void llm_build_kv_store(
 
     LM_GGML_ASSERT(kv.size == n_ctx);
 
-    struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
-            (lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
+    struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head);
     cb(k_cache_view, "k_cache_view", il);
 
     // note: storing RoPE-ed version of K in the KV cache
@@ -8507,8 +8754,7 @@ static void llm_build_kv_store(
     struct lm_ggml_tensor * v_cache_view = nullptr;
 
     if (cparams.flash_attn) {
-        v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
-                (kv_head)*lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+        v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head);
     } else {
         // note: the V cache is transposed when not using flash attention
         v_cache_view = lm_ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
@@ -8995,8 +9241,7 @@ static struct lm_ggml_tensor * llm_build_kv(
 
     struct lm_ggml_tensor * cur;
 
-    cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b,
-            q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
+    cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
 
     return cur;
@@ -9170,6 +9415,171 @@ static struct lm_ggml_tensor * llm_build_mamba(
     return cur;
 }
 
+static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
+        struct llama_context & lctx,
+        struct lm_ggml_context * ctx,
+        const struct llama_layer * layer,
+        struct lm_ggml_tensor * cur,
+        struct lm_ggml_tensor * x_prev,
+        struct lm_ggml_tensor ** wkv_state) {
+    size_t n_embed = cur->ne[0];
+    size_t n_seq_tokens = cur->ne[1];
+    size_t n_seqs = cur->ne[2];
+
+    size_t head_size = layer->time_mix_first->ne[0];
+    size_t head_count = layer->time_mix_first->ne[1];
+
+    size_t n_tokens = n_seqs * n_seq_tokens;
+
+    struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
+
+    sx = lm_ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
+    cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+
+    struct lm_ggml_tensor * xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
+
+    xxx = lm_ggml_reshape_4d(
+        ctx,
+        lm_ggml_tanh(
+            ctx,
+            lm_ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
+        ),
+        layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
+    );
+
+    xxx = lm_ggml_cont(ctx, lm_ggml_permute(ctx, xxx, 0, 1, 3, 2));
+
+    xxx = lm_ggml_mul_mat(
+        ctx,
+        lm_ggml_reshape_4d(
+            ctx,
+            layer->time_mix_w2,
+            layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5
+        ),
+        xxx
+    );
+
+    struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
+    struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
+    struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
+    struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
+    struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
+
+    struct lm_ggml_tensor * xw = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mw, layer->time_mix_lerp_w),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * xk = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mk, layer->time_mix_lerp_k),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * xv = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mv, layer->time_mix_lerp_v),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * xr = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mr, layer->time_mix_lerp_r),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * xg = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mg, layer->time_mix_lerp_g),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * r = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
+    struct lm_ggml_tensor * k = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
+    struct lm_ggml_tensor * v = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
+    struct lm_ggml_tensor * g = lm_ggml_silu(
+        ctx,
+        llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
+    );
+
+    struct lm_ggml_tensor * w = lm_ggml_mul_mat(
+        ctx,
+        layer->time_mix_decay_w2,
+        lm_ggml_tanh(
+            ctx,
+            lm_ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
+        )
+    );
+
+    w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
+    w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
+    w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
+
+    k = lm_ggml_transpose(ctx, k);
+    v = lm_ggml_transpose(ctx, v);
+    r = lm_ggml_transpose(ctx, r);
+
+    struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+    cur = lm_ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
+    *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
+
+    // group norm with head_count groups
+    cur = lm_ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
+    cur = lm_ggml_norm(ctx, cur, 64e-5f);
+
+    // Convert back to regular vectors.
+    cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+    cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
+
+    cur = lm_ggml_mul(ctx, cur, g);
+    cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
+
+    return lm_ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
+}
+
+static struct lm_ggml_tensor * llm_build_rwkv6_channel_mix(
+        struct llama_context & lctx,
+        struct lm_ggml_context * ctx,
+        const struct llama_layer * layer,
+        struct lm_ggml_tensor * cur,
+        struct lm_ggml_tensor * x_prev) {
+    struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
+    struct lm_ggml_tensor * xk = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
+    struct lm_ggml_tensor * xr = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
+
+    struct lm_ggml_tensor * r = lm_ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
+    struct lm_ggml_tensor * k = lm_ggml_sqr(
+        ctx,
+        lm_ggml_relu(
+            ctx,
+            llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
+        )
+    );
+
+    return lm_ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
+}
+
 struct llm_build_context {
     const llama_model & model;
     llama_context & lctx;
@@ -13790,7 +14200,9 @@ struct llm_build_context {
             {
                 // compute Q and K and RoPE them
                 struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-
+                if (model.layers[il].wq_scale) {
+                    Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+                }
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -13799,7 +14211,9 @@ struct llm_build_context {
 
                 // B1.K
                 struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
-
+                if (model.layers[il].wk_scale) {
+                    Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+                }
                 cb(Kcur, "Kcur", il);
                 if (model.layers[il].bk) {
                     Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -13808,7 +14222,9 @@ struct llm_build_context {
 
                 // B1.V
                 struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
-
+                if (model.layers[il].wv_scale) {
+                    Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+                }
                 cb(Vcur, "Vcur", il);
                 if (model.layers[il].bv) {
                     Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -13839,7 +14255,9 @@ struct llm_build_context {
                 cb(cur, "attn_sub_norm", il);
 
                 cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
-
+                if (model.layers[il].wo_scale) {
+                    cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+                }
                 if (model.layers[il].bo) {
                     cur = lm_ggml_add(ctx0, cur, model.layers[il].bo);
                 }
@@ -13876,7 +14294,9 @@ struct llm_build_context {
             cb(cur, "ffn_sub_norm", il);
 
             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
-
+            if (model.layers[il].ffn_down_scale) {
+                cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+            }
             cb(cur, "ffn_down", il);
 
             cur = lm_ggml_add(ctx0, cur, ffn_inp);
@@ -14691,6 +15111,117 @@ struct llm_build_context {
 
         return gf;
     }
+
+    lm_ggml_cgraph * build_rwkv6() {
+        lm_ggml_cgraph *gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // Token shift state dimensions should be 2 * n_emb
+        LM_GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
+
+        const int64_t n_seqs = batch.n_seqs;
+        const int64_t n_seq_tokens = batch.n_seq_tokens;
+        const int64_t n_tokens = batch.n_tokens;
+        LM_GGML_ASSERT(n_seqs != 0);
+        LM_GGML_ASSERT(batch.equal_seqs);
+        LM_GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+        struct lm_ggml_tensor * state_copy = build_inp_s_copy();
+        struct lm_ggml_tensor * state_mask = build_inp_s_mask();
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            const llama_layer * layer = &model.layers[il];
+
+            // (ab)using the KV cache to store the states
+            struct lm_ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
+                    gf, kv_self.k_l[il], state_copy, state_mask,
+                    hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
+            struct lm_ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
+                    gf, kv_self.v_l[il], state_copy, state_mask,
+                    hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
+
+            cur = lm_ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+            token_shift = lm_ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);
+
+            struct lm_ggml_tensor * att_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+            struct lm_ggml_tensor * ffn_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * lm_ggml_element_size(token_shift));
+
+            struct lm_ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
+            struct lm_ggml_tensor * x_prev = lm_ggml_concat(
+                ctx0,
+                att_shift,
+                lm_ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
+                1
+            );
+
+            cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
+            lm_ggml_build_forward_expand(gf, cur);
+            lm_ggml_build_forward_expand(
+                gf,
+                lm_ggml_cpy(
+                    ctx0,
+                    wkv_states,
+                    lm_ggml_view_1d(
+                        ctx0,
+                        kv_self.v_l[il],
+                        hparams.n_embd_v_s() * n_seqs,
+                        hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
+                    )
+                )
+            );
+
+            struct lm_ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
+            x_prev = lm_ggml_concat(
+                ctx0,
+                ffn_shift,
+                lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
+                1
+            );
+            cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
+            lm_ggml_build_forward_expand(gf, cur);
+
+            struct lm_ggml_tensor * last_norm_att = lm_ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_att));
+            struct lm_ggml_tensor * last_norm_ffn = lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_ffn));
+
+            token_shift = lm_ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
+
+            lm_ggml_build_forward_expand(
+                gf,
+                lm_ggml_cpy(
+                    ctx0,
+                    lm_ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
+                    lm_ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * lm_ggml_element_size(kv_self.k_l[il]))
+                )
+            );
+
+            if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
+                cur = lm_ggml_scale(ctx0, cur, 0.5F);
+            }
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+        cur = lm_ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+        cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+
+        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -14937,6 +15468,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
|
|
14937
15468
|
{
|
14938
15469
|
result = llm.build_exaone();
|
14939
15470
|
} break;
|
15471
|
+
case LLM_ARCH_RWKV6:
|
15472
|
+
{
|
15473
|
+
result = llm.build_rwkv6();
|
15474
|
+
} break;
|
14940
15475
|
default:
|
14941
15476
|
LM_GGML_ABORT("fatal error");
|
14942
15477
|
}
|
@@ -15505,9 +16040,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
|
|
15505
16040
|
}
|
15506
16041
|
|
15507
16042
|
static void llama_graph_compute(
|
15508
|
-
|
15509
|
-
|
15510
|
-
|
16043
|
+
llama_context & lctx,
|
16044
|
+
lm_ggml_cgraph * gf,
|
16045
|
+
int n_threads,
|
16046
|
+
lm_ggml_threadpool * threadpool) {
|
15511
16047
|
#ifdef LM_GGML_USE_METAL
|
15512
16048
|
if (lm_ggml_backend_is_metal(lctx.backend_metal)) {
|
15513
16049
|
lm_ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
@@ -15516,6 +16052,7 @@ static void llama_graph_compute(
|
|
15516
16052
|
|
15517
16053
|
if (lctx.backend_cpu != nullptr) {
|
15518
16054
|
lm_ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
16055
|
+
lm_ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
15519
16056
|
lm_ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
15520
16057
|
}
|
15521
16058
|
#ifdef LM_GGML_USE_BLAS
|
@@ -15550,6 +16087,13 @@ static int llama_decode_internal(
|
|
15550
16087
|
return -1;
|
15551
16088
|
}
|
15552
16089
|
|
16090
|
+
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
16091
|
+
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
|
16092
|
+
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
|
16093
|
+
return -1;
|
16094
|
+
}
|
16095
|
+
}
|
16096
|
+
|
15553
16097
|
const auto & model = lctx.model;
|
15554
16098
|
const auto & hparams = model.hparams;
|
15555
16099
|
const auto & cparams = lctx.cparams;
|
@@ -15636,6 +16180,8 @@ static int llama_decode_internal(
|
|
15636
16180
|
}
|
15637
16181
|
|
15638
16182
|
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
16183
|
+
lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
|
16184
|
+
|
15639
16185
|
LM_GGML_ASSERT(n_threads > 0);
|
15640
16186
|
|
15641
16187
|
// non-causal masks do not use the KV cache
|
@@ -15697,7 +16243,7 @@ static int llama_decode_internal(
|
|
15697
16243
|
|
15698
16244
|
llama_set_inputs(lctx, ubatch);
|
15699
16245
|
|
15700
|
-
llama_graph_compute(lctx, gf, n_threads);
|
16246
|
+
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
15701
16247
|
|
15702
16248
|
// update the kv ring buffer
|
15703
16249
|
{
|
@@ -15840,6 +16386,13 @@ static int llama_encode_internal(
|
|
15840
16386
|
return -1;
|
15841
16387
|
}
|
15842
16388
|
|
16389
|
+
for (uint32_t i = 0; i < n_tokens; ++i) {
|
16390
|
+
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
|
16391
|
+
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
|
16392
|
+
return -1;
|
16393
|
+
}
|
16394
|
+
}
|
16395
|
+
|
15843
16396
|
const auto & model = lctx.model;
|
15844
16397
|
const auto & hparams = model.hparams;
|
15845
16398
|
const auto & cparams = lctx.cparams;
|
@@ -15874,7 +16427,9 @@ static int llama_encode_internal(
     lctx.inp_embd_enc = NULL;
     lctx.n_outputs = n_tokens;
 
-    const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
     LM_GGML_ASSERT(n_threads > 0);
 
     lm_ggml_backend_sched_reset(lctx.sched);
@@ -15906,7 +16461,7 @@ static int llama_encode_internal(
 
     llama_set_inputs(lctx, ubatch);
 
-    llama_graph_compute(lctx, gf, n_threads);
+    llama_graph_compute(lctx, gf, n_threads, threadpool);
 
     // extract embeddings
     if (embd) {
@@ -16188,7 +16743,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
     lm_ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 #endif
 
     //const int64_t t_end = lm_ggml_time_us();
@@ -16214,7 +16769,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
         llama_set_k_shift(lctx);
 
-        llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 
         need_reserve = true;
     }
@@ -16425,6 +16980,9 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
                 new_type == LM_GGML_TYPE_Q4_0_8_8) {
                 new_type = LM_GGML_TYPE_Q4_0;
             }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
+                new_type = LM_GGML_TYPE_Q4_K;
+            }
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -16624,6 +17182,8 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
     }
     if (convert_incompatible_tensor) {
         switch (new_type) {
+            case LM_GGML_TYPE_TQ1_0:
+            case LM_GGML_TYPE_TQ2_0:  new_type = LM_GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
             case LM_GGML_TYPE_IQ2_XXS:
             case LM_GGML_TYPE_IQ2_XS:
             case LM_GGML_TYPE_IQ2_S:
@@ -16729,6 +17289,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = LM_GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K:   default_type = LM_GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_TQ1_0:  default_type = LM_GGML_TYPE_TQ1_0; break;
+        case LLAMA_FTYPE_MOSTLY_TQ2_0:  default_type = LM_GGML_TYPE_TQ2_0; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = LM_GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = LM_GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = LM_GGML_TYPE_IQ2_XS;  break;
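Together with the matching ftype additions in llama.h, the two new cases above make the ternary types selectable as quantization targets. A hedged sketch of requantizing a GGUF file to TQ2_0 (file names and thread count are placeholders, not part of the diff):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_TQ2_0;   // or LLAMA_FTYPE_MOSTLY_TQ1_0
    qparams.nthread = 4;
    if (llama_model_quantize("model-f16.gguf", "model-tq2_0.gguf", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
    }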
@@ -16833,7 +17395,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight")   != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos) {
+            name.find("attn_qkv.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight")!= std::string::npos) {
             ++qs.n_attention_wv;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
@@ -16974,6 +17537,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
 
+        // do not quantize RWKV's time_mix_first tensors
+        quantize &= name.find("time_mix_first.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
@@ -17357,7 +17925,6 @@ struct llama_model_params llama_model_default_params() {
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed    =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx   =*/ 512,
         /*.n_batch =*/ 2048,
         /*.n_ubatch =*/ 512,
@@ -17390,6 +17957,14 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }
 
+struct llama_sampler_chain_params llama_sampler_chain_default_params() {
+    struct llama_sampler_chain_params result = {
+        /*.no_perf =*/ true,
+    };
+
+    return result;
+}
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread =*/ 0,
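llama_sampler_chain_default_params() is the entry point of the new sampler-chain API that replaces the removed llama_sample_* wrappers (see the @@ -20044 hunk further down). A hedged usage sketch, assuming the chain and sampler constructors declared in the bundled llama.h (`ctx` is an existing llama_context):

    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false;                                 // keep per-chain timing counters

    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    llama_token tok = llama_sampler_sample(smpl, ctx, -1);   // sample from the last set of logits
    llama_sampler_free(smpl);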
@@ -17461,6 +18036,19 @@ void llama_numa_init(enum lm_ggml_numa_strategy numa) {
     }
 }
 
+void llama_attach_threadpool(
+             struct llama_context * ctx,
+        lm_ggml_threadpool_t   threadpool,
+        lm_ggml_threadpool_t   threadpool_batch) {
+    ctx->threadpool       = threadpool;
+    ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+    ctx->threadpool       = nullptr;
+    ctx->threadpool_batch = nullptr;
+}
+
 void llama_backend_free(void) {
     lm_ggml_quantize_free();
 }
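llama_attach_threadpool()/llama_detach_threadpool() let the host application own the worker threads instead of having each llama_graph_compute() call spawn them. A hedged sketch, assuming the lm_ggml_threadpool_* helpers exposed by the bundled ggml.h (thread count and names are illustrative):

    struct lm_ggml_threadpool_params tpp = lm_ggml_threadpool_params_default(8);  // 8 workers
    lm_ggml_threadpool_t tp = lm_ggml_threadpool_new(&tpp);

    llama_attach_threadpool(ctx, tp, /*threadpool_batch =*/ nullptr);  // batch path falls back to tp
    // ... llama_decode()/llama_encode() calls now reuse the attached pool ...
    llama_detach_threadpool(ctx);
    lm_ggml_threadpool_free(tp);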
@@ -17630,10 +18218,6 @@ struct llama_context * llama_new_context_with_model(
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
     LLAMA_LOG_INFO("%s: n_ctx    = %u\n", __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch  = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
@@ -17644,10 +18228,10 @@ struct llama_context * llama_new_context_with_model(
     ctx->abort_callback      = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;
 
-    ctx->sampling.rng = std::mt19937(params.seed);
-    ctx->logits_all   = params.logits_all;
+    ctx->logits_all = params.logits_all;
+
     // build worst-case graph for encoder if a model contains encoder
-    ctx->is_encoding  = llama_model_has_encoder(model);
+    ctx->is_encoding = llama_model_has_encoder(model);
 
     uint32_t kv_size = cparams.n_ctx;
     lm_ggml_type type_k = params.type_k;
@@ -17667,6 +18251,20 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
+#if defined(LM_GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+#endif
+
 #if defined(LM_GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = lm_ggml_backend_metal_init();
@@ -17791,19 +18389,6 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
 
-#if defined(LM_GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#endif
         ctx->backend_cpu = lm_ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -17924,14 +18509,6 @@ void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
 
-const struct llama_model * llama_get_model(const struct llama_context * ctx) {
-    return &ctx->model;
-}
-
-const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
-    return &ctx->model.vocab;
-}
-
 uint32_t llama_n_ctx(const struct llama_context * ctx) {
     return ctx->cparams.n_ctx;
 }
@@ -17952,6 +18529,30 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
     return model->vocab.type;
 }
 
+int32_t llama_n_vocab(const struct llama_model * model) {
+    return model->hparams.n_vocab;
+}
+
+int32_t llama_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
+int32_t llama_n_embd(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
+int32_t llama_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
+const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+    return &ctx->model;
+}
+
+enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+    return ctx->cparams.pooling_type;
+}
+
 enum llama_rope_type llama_rope_type(const struct llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
@@ -17965,6 +18566,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_T5:
         case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_JAIS:
+        case LLM_ARCH_RWKV6:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18014,26 +18616,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
     return LLAMA_ROPE_TYPE_NONE;
 }
 
-enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
-    return ctx->cparams.pooling_type;
-}
-
-int32_t llama_n_vocab(const struct llama_model * model) {
-    return model->hparams.n_vocab;
-}
-
-int32_t llama_n_ctx_train(const struct llama_model * model) {
-    return model->hparams.n_ctx_train;
-}
-
-int32_t llama_n_embd(const struct llama_model * model) {
-    return model->hparams.n_embd;
-}
-
-int32_t llama_n_layer(const struct llama_model * model) {
-    return model->hparams.n_layer;
-}
-
 float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
@@ -18133,6 +18715,7 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) {
 bool llama_model_is_recurrent(const struct llama_model * model) {
     switch (model->arch) {
         case LLM_ARCH_MAMBA:  return true;
+        case LLM_ARCH_RWKV6:  return true;
         default:              return false;
     }
 }
@@ -18449,14 +19032,14 @@ struct llama_data_write {
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
 
-    void write_rng(const std::mt19937 & rng) {
-        std::ostringstream rng_ss;
-        rng_ss << rng;
+    //void write_rng(const std::mt19937 & rng) {
+    //    std::ostringstream rng_ss;
+    //    rng_ss << rng;
 
-        const std::string & rng_str = rng_ss.str();
+    //    const std::string & rng_str = rng_ss.str();
 
-        write_string(rng_str);
-    }
+    //    write_string(rng_str);
+    //}
 
     void write_output_ids(struct llama_context * ctx) {
         llama_output_reorder(ctx);
@@ -18676,17 +19259,17 @@ struct llama_data_read {
         // TODO: add more info which needs to be identical but which is not verified otherwise
     }
 
-    void read_rng(std::mt19937 & rng) {
-        std::string rng_str;
-        read_string(rng_str);
+    //void read_rng(std::mt19937 & rng) {
+    //    std::string rng_str;
+    //    read_string(rng_str);
 
-        std::istringstream rng_ss(rng_str);
-        rng_ss >> rng;
+    //    std::istringstream rng_ss(rng_str);
+    //    rng_ss >> rng;
 
-        if (rng_ss.fail()) {
-            throw std::runtime_error("failed to load RNG state");
-        }
-    }
+    //    if (rng_ss.fail()) {
+    //        throw std::runtime_error("failed to load RNG state");
+    //    }
+    //}
 
     void read_output_ids(struct llama_context * ctx) {
         std::vector<int32_t> output_pos;
@@ -19116,8 +19699,6 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da
 
     data_ctx.write_model_info(ctx);
 
-    data_ctx.write_rng(ctx->sampling.rng);
-
     // copy outputs
     data_ctx.write_output_ids(ctx);
     data_ctx.write_logits(ctx);
@@ -19155,9 +19736,6 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da
 
     data_ctx.read_model_info(ctx);
 
-    // set rng
-    data_ctx.read_rng(ctx->sampling.rng);
-
     // set outputs
     data_ctx.read_output_ids(ctx);
     data_ctx.read_logits(ctx);
@@ -19377,16 +19955,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     }
 }
 
-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
     ctx->cparams.n_threads       = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
-uint32_t llama_n_threads(struct llama_context * ctx) {
+int32_t llama_n_threads(struct llama_context * ctx) {
     return ctx->cparams.n_threads;
 }
 
-uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+int32_t llama_n_threads_batch(struct llama_context * ctx) {
     return ctx->cparams.n_threads_batch;
 }
 
@@ -19560,8 +20138,9 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
         LM_GGML_ABORT("fatal error");
-#endif
+#else
         return nullptr;
+#endif
     }
 }
 
@@ -19609,8 +20188,9 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
         LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
         LM_GGML_ABORT("fatal error");
-#endif
+#else
         return nullptr;
+#endif
     }
 }
 
@@ -20044,128 +20624,18 @@ int32_t llama_chat_apply_template(
 }
 
 //
-// grammar
+// sampling
 //
 
-struct llama_grammar * llama_grammar_init(
-        const llama_grammar_element ** rules,
-        size_t    n_rules,
-        size_t    start_rule_index) {
-    return llama_grammar_init_impl(rules, n_rules, start_rule_index);
-}
-
-void llama_grammar_free(struct llama_grammar * grammar) {
-    llama_grammar_free_impl(grammar);
-}
-
-struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    return llama_grammar_copy_impl(grammar);
-}
-
-void llama_grammar_sample(
-        const struct llama_grammar * grammar,
-        const struct llama_context * ctx,
-        llama_token_data_array * candidates) {
-    llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates);
-}
-
-void llama_sample_grammar(
-        struct llama_context * ctx,
-        llama_token_data_array * candidates,
-        const struct llama_grammar * grammar) {
-    llama_grammar_sample(grammar, ctx, candidates);
-}
-
-void llama_grammar_accept_token(
-        struct llama_grammar * grammar,
-        struct llama_context * ctx,
-        llama_token token) {
-    llama_grammar_accept_token_impl(grammar, &ctx->model.vocab, &ctx->sampling, token);
+// TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp
+struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
+    return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
 }
 
 //
-// sampling
+// model split
 //
 
-void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
-    llama_set_rng_seed_impl(&ctx->sampling, seed);
-}
-
-void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
-    llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates);
-}
-
-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
-    llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep);
-}
-
-void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
-}
-
-void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
-}
-
-void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates, float xtc_threshold, float xtc_probability, size_t min_keep, std::mt19937 rng){
-    llama_sample_xtc_impl(ctx ? &ctx-> sampling: nullptr, candidates, xtc_threshold, xtc_probability, min_keep, rng);
-}
-
-void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
-    llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
-}
-
-void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    llama_sample_typical_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
-}
-
-void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
-    llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val);
-}
-
-void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
-}
-
-void llama_sample_repetition_penalties(
-        struct llama_context * ctx,
-        llama_token_data_array * candidates,
-        const llama_token * last_tokens,
-        size_t penalty_last_n,
-        float penalty_repeat,
-        float penalty_freq,
-        float penalty_present) {
-    llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
-}
-
-void llama_sample_apply_guidance(
-        struct llama_context * ctx,
-        float * logits,
-        float * logits_guidance,
-        float scale) {
-    llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale);
-}
-
-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
-    return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu);
-}
-
-llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu);
-}
-
-llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
-    return llama_sample_token_greedy_impl(ctx ? &ctx->sampling : nullptr, candidates);
-}
-
-llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
-    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng);
-}
-
-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
-    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng);
-}
-
 int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
     static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
     if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
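Grammar-constrained generation survives the removal above, but only through the sampler-chain API and the new llama_sampler_init_grammar() shown in this hunk. A hedged sketch (the GBNF string and variable names are illustrative; `model` and `ctx` are an existing llama_model and llama_context):

    const char * gbnf = "root ::= \"yes\" | \"no\"";

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_grammar(model, gbnf, "root"));
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());

    llama_token tok = llama_sampler_sample(chain, ctx, -1);  // constrained to tokens the grammar allows
    llama_sampler_free(chain);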
@@ -20190,45 +20660,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
     return 0;
 }
 
-struct llama_timings llama_get_timings(struct llama_context * ctx) {
-    struct llama_timings result = {
-        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms    =*/ 1.00 * lm_ggml_time_ms(),
-        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us,
-        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_sample =*/ std::max(1, ctx->sampling.n_sample),
-        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval   =*/ std::max(1, ctx->n_eval),
-    };
-
-    return result;
-}
-
-void llama_print_timings(struct llama_context * ctx) {
-    const llama_timings timings = llama_get_timings(ctx);
-
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
-}
-
-void llama_reset_timings(struct llama_context * ctx) {
-    ctx->t_start_us  = lm_ggml_time_us();
-    ctx->t_eval_us   = ctx->n_eval = 0;
-    ctx->t_p_eval_us = ctx->n_p_eval = 0;
-
-    ctx->sampling.reset_timings();
-}
-
 const char * llama_print_system_info(void) {
     static std::string s;
 
@@ -20257,7 +20688,68 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto * p = (const struct llama_context *) ctx;
+
+                const double t_start_ms  = 1e-3 * p->t_start_us;
+                const double t_end_ms    = 1.00 * lm_ggml_time_ms();
+                const double t_load_ms   = 1e-3 * p->t_load_us;
+                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+                const double t_eval_ms   = 1e-3 * p->t_eval_us;
+
+                const int32_t n_p_eval = std::max(0, p->n_p_eval);
+                const int32_t n_eval   = std::max(1, p->n_eval);
+
+                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
+                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+                LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
+                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx;
+                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
+
+                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+
+                const int32_t n_sampler = std::max(0, p->n_sample);
+
+                LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+            } break;
+        default:
+            LM_GGML_ABORT("invalid perf type");
+    }
+}
+
+void llama_perf_reset(void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                auto * p = (struct llama_context *) ctx;
+
+                p->t_start_us  = lm_ggml_time_us();
+                p->t_eval_us   = p->n_eval = 0;
+                p->t_p_eval_us = p->n_p_eval = 0;
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                auto * smpl = (struct llama_sampler *) ctx;
+                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+
+                p->t_sample_us = p->n_sample = 0;
+            } break;
+        default:
+            LM_GGML_ABORT("invalid perf type");
+    }
+}
+
+void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
     fprintf(stream, "\n");
     fprintf(stream, "###########\n");
     fprintf(stream, "# Timings #\n");
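llama_perf_print()/llama_perf_reset() replace the removed llama_print_timings()/llama_reset_timings(); the untyped first argument is disambiguated by the enum. A usage sketch (`ctx` is an existing llama_context *, `smpl` a sampler chain created with llama_sampler_chain_init; both are assumed to exist already):

    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);        // load / prompt-eval / eval timings
    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);  // sampling timings

    llama_perf_reset(ctx,  LLAMA_PERF_TYPE_CONTEXT);        // start a fresh measurement window
    llama_perf_reset(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);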
@@ -20268,21 +20760,15 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
             1.0e-3 * ctx->t_eval_us / ctx->n_eval);
     fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
             1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
-    fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
-            1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample);
     fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
     fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
-    fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->sampling.n_sample);
     fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
     fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
     fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
-    fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->sampling.t_sample_us);
     fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
             1.0e6 * ctx->n_eval / ctx->t_eval_us);
     fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
             1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
-    fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
-            1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us);
 }
 
 // For internal test use
@@ -20334,3 +20820,20 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
     fputs(text, stderr);
     fflush(stderr);
 }
+
+struct llama_token_timings llama_get_token_timings(const void * v_ctx) {
+    const auto * ctx = (llama_context *) v_ctx;
+    struct llama_token_timings result = {
+        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms    =*/ 1.00 * lm_ggml_time_ms(),
+        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
+
+        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
+        /*.n_eval   =*/ std::max(1, ctx->n_eval),
+    };
+
+    return result;
+}
+
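llama_get_token_timings() is a cui-llama.rn addition (presumably declared in the bundled llama.h) that exposes the same per-context counters as the removed llama_get_timings(), minus the sampling fields that now live in the sampler chain. A hedged usage sketch, with `ctx` an existing llama_context *:

    const struct llama_token_timings t = llama_get_token_timings(ctx);
    printf("prompt eval: %.2f ms over %d tokens\n", t.t_p_eval_ms, t.n_p_eval);
    printf("eval:        %.2f ms over %d runs\n",   t.t_eval_ms,   t.n_eval);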