cui-llama.rn 1.1.2 → 1.1.5
This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
- package/android/src/main/CMakeLists.txt +1 -2
- package/android/src/main/jni.cpp +26 -21
- package/cpp/common.cpp +181 -1584
- package/cpp/common.h +131 -52
- package/cpp/ggml-aarch64.c +612 -0
- package/cpp/ggml-alloc.h +2 -2
- package/cpp/ggml-backend.c +33 -6
- package/cpp/ggml-backend.h +2 -0
- package/cpp/ggml-common.h +20 -0
- package/cpp/ggml-impl.h +36 -7
- package/cpp/ggml-metal.m +68 -8
- package/cpp/ggml-quants.c +932 -50
- package/cpp/ggml-quants.h +15 -0
- package/cpp/ggml.c +1712 -325
- package/cpp/ggml.h +169 -100
- package/cpp/llama-grammar.cpp +721 -122
- package/cpp/llama-grammar.h +120 -15
- package/cpp/llama-impl.h +132 -1
- package/cpp/llama-sampling.cpp +1483 -354
- package/cpp/llama-sampling.h +20 -48
- package/cpp/llama-vocab.cpp +140 -7
- package/cpp/llama-vocab.h +3 -2
- package/cpp/llama.cpp +824 -327
- package/cpp/llama.h +235 -256
- package/cpp/rn-llama.hpp +18 -14
- package/cpp/sampling.cpp +353 -354
- package/cpp/sampling.h +62 -143
- package/cpp/sgemm.cpp +153 -0
- package/package.json +1 -1
- package/cpp/grammar-parser.cpp +0 -539
- package/cpp/grammar-parser.h +0 -29
package/cpp/llama.cpp
CHANGED
@@ -1,6 +1,5 @@
 #include "llama-impl.h"
 #include "llama-vocab.h"
-#include "llama-grammar.h"
 #include "llama-sampling.h"

 #include "unicode.h"
@@ -223,6 +222,7 @@ enum llm_arch {
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
+    LLM_ARCH_RWKV6,
     LLM_ARCH_UNKNOWN,
 };

@@ -270,6 +270,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_JAIS, "jais" },
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
+    { LLM_ARCH_RWKV6, "rwkv6" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -306,6 +307,9 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_RESCALE_EVERY_N_LAYERS,
+    LLM_KV_TIME_MIX_EXTRA_DIM,
+    LLM_KV_TIME_DECAY_EXTRA_DIM,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -341,6 +345,8 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,
     LLM_KV_SSM_DT_B_C_RMS,

+    LLM_KV_WKV_HEAD_SIZE,
+
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
@@ -400,11 +406,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
-    { LLM_KV_POOLING_TYPE
+    { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+    { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
+    { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
+    { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -440,6 +449,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
     { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },

+    { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -529,6 +540,29 @@ enum llm_tensor {
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_TIME_MIX_W1,
+    LLM_TENSOR_TIME_MIX_W2,
+    LLM_TENSOR_TIME_MIX_LERP_X,
+    LLM_TENSOR_TIME_MIX_LERP_W,
+    LLM_TENSOR_TIME_MIX_LERP_K,
+    LLM_TENSOR_TIME_MIX_LERP_V,
+    LLM_TENSOR_TIME_MIX_LERP_R,
+    LLM_TENSOR_TIME_MIX_LERP_G,
+    LLM_TENSOR_TIME_MIX_FIRST,
+    LLM_TENSOR_TIME_MIX_DECAY,
+    LLM_TENSOR_TIME_MIX_DECAY_W1,
+    LLM_TENSOR_TIME_MIX_DECAY_W2,
+    LLM_TENSOR_TIME_MIX_KEY,
+    LLM_TENSOR_TIME_MIX_VALUE,
+    LLM_TENSOR_TIME_MIX_RECEPTANCE,
+    LLM_TENSOR_TIME_MIX_GATE,
+    LLM_TENSOR_TIME_MIX_LN,
+    LLM_TENSOR_TIME_MIX_OUTPUT,
+    LLM_TENSOR_CHANNEL_MIX_LERP_K,
+    LLM_TENSOR_CHANNEL_MIX_LERP_R,
+    LLM_TENSOR_CHANNEL_MIX_KEY,
+    LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
+    LLM_TENSOR_CHANNEL_MIX_VALUE,
     LLM_TENSOR_ATTN_Q_A,
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
@@ -1350,6 +1384,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_RWKV6,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
+            { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
+            { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" },
+            { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" },
+            { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
+            { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
+            { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
+            { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
+            { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
+            { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
+            { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+            { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
+            { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" },
+            { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
+            { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
+            { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2099,6 +2167,10 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buf
     if (host_buffer) {
         buft = lm_ggml_backend_sycl_host_buffer_type();
     }
+#elif defined(LM_GGML_USE_CANN)
+    if (host_buffer) {
+        buft = lm_ggml_backend_cann_host_buffer_type();
+    }
 #elif defined(LM_GGML_USE_CPU_HBM)
     buft = lm_ggml_backend_cpu_hbm_buffer_type();
 #elif defined(LM_GGML_USE_VULKAN)
@@ -2162,6 +2234,7 @@ enum e_model {
     MODEL_1B,
     MODEL_1_3B,
     MODEL_1_4B,
+    MODEL_1_6B,
     MODEL_2B,
     MODEL_2_8B,
     MODEL_3B,
@@ -2239,6 +2312,12 @@ struct llama_hparams {
     float f_attn_logit_softcapping = 50.0f;
     float f_final_logit_softcapping = 30.0f;

+    // for RWKV
+    uint32_t rescale_every_n_layers = 0;
+    uint32_t time_mix_extra_dim = 0;
+    uint32_t time_decay_extra_dim = 0;
+    uint32_t wkv_head_size = 0;
+
     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
@@ -2302,6 +2381,11 @@ struct llama_hparams {
         if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
         if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;

+        if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
+        if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
+        if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
+        if (this->wkv_head_size != other.wkv_head_size) return true;
+
         if (this->dec_start_token_id != other.dec_start_token_id) return true;

         const float EPSILON = 1e-9f;
@@ -2365,15 +2449,25 @@ struct llama_hparams {
     }

     uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
-        // corresponds to Mamba's conv_states size
-        // TODO: maybe support other convolution strides than 1
-        // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+        // corresponds to Mamba's conv_states size or RWKV's token_shift states size
+        if (wkv_head_size != 0) {
+            // for RWKV models
+            return 2 * n_embd;
+        } else {
+            // TODO: maybe support other convolution strides than 1
+            // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+            return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+        }
     }

     uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
-        // corresponds to Mamba's ssm_states size
-        return ssm_d_state * ssm_d_inner;
+        if (wkv_head_size != 0) {
+            // corresponds to RWKV's wkv_states size
+            return n_embd * wkv_head_size;
+        } else {
+            // corresponds to Mamba's ssm_states size
+            return ssm_d_state * ssm_d_inner;
+        }
     }
 };

@@ -2384,8 +2478,8 @@ struct llama_cparams {
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
-    uint32_t n_threads; // number of threads to use for generation
-    uint32_t n_threads_batch; // number of threads to use for batch processing
+    int n_threads; // number of threads to use for generation
+    int n_threads_batch; // number of threads to use for batch processing

     float rope_freq_base;
     float rope_freq_scale;
@@ -2403,6 +2497,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;

     enum llama_pooling_type pooling_type;

@@ -2512,6 +2607,36 @@ struct llama_layer {
     struct lm_ggml_tensor * ssm_conv1d_b;
     struct lm_ggml_tensor * ssm_dt_b;

+    // rwkv
+    struct lm_ggml_tensor * time_mix_w1;
+    struct lm_ggml_tensor * time_mix_w2;
+    struct lm_ggml_tensor * time_mix_lerp_x;
+    struct lm_ggml_tensor * time_mix_lerp_w;
+    struct lm_ggml_tensor * time_mix_lerp_k;
+    struct lm_ggml_tensor * time_mix_lerp_v;
+    struct lm_ggml_tensor * time_mix_lerp_r;
+    struct lm_ggml_tensor * time_mix_lerp_g;
+
+    struct lm_ggml_tensor * time_mix_first;
+    struct lm_ggml_tensor * time_mix_decay;
+    struct lm_ggml_tensor * time_mix_decay_w1;
+    struct lm_ggml_tensor * time_mix_decay_w2;
+    struct lm_ggml_tensor * time_mix_key;
+    struct lm_ggml_tensor * time_mix_value;
+    struct lm_ggml_tensor * time_mix_receptance;
+    struct lm_ggml_tensor * time_mix_gate;
+
+    struct lm_ggml_tensor * time_mix_ln;
+    struct lm_ggml_tensor * time_mix_ln_b;
+    struct lm_ggml_tensor * time_mix_output;
+
+    struct lm_ggml_tensor * channel_mix_lerp_k;
+    struct lm_ggml_tensor * channel_mix_lerp_r;
+
+    struct lm_ggml_tensor * channel_mix_key;
+    struct lm_ggml_tensor * channel_mix_receptance;
+    struct lm_ggml_tensor * channel_mix_value;
+
     // long rope factors
     struct lm_ggml_tensor * rope_long = nullptr;
     struct lm_ggml_tensor * rope_short = nullptr;
@@ -3069,7 +3194,6 @@ struct llama_sbatch {
 struct llama_context {
     llama_context(const llama_model & model)
         : model(model)
-        , sampling(llama_n_vocab(&model))
         , t_start_us(model.t_start_us)
         , t_load_us(model.t_load_us) {}

@@ -3086,7 +3210,6 @@ struct llama_context {
     const struct llama_model & model;

     struct llama_cparams cparams;
-    struct llama_sampling sampling;
     struct llama_sbatch sbatch;
     struct llama_kv_cache kv_self;
     struct llama_control_vector cvec;
@@ -3102,18 +3225,21 @@ struct llama_context {
 #endif
     lm_ggml_backend_t backend_cpu = nullptr;

+    lm_ggml_threadpool_t threadpool = nullptr;
+    lm_ggml_threadpool_t threadpool_batch = nullptr;
+
     bool has_evaluated_once = false;

-    int64_t t_start_us;
-    int64_t t_load_us;
-    int64_t t_p_eval_us = 0;
-    int64_t t_eval_us = 0;
+    mutable int64_t t_start_us;
+    mutable int64_t t_load_us;
+    mutable int64_t t_p_eval_us = 0;
+    mutable int64_t t_eval_us = 0;

-    int64_t t_compute_start_us = 0;
-    int64_t n_queued_tokens = 0;
+    mutable int64_t t_compute_start_us = 0;
+    mutable int64_t n_queued_tokens = 0;

-    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-    int32_t n_eval = 0; // number of eval calls
+    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+    mutable int32_t n_eval = 0; // number of eval calls

     // host buffer for the model output (logits and embeddings)
     lm_ggml_backend_buffer_t buf_output = nullptr;
@@ -3233,29 +3359,33 @@ static size_t llama_get_device_count(const llama_model & model) {
 static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     lm_ggml_backend_buffer_type_t buft = nullptr;

-#
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef LM_GGML_USE_RPC
     int rpc_count = (int)model.rpc_servers.size();
-
-
+#else
+    int rpc_count = 0;
+#endif
+    int local_gpu = gpu - rpc_count;
+#if defined(LM_GGML_USE_RPC)
+    if (gpu < rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu].c_str();
         return lm_ggml_backend_rpc_buffer_type(endpoint);
     }
 #endif
 #if defined(LM_GGML_USE_METAL)
     buft = lm_ggml_backend_metal_buffer_type();
 #elif defined(LM_GGML_USE_CUDA)
-    buft = lm_ggml_backend_cuda_buffer_type(
+    buft = lm_ggml_backend_cuda_buffer_type(local_gpu);
 #elif defined(LM_GGML_USE_VULKAN)
-    buft = lm_ggml_backend_vk_buffer_type(
+    buft = lm_ggml_backend_vk_buffer_type(local_gpu);
 #elif defined(LM_GGML_USE_SYCL)
-    buft = lm_ggml_backend_sycl_buffer_type(
+    buft = lm_ggml_backend_sycl_buffer_type(local_gpu);
 #elif defined(LM_GGML_USE_KOMPUTE)
-    buft = lm_ggml_backend_kompute_buffer_type(
+    buft = lm_ggml_backend_kompute_buffer_type(local_gpu);
     if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__,
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
     }
 #elif defined(LM_GGML_USE_CANN)
-    buft = lm_ggml_backend_cann_buffer_type(
+    buft = lm_ggml_backend_cann_buffer_type(local_gpu);
 #endif

     if (buft == nullptr) {
@@ -3263,7 +3393,7 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const lla
     }
     return buft;
     LM_GGML_UNUSED(model);
-    LM_GGML_UNUSED(
+    LM_GGML_UNUSED(local_gpu);
 }

 static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
@@ -3290,13 +3420,17 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
 }

 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef LM_GGML_USE_RPC
     int rpc_count = (int)model.rpc_servers.size();
-
+#else
+    int rpc_count = 0;
+#endif
+    int local_device = device - rpc_count;
+#if defined(LM_GGML_USE_RPC)
+    if (device < rpc_count) {
         size_t total;
         size_t free;
-        const char * endpoint = model.rpc_servers[device
+        const char * endpoint = model.rpc_servers[device].c_str();
         lm_ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
         return free;
     }
@@ -3304,28 +3438,28 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(LM_GGML_USE_CUDA)
     size_t total;
     size_t free;
-    lm_ggml_backend_cuda_get_device_memory(
+    lm_ggml_backend_cuda_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(LM_GGML_USE_SYCL)
     size_t total;
     size_t free;
-    lm_ggml_backend_sycl_get_device_memory(
+    lm_ggml_backend_sycl_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(LM_GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    lm_ggml_backend_vk_get_device_memory(
+    lm_ggml_backend_vk_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(LM_GGML_USE_CANN)
     size_t total;
     size_t free;
-    lm_ggml_backend_cann_get_device_memory(
+    lm_ggml_backend_cann_get_device_memory(local_device, &free, &total);
     return free;
 #else
     return 1;
 #endif
     LM_GGML_UNUSED(model);
-    LM_GGML_UNUSED(
+    LM_GGML_UNUSED(local_device);
 }

 //
@@ -3434,7 +3568,7 @@ static bool llama_kv_cache_find_slot(
     const uint32_t n_seq_tokens = batch.n_seq_tokens;

     if (cache.recurrent) {
-        // For recurrent state architectures (like Mamba),
+        // For recurrent state architectures (like Mamba or RWKV),
         // each cache cell can store the state for a whole sequence.
         // A slot should be always be contiguous.

@@ -3683,7 +3817,7 @@ static bool llama_kv_cache_seq_rm(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

-    // models like Mamba can't have a state partially erased
+    // models like Mamba or RWKV can't have a state partially erased
     if (cache.recurrent) {
         if (seq_id >= (int64_t) cache.size) {
             // could be fatal
@@ -3697,7 +3831,8 @@ static bool llama_kv_cache_seq_rm(
             if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
                 return false;
             }
-
+            // invalidate tails which will be cleared
+            if (p0 <= cell.pos && cell.pos < p1) {
                 tail_id = -1;
             }
         }
@@ -3819,7 +3954,7 @@ static void llama_kv_cache_seq_add(
     if (p0 == p1) return;

     if (cache.recurrent) {
-        // for Mamba-like models, only the pos needs to be shifted
+        // for Mamba-like or RWKV models, only the pos needs to be shifted
         if (0 <= seq_id && seq_id < (int64_t) cache.size) {
             const int32_t tail_id = cache.cells[seq_id].tail;
             if (tail_id >= 0) {
@@ -3868,7 +4003,7 @@ static void llama_kv_cache_seq_div(
     if (p0 == p1) return;

     if (cache.recurrent) {
-        // for Mamba-like models, only the pos needs to be changed
+        // for Mamba-like or RWKV models, only the pos needs to be changed
         if (0 <= seq_id && seq_id < (int64_t) cache.size) {
             const int32_t tail_id = cache.cells[seq_id].tail;
             if (tail_id >= 0) {
@@ -4322,6 +4457,8 @@ struct llama_model_loader {
             case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
             case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+            case LM_GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
+            case LM_GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
             case LM_GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
             case LM_GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
             case LM_GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
@@ -5015,6 +5152,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
@@ -5059,6 +5198,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_1B: return "1B";
         case MODEL_1_3B: return "1.3B";
         case MODEL_1_4B: return "1.4B";
+        case MODEL_1_6B: return "1.6B";
         case MODEL_2B: return "2B";
         case MODEL_2_8B: return "2.8B";
         case MODEL_3B: return "3B";
@@ -5105,6 +5245,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
         case LLAMA_VOCAB_TYPE_BPE: return "BPE";
         case LLAMA_VOCAB_TYPE_WPM: return "WPM";
         case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
         default: return "unknown";
     }
 }
@@ -5801,6 +5942,26 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_RWKV6:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
+
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1_6B; break;
+                    case 32:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            case 4096: model.type = e_model::MODEL_7B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 61: model.type = e_model::MODEL_14B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -5930,6 +6091,15 @@ static void llm_load_vocab(
             }
 #endif
         }
+    } else if (tokenizer_model == "rwkv") {
+        vocab.type = LLAMA_VOCAB_TYPE_RWKV;
+
+        // default special tokens
+        vocab.special_bos_id = -1;
+        vocab.special_eos_id = -1;
+        vocab.special_unk_id = -1;
+        vocab.special_sep_id = -1;
+        vocab.special_pad_id = -1;
     } else {
         throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
     }
@@ -6061,6 +6231,12 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             vocab.tokenizer_add_bos = false;
             vocab.tokenizer_add_eos = true;
+        } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            vocab.tokenizer_add_space_prefix = false;
+            vocab.tokenizer_clean_spaces = false;
+            vocab.tokenizer_add_bos = false;
+            vocab.tokenizer_add_eos = false;
         } else {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
@@ -6088,6 +6264,7 @@ static void llm_load_vocab(

     const uint32_t n_vocab = lm_gguf_get_arr_n(ctx, token_idx);

+    vocab.n_vocab = n_vocab;
     vocab.id_to_token.resize(n_vocab);

     for (uint32_t i = 0; i < n_vocab; i++) {
@@ -6165,6 +6342,10 @@ static void llm_load_vocab(
         }
     } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
         vocab.linefeed_id = vocab.special_pad_id;
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
+        const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
+        LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        vocab.linefeed_id = ids[0];
     } else {
         const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
         LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -6234,6 +6415,11 @@ static void llm_load_vocab(
                     )
                 ) {
                     vocab.special_eot_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                     break;
                 }
             }
@@ -6247,6 +6433,11 @@ static void llm_load_vocab(
         const auto & t = vocab.token_to_id.find("<|eom_id|>");
         if (t != vocab.token_to_id.end()) {
             vocab.special_eom_id = t->second;
+            if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                    __func__, t->first.c_str());
+                vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+            }
         }
     }
 }
@@ -6482,8 +6673,6 @@ static bool llm_load_tensors(
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
-    model.t_start_us = lm_ggml_time_us();
-
     auto & hparams = model.hparams;

     model.split_mode = split_mode;
@@ -7955,23 +8144,23 @@ static bool llm_load_tensors(
                         layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});

                         layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                        layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
+                        layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                        layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
+                        layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
+                        layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
+                        layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);

                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                         layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});

                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-                        layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
+                        layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
+                        layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
+                        layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     }
                 } break;
             case LLM_ARCH_T5:
@@ -8211,6 +8400,68 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     }
                 } break;
+            case LLM_ARCH_RWKV6:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // Block 0, LN0
+                    model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                    model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+                    // output
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+
+                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+                    const int head_size = hparams.wkv_head_size;
+                    const int attn_hidden_size = n_embd;
+                    const int ffn_size = hparams.n_ff_arr[0];
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        lm_ggml_context * ctx_layer = ctx_for_layer(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                        layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
+                        layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
+
+                        layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5});
+                        layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5});
+
+                        layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1});
+                        layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1});
+                        layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
+                        layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1});
+                        layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
+                        layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});
+
+                        layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
+                        layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
+                        layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
+                        layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size});
+                        layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd});
+                        layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd});
+                        layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
+                        layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd});
+
+                        layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
+                        layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
+                        layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
+
+                        layer.channel_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
+                        layer.channel_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
+
+                        layer.channel_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size});
+                        layer.channel_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd});
+                        layer.channel_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd});
+                    }
+
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -8352,14 +8603,13 @@ static bool llm_load_tensors(
         }
     }

-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
     return true;
 }

 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = lm_ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

@@ -8421,6 +8671,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }

+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
+
     return 0;
 }

@@ -8495,8 +8749,7 @@ static void llm_build_kv_store(

     LM_GGML_ASSERT(kv.size == n_ctx);

-    struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
-            (lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
+    struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head);
     cb(k_cache_view, "k_cache_view", il);

     // note: storing RoPE-ed version of K in the KV cache
@@ -8507,8 +8760,7 @@ static void llm_build_kv_store(
     struct lm_ggml_tensor * v_cache_view = nullptr;

     if (cparams.flash_attn) {
-        v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
-                (kv_head)*lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+        v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head);
     } else {
         // note: the V cache is transposed when not using flash attention
         v_cache_view = lm_ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
@@ -8995,8 +9247,7 @@ static struct lm_ggml_tensor * llm_build_kv(

     struct lm_ggml_tensor * cur;

-    cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b,
-            q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
+    cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);

     return cur;
@@ -9024,7 +9275,7 @@ static struct lm_ggml_tensor * llm_build_copy_mask_state(
     // FIXME: zero-out NANs?
     states = lm_ggml_mul(ctx, states, state_mask);

-    // copy states which won't be changed further (between n_seqs and
+    // copy states which won't be changed further (between n_seqs and n_kv)
     lm_ggml_build_forward_expand(graph,
         lm_ggml_cpy(ctx,
             lm_ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*lm_ggml_element_size(states)),
@@ -9170,6 +9421,171 @@ static struct lm_ggml_tensor * llm_build_mamba(
     return cur;
 }

+static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
+        struct llama_context & lctx,
+        struct lm_ggml_context * ctx,
+        const struct llama_layer * layer,
+        struct lm_ggml_tensor * cur,
+        struct lm_ggml_tensor * x_prev,
+        struct lm_ggml_tensor ** wkv_state) {
+    size_t n_embed = cur->ne[0];
+    size_t n_seq_tokens = cur->ne[1];
+    size_t n_seqs = cur->ne[2];
+
+    size_t head_size = layer->time_mix_first->ne[0];
+    size_t head_count = layer->time_mix_first->ne[1];
+
+    size_t n_tokens = n_seqs * n_seq_tokens;
+
+    struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
+
+    sx = lm_ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
+    cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+
+    struct lm_ggml_tensor * xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
+
+    xxx = lm_ggml_reshape_4d(
+        ctx,
+        lm_ggml_tanh(
+            ctx,
+            lm_ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
+        ),
+        layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
+    );
+
+    xxx = lm_ggml_cont(ctx, lm_ggml_permute(ctx, xxx, 0, 1, 3, 2));
+
+    xxx = lm_ggml_mul_mat(
+        ctx,
+        lm_ggml_reshape_4d(
+            ctx,
+            layer->time_mix_w2,
+            layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5
+        ),
+        xxx
+    );
+
+    struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
+    struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
+    struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
+    struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
+    struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
+
+    struct lm_ggml_tensor * xw = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mw, layer->time_mix_lerp_w),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * xk = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mk, layer->time_mix_lerp_k),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * xv = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mv, layer->time_mix_lerp_v),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * xr = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mr, layer->time_mix_lerp_r),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * xg = lm_ggml_add(
+        ctx,
+        lm_ggml_mul(
+            ctx,
+            lm_ggml_add(ctx, mg, layer->time_mix_lerp_g),
+            sx
+        ),
+        cur
+    );
+
+    struct lm_ggml_tensor * r = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
+    struct lm_ggml_tensor * k = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
+    struct lm_ggml_tensor * v = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
+    struct lm_ggml_tensor * g = lm_ggml_silu(
+        ctx,
+        llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
+    );
+
+    struct lm_ggml_tensor * w = lm_ggml_mul_mat(
+        ctx,
+        layer->time_mix_decay_w2,
+        lm_ggml_tanh(
+            ctx,
+            lm_ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
+        )
+    );
+
+    w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
+    w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
+    w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
+
+    k = lm_ggml_transpose(ctx, k);
+    v = lm_ggml_transpose(ctx, v);
+    r = lm_ggml_transpose(ctx, r);
+
+    struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+    cur = lm_ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
+    *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
+
+    // group norm with head_count groups
+    cur = lm_ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
+    cur = lm_ggml_norm(ctx, cur, 64e-5f);
+
+    // Convert back to regular vectors.
+    cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+    cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
+
+    cur = lm_ggml_mul(ctx, cur, g);
+    cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
+
+    return lm_ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
+}
+
+static struct lm_ggml_tensor * llm_build_rwkv6_channel_mix(
+        struct llama_context & lctx,
+        struct lm_ggml_context * ctx,
+        const struct llama_layer * layer,
+        struct lm_ggml_tensor * cur,
+        struct lm_ggml_tensor * x_prev) {
+    struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
+    struct lm_ggml_tensor * xk = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
+    struct lm_ggml_tensor * xr = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
+
+    struct lm_ggml_tensor * r = lm_ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
+    struct lm_ggml_tensor * k = lm_ggml_sqr(
+        ctx,
+        lm_ggml_relu(
+            ctx,
+            llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
+        )
+    );
+
+    return lm_ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
+}
+
 struct llm_build_context {
     const llama_model & model;
     llama_context & lctx;
@@ -9478,8 +9894,8 @@ struct llm_build_context {
     struct lm_ggml_cgraph * append_pooling(struct lm_ggml_cgraph * gf) {
         // find result_norm tensor for input
         struct lm_ggml_tensor * inp = nullptr;
-        for (int i = gf
-            inp = gf
+        for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = lm_ggml_graph_node(gf, i);
             if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
                 break;
             } else {
@@ -13790,7 +14206,9 @@ struct llm_build_context {
                 {
                     // compute Q and K and RoPE them
                     struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-                    Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+                    if (model.layers[il].wq_scale) {
+                        Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+                    }
                     cb(Qcur, "Qcur", il);
                     if (model.layers[il].bq) {
                         Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -13799,7 +14217,9 @@ struct llm_build_context {

                     // B1.K
                     struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
-                    Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+                    if (model.layers[il].wk_scale) {
+                        Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+                    }
                     cb(Kcur, "Kcur", il);
                     if (model.layers[il].bk) {
                         Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -13808,7 +14228,9 @@ struct llm_build_context {

                     // B1.V
                     struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
-                    Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+                    if (model.layers[il].wv_scale) {
+                        Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+                    }
                     cb(Vcur, "Vcur", il);
                     if (model.layers[il].bv) {
                         Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -13839,7 +14261,9 @@ struct llm_build_context {
             cb(cur, "attn_sub_norm", il);

             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
-            cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+            if (model.layers[il].wo_scale) {
+                cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+            }
             if (model.layers[il].bo) {
                 cur = lm_ggml_add(ctx0, cur, model.layers[il].bo);
             }
@@ -13876,7 +14300,9 @@ struct llm_build_context {
             cb(cur, "ffn_sub_norm", il);

             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
-            cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+            if (model.layers[il].ffn_down_scale) {
+                cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+            }
             cb(cur, "ffn_down", il);

             cur = lm_ggml_add(ctx0, cur, ffn_inp);
@@ -14691,6 +15117,117 @@ struct llm_build_context {

         return gf;
     }
+
+    lm_ggml_cgraph * build_rwkv6() {
+        lm_ggml_cgraph *gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // Token shift state dimensions should be 2 * n_emb
+        LM_GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
+
+        const int64_t n_seqs = batch.n_seqs;
+        const int64_t n_seq_tokens = batch.n_seq_tokens;
+        const int64_t n_tokens = batch.n_tokens;
+        LM_GGML_ASSERT(n_seqs != 0);
+        LM_GGML_ASSERT(batch.equal_seqs);
+        LM_GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+        struct lm_ggml_tensor * state_copy = build_inp_s_copy();
+        struct lm_ggml_tensor * state_mask = build_inp_s_mask();
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            const llama_layer * layer = &model.layers[il];
+
+            // (ab)using the KV cache to store the states
+            struct lm_ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
+                    gf, kv_self.k_l[il], state_copy, state_mask,
+                    hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
+            struct lm_ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
+                    gf, kv_self.v_l[il], state_copy, state_mask,
+                    hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
+
+            cur = lm_ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+            token_shift = lm_ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);
+
+            struct lm_ggml_tensor * att_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+            struct lm_ggml_tensor * ffn_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * lm_ggml_element_size(token_shift));
+
+            struct lm_ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
+            struct lm_ggml_tensor * x_prev = lm_ggml_concat(
+                ctx0,
+                att_shift,
+                lm_ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
+                1
+            );
+
+            cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
+            lm_ggml_build_forward_expand(gf, cur);
+            lm_ggml_build_forward_expand(
+                gf,
+                lm_ggml_cpy(
+                    ctx0,
+                    wkv_states,
+                    lm_ggml_view_1d(
+                        ctx0,
+                        kv_self.v_l[il],
+                        hparams.n_embd_v_s() * n_seqs,
+                        hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
+                    )
+                )
+            );
+
+            struct lm_ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
+            x_prev = lm_ggml_concat(
+                ctx0,
+                ffn_shift,
+                lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
+                1
+            );
+            cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
+            lm_ggml_build_forward_expand(gf, cur);
+
+            struct lm_ggml_tensor * last_norm_att = lm_ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_att));
+            struct lm_ggml_tensor * last_norm_ffn = lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_ffn));
+
+            token_shift = lm_ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
+
+            lm_ggml_build_forward_expand(
+                gf,
+                lm_ggml_cpy(
+                    ctx0,
+                    lm_ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
+                    lm_ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * lm_ggml_element_size(kv_self.k_l[il]))
+                )
+            );
+
+            if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
+                cur = lm_ggml_scale(ctx0, cur, 0.5F);
+            }
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+        cur = lm_ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+        cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+
+        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };

 static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -14937,6 +15474,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
|
|
14937
15474
|
{
|
14938
15475
|
result = llm.build_exaone();
|
14939
15476
|
} break;
|
15477
|
+
case LLM_ARCH_RWKV6:
|
15478
|
+
{
|
15479
|
+
result = llm.build_rwkv6();
|
15480
|
+
} break;
|
14940
15481
|
default:
|
14941
15482
|
LM_GGML_ABORT("fatal error");
|
14942
15483
|
}
|
@@ -15296,7 +15837,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
|
15296
15837
|
|
15297
15838
|
// clear unused states
|
15298
15839
|
for (int i = 0; i < n_kv; ++i) {
|
15299
|
-
uint32_t
|
15840
|
+
const uint32_t cell_id = i + kv_self.head;
|
15300
15841
|
llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
|
15301
15842
|
|
15302
15843
|
data[i] = (float) (kv_cell.src >= 0);
|
@@ -15505,9 +16046,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
|
|
15505
16046
|
}
|
15506
16047
|
|
15507
16048
|
static void llama_graph_compute(
|
15508
|
-
|
15509
|
-
|
15510
|
-
|
16049
|
+
llama_context & lctx,
|
16050
|
+
lm_ggml_cgraph * gf,
|
16051
|
+
int n_threads,
|
16052
|
+
lm_ggml_threadpool * threadpool) {
|
15511
16053
|
#ifdef LM_GGML_USE_METAL
|
15512
16054
|
if (lm_ggml_backend_is_metal(lctx.backend_metal)) {
|
15513
16055
|
lm_ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
@@ -15516,6 +16058,7 @@ static void llama_graph_compute(
|
|
15516
16058
|
|
15517
16059
|
if (lctx.backend_cpu != nullptr) {
|
15518
16060
|
lm_ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
16061
|
+
lm_ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
15519
16062
|
lm_ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
15520
16063
|
}
|
15521
16064
|
#ifdef LM_GGML_USE_BLAS
|
@@ -15556,6 +16099,15 @@ static int llama_decode_internal(
|
|
15556
16099
|
|
15557
16100
|
LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
|
15558
16101
|
|
16102
|
+
if (batch_all.token) {
|
16103
|
+
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
16104
|
+
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
|
16105
|
+
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
|
16106
|
+
return -1;
|
16107
|
+
}
|
16108
|
+
}
|
16109
|
+
}
|
16110
|
+
|
15559
16111
|
LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);
|
15560
16112
|
|
15561
16113
|
LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
|
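
For illustration only (not part of the package diff): a minimal caller-side sketch of how the new token-range check surfaces, assuming an existing llama_context * ctx, a loaded model, and an already-tokenized prompt. With this change llama_decode() returns -1 when a batch contains a token id outside the model's vocabulary instead of failing later inside the graph.

    std::vector<llama_token> tokens = { /* token ids from llama_tokenize() */ };
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0);
    if (llama_decode(ctx, batch) != 0) {
        // the batch was rejected, e.g. a token id was negative or >= llama_n_vocab(model)
        fprintf(stderr, "llama_decode failed\n");
    }
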
@@ -15636,6 +16188,8 @@ static int llama_decode_internal(
     }
 
     int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
     LM_GGML_ASSERT(n_threads > 0);
 
     // non-causal masks do not use the KV cache
@@ -15670,8 +16224,8 @@ static int llama_decode_internal(
         lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
 
         // the output is always the last tensor in the graph
-        struct lm_ggml_tensor * res = gf
-        struct lm_ggml_tensor * embd = gf
+        struct lm_ggml_tensor * res = lm_ggml_graph_node(gf, -1);
+        struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);
 
         if (lctx.n_outputs == 0) {
             // no output
@@ -15680,9 +16234,9 @@ static int llama_decode_internal(
         } else if (cparams.embeddings) {
             res = nullptr; // do not extract logits for embedding case
             embd = nullptr;
-            for (int i = gf
-                if (strcmp(gf
-                    embd = gf
+            for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                    embd = lm_ggml_graph_node(gf, i);
                     break;
                 }
             }
@@ -15697,7 +16251,7 @@ static int llama_decode_internal(
 
         llama_set_inputs(lctx, ubatch);
 
-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);
 
         // update the kv ring buffer
         {
@@ -15846,6 +16400,15 @@ static int llama_encode_internal(
 
     LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
     // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
     LM_GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
 
@@ -15874,7 +16437,9 @@ static int llama_encode_internal(
     lctx.inp_embd_enc = NULL;
     lctx.n_outputs = n_tokens;
 
-
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
     LM_GGML_ASSERT(n_threads > 0);
 
     lm_ggml_backend_sched_reset(lctx.sched);
@@ -15888,15 +16453,15 @@ static int llama_encode_internal(
         // there are two cases here
         if (llama_model_has_decoder(&lctx.model)) {
             // first case is an encoder-decoder T5 model where embeddings are passed to decoder
-            embd = gf
+            embd = lm_ggml_graph_node(gf, -1);
             LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
         } else {
             // second case is an encoder-only T5 model
             if (cparams.embeddings) {
                 // only output embeddings if required
-                embd = gf
+                embd = lm_ggml_graph_node(gf, -1);
                 if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                    embd = gf
+                    embd = lm_ggml_graph_node(gf, -2);
                 }
                 LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
             }
@@ -15906,7 +16471,7 @@ static int llama_encode_internal(
 
         llama_set_inputs(lctx, ubatch);
 
-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);
 
         // extract embeddings
         if (embd) {
@@ -16188,7 +16753,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
     lm_ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 #endif
 
     //const int64_t t_end = lm_ggml_time_us();
@@ -16214,7 +16779,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
         llama_set_k_shift(lctx);
 
-        llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 
         need_reserve = true;
     }
@@ -16425,6 +16990,9 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
             new_type == LM_GGML_TYPE_Q4_0_8_8) {
             new_type = LM_GGML_TYPE_Q4_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
+            new_type = LM_GGML_TYPE_Q4_K;
+        }
     }
 } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -16624,6 +17192,8 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
     }
     if (convert_incompatible_tensor) {
         switch (new_type) {
+            case LM_GGML_TYPE_TQ1_0:
+            case LM_GGML_TYPE_TQ2_0: new_type = LM_GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
             case LM_GGML_TYPE_IQ2_XXS:
             case LM_GGML_TYPE_IQ2_XS:
             case LM_GGML_TYPE_IQ2_S:
@@ -16729,6 +17299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = LM_GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = LM_GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = LM_GGML_TYPE_TQ1_0; break;
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = LM_GGML_TYPE_TQ2_0; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = LM_GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = LM_GGML_TYPE_IQ2_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = LM_GGML_TYPE_IQ2_XS; break;
@@ -16833,7 +17405,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos
+            name.find("attn_qkv.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight")!= std::string::npos) {
             ++qs.n_attention_wv;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
@@ -16974,6 +17547,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
 
+        // do not quantize RWKV's time_mix_first tensors
+        quantize &= name.find("time_mix_first.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
@@ -17357,7 +17937,6 @@ struct llama_model_params llama_model_default_params() {
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/ LLAMA_DEFAULT_SEED,
        /*.n_ctx =*/ 512,
        /*.n_batch =*/ 2048,
        /*.n_ubatch =*/ 512,
@@ -17383,6 +17962,7 @@ struct llama_context_params llama_context_default_params() {
        /*.embeddings =*/ false,
        /*.offload_kqv =*/ true,
        /*.flash_attn =*/ false,
+        /*.no_perf =*/ true,
        /*.abort_callback =*/ nullptr,
        /*.abort_callback_data =*/ nullptr,
    };
@@ -17390,6 +17970,14 @@ struct llama_context_params llama_context_default_params() {
    return result;
 }
 
+struct llama_sampler_chain_params llama_sampler_chain_default_params() {
+    struct llama_sampler_chain_params result = {
+        /*.no_perf =*/ true,
+    };
+
+    return result;
+}
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
        /*.nthread =*/ 0,
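
For illustration only (not part of the package diff): llama_sampler_chain_default_params() added above is the entry point of the new sampler-chain API. A minimal sketch of building and using a chain, assuming a context ctx that has just decoded a batch and the chain samplers declared in this version's llama.h (top-k, temperature, seeded distribution):

    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false; // keep per-sampler timing for this chain

    llama_sampler * chain = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    llama_token tok = llama_sampler_sample(chain, ctx, -1); // sample from the last set of logits
    llama_sampler_free(chain);
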
@@ -17461,6 +18049,19 @@ void llama_numa_init(enum lm_ggml_numa_strategy numa) {
     }
 }
 
+void llama_attach_threadpool(
+        struct llama_context * ctx,
+        lm_ggml_threadpool_t threadpool,
+        lm_ggml_threadpool_t threadpool_batch) {
+    ctx->threadpool = threadpool;
+    ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+    ctx->threadpool = nullptr;
+    ctx->threadpool_batch = nullptr;
+}
+
 void llama_backend_free(void) {
     lm_ggml_quantize_free();
 }
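
For illustration only (not part of the package diff): a sketch of wiring an explicit compute threadpool into a context with the new attach/detach calls, assuming the lm_-prefixed ggml threadpool helpers (params_default/new/free) exported by the ggml headers in this release:

    struct lm_ggml_threadpool_params tpp = lm_ggml_threadpool_params_default(8); // 8 worker threads
    lm_ggml_threadpool_t tp = lm_ggml_threadpool_new(&tpp);

    llama_attach_threadpool(ctx, tp, nullptr); // nullptr: the same pool is reused for batch processing

    // ... llama_decode()/llama_encode() now schedule CPU work on tp ...

    llama_detach_threadpool(ctx);
    lm_ggml_threadpool_free(tp);
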
@@ -17572,6 +18173,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -17630,10 +18232,6 @@ struct llama_context * llama_new_context_with_model(
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
     LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
@@ -17644,10 +18242,10 @@ struct llama_context * llama_new_context_with_model(
     ctx->abort_callback = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;
 
-    ctx->
-
+    ctx->logits_all = params.logits_all;
+
     // build worst-case graph for encoder if a model contains encoder
-    ctx->is_encoding
+    ctx->is_encoding = llama_model_has_encoder(model);
 
     uint32_t kv_size = cparams.n_ctx;
     lm_ggml_type type_k = params.type_k;
@@ -17667,6 +18265,20 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
+#if defined(LM_GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+#endif
+
 #if defined(LM_GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = lm_ggml_backend_metal_init();
@@ -17791,19 +18403,6 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
 
-#if defined(LM_GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#endif
         ctx->backend_cpu = lm_ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -17912,7 +18511,7 @@ struct llama_context * llama_new_context_with_model(
 
             // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf
+            LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, lm_ggml_graph_n_nodes(gf));
             LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
         }
     }
@@ -17924,14 +18523,6 @@ void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
 
-const struct llama_model * llama_get_model(const struct llama_context * ctx) {
-    return &ctx->model;
-}
-
-const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
-    return &ctx->model.vocab;
-}
-
 uint32_t llama_n_ctx(const struct llama_context * ctx) {
     return ctx->cparams.n_ctx;
 }
@@ -17952,6 +18543,30 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
     return model->vocab.type;
 }
 
+int32_t llama_n_vocab(const struct llama_model * model) {
+    return model->hparams.n_vocab;
+}
+
+int32_t llama_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
+int32_t llama_n_embd(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
+int32_t llama_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
+const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+    return &ctx->model;
+}
+
+enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+    return ctx->cparams.pooling_type;
+}
+
 enum llama_rope_type llama_rope_type(const struct llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
@@ -17965,6 +18580,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_T5:
         case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_JAIS:
+        case LLM_ARCH_RWKV6:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18014,26 +18630,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
     return LLAMA_ROPE_TYPE_NONE;
 }
 
-enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
-    return ctx->cparams.pooling_type;
-}
-
-int32_t llama_n_vocab(const struct llama_model * model) {
-    return model->hparams.n_vocab;
-}
-
-int32_t llama_n_ctx_train(const struct llama_model * model) {
-    return model->hparams.n_ctx_train;
-}
-
-int32_t llama_n_embd(const struct llama_model * model) {
-    return model->hparams.n_embd;
-}
-
-int32_t llama_n_layer(const struct llama_model * model) {
-    return model->hparams.n_layer;
-}
-
 float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
@@ -18133,6 +18729,7 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) {
 bool llama_model_is_recurrent(const struct llama_model * model) {
     switch (model->arch) {
         case LLM_ARCH_MAMBA: return true;
+        case LLM_ARCH_RWKV6: return true;
         default: return false;
     }
 }
@@ -18449,14 +19046,14 @@ struct llama_data_write {
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
 
-    void write_rng(const std::mt19937 & rng) {
-
-
+    //void write_rng(const std::mt19937 & rng) {
+    //    std::ostringstream rng_ss;
+    //    rng_ss << rng;
 
-
+    //    const std::string & rng_str = rng_ss.str();
 
-
-    }
+    //    write_string(rng_str);
+    //}
 
     void write_output_ids(struct llama_context * ctx) {
         llama_output_reorder(ctx);
@@ -18676,17 +19273,17 @@ struct llama_data_read {
         // TODO: add more info which needs to be identical but which is not verified otherwise
     }
 
-    void read_rng(std::mt19937 & rng) {
-
-
+    //void read_rng(std::mt19937 & rng) {
+    //    std::string rng_str;
+    //    read_string(rng_str);
 
-
-
+    //    std::istringstream rng_ss(rng_str);
+    //    rng_ss >> rng;
 
-
-
-
-    }
+    //    if (rng_ss.fail()) {
+    //        throw std::runtime_error("failed to load RNG state");
+    //    }
+    //}
 
     void read_output_ids(struct llama_context * ctx) {
         std::vector<int32_t> output_pos;
@@ -19116,8 +19713,6 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da
 
     data_ctx.write_model_info(ctx);
 
-    data_ctx.write_rng(ctx->sampling.rng);
-
     // copy outputs
     data_ctx.write_output_ids(ctx);
     data_ctx.write_logits(ctx);
@@ -19155,9 +19750,6 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da
 
     data_ctx.read_model_info(ctx);
 
-    // set rng
-    data_ctx.read_rng(ctx->sampling.rng);
-
     // set outputs
     data_ctx.read_output_ids(ctx);
     data_ctx.read_logits(ctx);
@@ -19377,16 +19969,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     }
 }
 
-void llama_set_n_threads(struct llama_context * ctx,
+void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
     ctx->cparams.n_threads = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
-
+int32_t llama_n_threads(struct llama_context * ctx) {
     return ctx->cparams.n_threads;
 }
 
-
+int32_t llama_n_threads_batch(struct llama_context * ctx) {
     return ctx->cparams.n_threads_batch;
 }
 
@@ -19500,10 +20092,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
@@ -19560,8 +20156,9 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
         LM_GGML_ABORT("fatal error");
-#
+#else
         return nullptr;
+#endif
     }
 }
 
@@ -19609,8 +20206,9 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
         LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
         LM_GGML_ABORT("fatal error");
-#
+#else
         return nullptr;
+#endif
     }
 }
 
@@ -20044,128 +20642,18 @@ int32_t llama_chat_apply_template(
 }
 
 //
-//
+// sampling
 //
 
-
-
-
-        size_t start_rule_index) {
-    return llama_grammar_init_impl(rules, n_rules, start_rule_index);
-}
-
-void llama_grammar_free(struct llama_grammar * grammar) {
-    llama_grammar_free_impl(grammar);
-}
-
-struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    return llama_grammar_copy_impl(grammar);
-}
-
-void llama_grammar_sample(
-        const struct llama_grammar * grammar,
-        const struct llama_context * ctx,
-        llama_token_data_array * candidates) {
-    llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates);
-}
-
-void llama_sample_grammar(
-        struct llama_context * ctx,
-        llama_token_data_array * candidates,
-        const struct llama_grammar * grammar) {
-    llama_grammar_sample(grammar, ctx, candidates);
-}
-
-void llama_grammar_accept_token(
-        struct llama_grammar * grammar,
-        struct llama_context * ctx,
-        llama_token token) {
-    llama_grammar_accept_token_impl(grammar, &ctx->model.vocab, &ctx->sampling, token);
+// TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp
+struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
+    return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
 }
 
 //
-//
+// model split
 //
 
-void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
-    llama_set_rng_seed_impl(&ctx->sampling, seed);
-}
-
-void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
-    llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates);
-}
-
-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
-    llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep);
-}
-
-void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
-}
-
-void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
-}
-
-void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates, float xtc_threshold, float xtc_probability, size_t min_keep, std::mt19937 rng){
-    llama_sample_xtc_impl(ctx ? &ctx-> sampling: nullptr, candidates, xtc_threshold, xtc_probability, min_keep, rng);
-}
-
-void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
-    llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
-}
-
-void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    llama_sample_typical_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
-}
-
-void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
-    llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val);
-}
-
-void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
-}
-
-void llama_sample_repetition_penalties(
-        struct llama_context * ctx,
-        llama_token_data_array * candidates,
-        const llama_token * last_tokens,
-        size_t penalty_last_n,
-        float penalty_repeat,
-        float penalty_freq,
-        float penalty_present) {
-    llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
-}
-
-void llama_sample_apply_guidance(
-        struct llama_context * ctx,
-        float * logits,
-        float * logits_guidance,
-        float scale) {
-    llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale);
-}
-
-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
-    return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu);
-}
-
-llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu);
-}
-
-llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
-    return llama_sample_token_greedy_impl(ctx ? &ctx->sampling : nullptr, candidates);
-}
-
-llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
-    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng);
-}
-
-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
-    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng);
-}
-
 int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
     static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
     if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
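
For illustration only (not part of the package diff): the removed llama_grammar_*/llama_sample_* wrappers are superseded by sampler objects; a sketch of grammar-constrained sampling through the new entry point above, assuming a loaded model, a decoded ctx, and a valid GBNF grammar string:

    const char * grammar = "root ::= \"yes\" | \"no\"";

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_grammar(model, grammar, "root"));
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());

    llama_token tok = llama_sampler_sample(chain, ctx, -1); // only grammar-legal tokens survive the chain
    llama_sampler_free(chain);
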
@@ -20190,45 +20678,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
     return 0;
 }
 
-struct llama_timings llama_get_timings(struct llama_context * ctx) {
-    struct llama_timings result = {
-        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
-        /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us,
-        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_sample =*/ std::max(1, ctx->sampling.n_sample),
-        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval =*/ std::max(1, ctx->n_eval),
-    };
-
-    return result;
-}
-
-void llama_print_timings(struct llama_context * ctx) {
-    const llama_timings timings = llama_get_timings(ctx);
-
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
-}
-
-void llama_reset_timings(struct llama_context * ctx) {
-    ctx->t_start_us = lm_ggml_time_us();
-    ctx->t_eval_us = ctx->n_eval = 0;
-    ctx->t_p_eval_us = ctx->n_p_eval = 0;
-
-    ctx->sampling.reset_timings();
-}
-
 const char * llama_print_system_info(void) {
     static std::string s;
 
@@ -20246,6 +20695,7 @@ const char * llama_print_system_info(void) {
     s += "ARM_FMA = " + std::to_string(lm_ggml_cpu_has_arm_fma()) + " | ";
     s += "F16C = " + std::to_string(lm_ggml_cpu_has_f16c()) + " | ";
     s += "FP16_VA = " + std::to_string(lm_ggml_cpu_has_fp16_va()) + " | ";
+    s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v()) + " | ";
     s += "WASM_SIMD = " + std::to_string(lm_ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = " + std::to_string(lm_ggml_cpu_has_blas()) + " | ";
     s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | ";
@@ -20257,7 +20707,43 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};
+
+    if (ctx == nullptr) {
+        return data;
+    }
+
+    data.t_start_ms = 1e-3 * ctx->t_start_us;
+    data.t_load_ms = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval = std::max(1, ctx->n_p_eval);
+    data.n_eval = std::max(1, ctx->n_eval);
+
+    return data;
+}
+
+void llama_perf_context_print(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);
+
+    const double t_end_ms = 1e-3 * lm_ggml_time_us();
+
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+}
+
+void llama_perf_context_reset(struct llama_context * ctx) {
+    ctx->t_start_us = lm_ggml_time_us();
+    ctx->t_eval_us = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
+}
+
+void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
     fprintf(stream, "\n");
     fprintf(stream, "###########\n");
     fprintf(stream, "# Timings #\n");
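
For illustration only (not part of the package diff): the llama_perf_context_* functions added above replace the removed llama_get_timings()/llama_print_timings(); a sketch of reading them, noting that no_perf now defaults to true so timing collection must be enabled explicitly when the context is created:

    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = false; // opt back in to eval timing
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... decode some batches ...

    llama_perf_context_print(ctx); // logs load / prompt-eval / eval times
    struct llama_perf_context_data pd = llama_perf_context(ctx);
    double gen_tok_per_s = 1e3 * pd.n_eval / pd.t_eval_ms;
    llama_perf_context_reset(ctx);
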
@@ -20268,21 +20754,15 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
             1.0e-3 * ctx->t_eval_us / ctx->n_eval);
     fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
             1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
-    fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
-            1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample);
     fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
     fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
-    fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->sampling.n_sample);
     fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
     fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
     fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
-    fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->sampling.t_sample_us);
     fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
             1.0e6 * ctx->n_eval / ctx->t_eval_us);
     fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
             1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
-    fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
-            1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us);
 }
 
 // For internal test use
@@ -20334,3 +20814,20 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
     fputs(text, stderr);
     fflush(stderr);
 }
+
+struct llama_token_timings llama_get_token_timings(const void * v_ctx) {
+    const auto * ctx = (llama_context *) v_ctx;
+    struct llama_token_timings result = {
+        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
+        /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
+
+        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
+        /*.n_eval =*/ std::max(1, ctx->n_eval),
+    };
+
+    return result;
+}
+
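
For illustration only (not part of the package diff): a sketch of reading the per-context timing snapshot that cui-llama.rn adds here, assuming a llama_context * ctx that has already processed at least one batch and the struct layout shown above:

    struct llama_token_timings t = llama_get_token_timings(ctx);
    printf("prompt: %.2f ms over %d tokens, generation: %.2f ms over %d tokens\n",
            t.t_p_eval_ms, t.n_p_eval, t.t_eval_ms, t.n_eval);
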