cui-llama.rn 1.1.2 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.cpp CHANGED
@@ -1,6 +1,5 @@
  #include "llama-impl.h"
  #include "llama-vocab.h"
- #include "llama-grammar.h"
  #include "llama-sampling.h"
 
  #include "unicode.h"
@@ -223,6 +222,7 @@ enum llm_arch {
  LLM_ARCH_JAIS,
  LLM_ARCH_NEMOTRON,
  LLM_ARCH_EXAONE,
+ LLM_ARCH_RWKV6,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -270,6 +270,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_JAIS, "jais" },
  { LLM_ARCH_NEMOTRON, "nemotron" },
  { LLM_ARCH_EXAONE, "exaone" },
+ { LLM_ARCH_RWKV6, "rwkv6" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
 
@@ -306,6 +307,9 @@ enum llm_kv {
  LLM_KV_DECODER_START_TOKEN_ID,
  LLM_KV_ATTN_LOGIT_SOFTCAPPING,
  LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+ LLM_KV_RESCALE_EVERY_N_LAYERS,
+ LLM_KV_TIME_MIX_EXTRA_DIM,
+ LLM_KV_TIME_DECAY_EXTRA_DIM,
 
  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -341,6 +345,8 @@ enum llm_kv {
  LLM_KV_SSM_TIME_STEP_RANK,
  LLM_KV_SSM_DT_B_C_RMS,
 
+ LLM_KV_WKV_HEAD_SIZE,
+
  LLM_KV_TOKENIZER_MODEL,
  LLM_KV_TOKENIZER_PRE,
  LLM_KV_TOKENIZER_LIST,
@@ -400,11 +406,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
  { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
  { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
  { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
  { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
  { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
  { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+ { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
+ { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
+ { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
 
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -440,6 +449,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
  { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
 
+ { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
+
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -529,6 +540,29 @@ enum llm_tensor {
  LLM_TENSOR_SSM_A,
  LLM_TENSOR_SSM_D,
  LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_TIME_MIX_W1,
+ LLM_TENSOR_TIME_MIX_W2,
+ LLM_TENSOR_TIME_MIX_LERP_X,
+ LLM_TENSOR_TIME_MIX_LERP_W,
+ LLM_TENSOR_TIME_MIX_LERP_K,
+ LLM_TENSOR_TIME_MIX_LERP_V,
+ LLM_TENSOR_TIME_MIX_LERP_R,
+ LLM_TENSOR_TIME_MIX_LERP_G,
+ LLM_TENSOR_TIME_MIX_FIRST,
+ LLM_TENSOR_TIME_MIX_DECAY,
+ LLM_TENSOR_TIME_MIX_DECAY_W1,
+ LLM_TENSOR_TIME_MIX_DECAY_W2,
+ LLM_TENSOR_TIME_MIX_KEY,
+ LLM_TENSOR_TIME_MIX_VALUE,
+ LLM_TENSOR_TIME_MIX_RECEPTANCE,
+ LLM_TENSOR_TIME_MIX_GATE,
+ LLM_TENSOR_TIME_MIX_LN,
+ LLM_TENSOR_TIME_MIX_OUTPUT,
+ LLM_TENSOR_CHANNEL_MIX_LERP_K,
+ LLM_TENSOR_CHANNEL_MIX_LERP_R,
+ LLM_TENSOR_CHANNEL_MIX_KEY,
+ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
+ LLM_TENSOR_CHANNEL_MIX_VALUE,
  LLM_TENSOR_ATTN_Q_A,
  LLM_TENSOR_ATTN_Q_B,
  LLM_TENSOR_ATTN_KV_A_MQA,
@@ -1350,6 +1384,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_RWKV6,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
+ { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+ { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+ { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
+ { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" },
+ { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" },
+ { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
+ { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
+ { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
+ { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
+ { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
+ { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
+ { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
+ { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+ { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+ { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+ { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
+ { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
+ { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+ { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
+ { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" },
+ { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
+ { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
+ { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -2099,6 +2167,10 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buf
  if (host_buffer) {
  buft = lm_ggml_backend_sycl_host_buffer_type();
  }
+ #elif defined(LM_GGML_USE_CANN)
+ if (host_buffer) {
+ buft = lm_ggml_backend_cann_host_buffer_type();
+ }
  #elif defined(LM_GGML_USE_CPU_HBM)
  buft = lm_ggml_backend_cpu_hbm_buffer_type();
  #elif defined(LM_GGML_USE_VULKAN)
@@ -2162,6 +2234,7 @@ enum e_model {
  MODEL_1B,
  MODEL_1_3B,
  MODEL_1_4B,
+ MODEL_1_6B,
  MODEL_2B,
  MODEL_2_8B,
  MODEL_3B,
@@ -2239,6 +2312,12 @@ struct llama_hparams {
  float f_attn_logit_softcapping = 50.0f;
  float f_final_logit_softcapping = 30.0f;
 
+ // for RWKV
+ uint32_t rescale_every_n_layers = 0;
+ uint32_t time_mix_extra_dim = 0;
+ uint32_t time_decay_extra_dim = 0;
+ uint32_t wkv_head_size = 0;
+
  float rope_attn_factor = 1.0f;
  float rope_freq_base_train;
  float rope_freq_scale_train;
@@ -2302,6 +2381,11 @@ struct llama_hparams {
  if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
  if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
 
+ if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
+ if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
+ if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
+ if (this->wkv_head_size != other.wkv_head_size) return true;
+
  if (this->dec_start_token_id != other.dec_start_token_id) return true;
 
  const float EPSILON = 1e-9f;
@@ -2365,15 +2449,25 @@ struct llama_hparams {
  }
 
  uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
- // corresponds to Mamba's conv_states size
- // TODO: maybe support other convolution strides than 1
- // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
- return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+ // corresponds to Mamba's conv_states size or RWKV's token_shift states size
+ if (wkv_head_size != 0) {
+ // for RWKV models
+ return 2 * n_embd;
+ } else {
+ // TODO: maybe support other convolution strides than 1
+ // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+ return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+ }
  }
 
  uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
- // corresponds to Mamba's ssm_states size
- return ssm_d_state * ssm_d_inner;
+ if (wkv_head_size != 0) {
+ // corresponds to RWKV's wkv_states size
+ return n_embd * wkv_head_size;
+ } else {
+ // corresponds to Mamba's ssm_states size
+ return ssm_d_state * ssm_d_inner;
+ }
  }
  };
 
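Note on the hunk above: these two accessors decide how much recurrent state the cache reserves per layer, and the new wkv_head_size branch routes RWKV models away from the Mamba sizing. A minimal standalone sketch of that sizing rule follows (illustrative only, not part of the diff; the struct name and the sample sizes are assumptions):

    // Illustrative sketch, not part of the package: mirrors the state-sizing logic above.
    #include <cstdint>
    #include <cstdio>

    struct recurrent_hparams {
        uint32_t n_embd;
        uint32_t wkv_head_size; // non-zero only for RWKV models
        uint32_t ssm_d_conv;
        uint32_t ssm_d_inner;
        uint32_t ssm_d_state;

        uint32_t n_embd_k_s() const {
            // RWKV: two token-shift vectors per layer; Mamba: (d_conv - 1) columns of conv state
            return wkv_head_size != 0 ? 2 * n_embd
                                      : (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
        }
        uint32_t n_embd_v_s() const {
            // RWKV: wkv state of n_embd * head_size; Mamba: ssm state of d_state * d_inner
            return wkv_head_size != 0 ? n_embd * wkv_head_size : ssm_d_state * ssm_d_inner;
        }
    };

    int main() {
        recurrent_hparams rwkv = {2048, 64, 0, 0, 0}; // hypothetical RWKV-style sizes
        std::printf("k_s = %u, v_s = %u\n", (unsigned) rwkv.n_embd_k_s(), (unsigned) rwkv.n_embd_v_s());
        return 0;
    }
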
@@ -2384,8 +2478,8 @@ struct llama_cparams {
  uint32_t n_batch;
  uint32_t n_ubatch;
  uint32_t n_seq_max;
- uint32_t n_threads; // number of threads to use for generation
- uint32_t n_threads_batch; // number of threads to use for batch processing
+ int n_threads; // number of threads to use for generation
+ int n_threads_batch; // number of threads to use for batch processing
 
  float rope_freq_base;
  float rope_freq_scale;
@@ -2403,6 +2497,7 @@ struct llama_cparams {
  bool causal_attn;
  bool offload_kqv;
  bool flash_attn;
+ bool no_perf;
 
  enum llama_pooling_type pooling_type;
 
@@ -2512,6 +2607,36 @@ struct llama_layer {
  struct lm_ggml_tensor * ssm_conv1d_b;
  struct lm_ggml_tensor * ssm_dt_b;
 
+ // rwkv
+ struct lm_ggml_tensor * time_mix_w1;
+ struct lm_ggml_tensor * time_mix_w2;
+ struct lm_ggml_tensor * time_mix_lerp_x;
+ struct lm_ggml_tensor * time_mix_lerp_w;
+ struct lm_ggml_tensor * time_mix_lerp_k;
+ struct lm_ggml_tensor * time_mix_lerp_v;
+ struct lm_ggml_tensor * time_mix_lerp_r;
+ struct lm_ggml_tensor * time_mix_lerp_g;
+
+ struct lm_ggml_tensor * time_mix_first;
+ struct lm_ggml_tensor * time_mix_decay;
+ struct lm_ggml_tensor * time_mix_decay_w1;
+ struct lm_ggml_tensor * time_mix_decay_w2;
+ struct lm_ggml_tensor * time_mix_key;
+ struct lm_ggml_tensor * time_mix_value;
+ struct lm_ggml_tensor * time_mix_receptance;
+ struct lm_ggml_tensor * time_mix_gate;
+
+ struct lm_ggml_tensor * time_mix_ln;
+ struct lm_ggml_tensor * time_mix_ln_b;
+ struct lm_ggml_tensor * time_mix_output;
+
+ struct lm_ggml_tensor * channel_mix_lerp_k;
+ struct lm_ggml_tensor * channel_mix_lerp_r;
+
+ struct lm_ggml_tensor * channel_mix_key;
+ struct lm_ggml_tensor * channel_mix_receptance;
+ struct lm_ggml_tensor * channel_mix_value;
+
  // long rope factors
  struct lm_ggml_tensor * rope_long = nullptr;
  struct lm_ggml_tensor * rope_short = nullptr;
@@ -3069,7 +3194,6 @@ struct llama_sbatch {
  struct llama_context {
  llama_context(const llama_model & model)
  : model(model)
- , sampling(llama_n_vocab(&model))
  , t_start_us(model.t_start_us)
  , t_load_us(model.t_load_us) {}
 
@@ -3086,7 +3210,6 @@ struct llama_context {
  const struct llama_model & model;
 
  struct llama_cparams cparams;
- struct llama_sampling sampling;
  struct llama_sbatch sbatch;
  struct llama_kv_cache kv_self;
  struct llama_control_vector cvec;
@@ -3102,18 +3225,21 @@ struct llama_context {
  #endif
  lm_ggml_backend_t backend_cpu = nullptr;
 
+ lm_ggml_threadpool_t threadpool = nullptr;
+ lm_ggml_threadpool_t threadpool_batch = nullptr;
+
  bool has_evaluated_once = false;
 
- int64_t t_start_us;
- int64_t t_load_us;
- int64_t t_p_eval_us = 0;
- int64_t t_eval_us = 0;
+ mutable int64_t t_start_us;
+ mutable int64_t t_load_us;
+ mutable int64_t t_p_eval_us = 0;
+ mutable int64_t t_eval_us = 0;
 
- int64_t t_compute_start_us = 0;
- int64_t n_queued_tokens = 0;
+ mutable int64_t t_compute_start_us = 0;
+ mutable int64_t n_queued_tokens = 0;
 
- int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
- int32_t n_eval = 0; // number of eval calls
+ mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+ mutable int32_t n_eval = 0; // number of eval calls
 
  // host buffer for the model output (logits and embeddings)
  lm_ggml_backend_buffer_t buf_output = nullptr;
@@ -3233,29 +3359,33 @@ static size_t llama_get_device_count(const llama_model & model) {
  static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
  lm_ggml_backend_buffer_type_t buft = nullptr;
 
- #if defined(LM_GGML_USE_RPC)
- int dev_count = (int)llama_get_device_count(model);
+ #ifdef LM_GGML_USE_RPC
  int rpc_count = (int)model.rpc_servers.size();
- if (gpu >= dev_count - rpc_count) {
- const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+ #else
+ int rpc_count = 0;
+ #endif
+ int local_gpu = gpu - rpc_count;
+ #if defined(LM_GGML_USE_RPC)
+ if (gpu < rpc_count) {
+ const char * endpoint = model.rpc_servers[gpu].c_str();
  return lm_ggml_backend_rpc_buffer_type(endpoint);
  }
  #endif
  #if defined(LM_GGML_USE_METAL)
  buft = lm_ggml_backend_metal_buffer_type();
  #elif defined(LM_GGML_USE_CUDA)
- buft = lm_ggml_backend_cuda_buffer_type(gpu);
+ buft = lm_ggml_backend_cuda_buffer_type(local_gpu);
  #elif defined(LM_GGML_USE_VULKAN)
- buft = lm_ggml_backend_vk_buffer_type(gpu);
+ buft = lm_ggml_backend_vk_buffer_type(local_gpu);
  #elif defined(LM_GGML_USE_SYCL)
- buft = lm_ggml_backend_sycl_buffer_type(gpu);
+ buft = lm_ggml_backend_sycl_buffer_type(local_gpu);
  #elif defined(LM_GGML_USE_KOMPUTE)
- buft = lm_ggml_backend_kompute_buffer_type(gpu);
+ buft = lm_ggml_backend_kompute_buffer_type(local_gpu);
  if (buft == nullptr) {
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
  }
  #elif defined(LM_GGML_USE_CANN)
- buft = lm_ggml_backend_cann_buffer_type(gpu);
+ buft = lm_ggml_backend_cann_buffer_type(local_gpu);
  #endif
 
  if (buft == nullptr) {
@@ -3263,7 +3393,7 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const lla
  }
  return buft;
  LM_GGML_UNUSED(model);
- LM_GGML_UNUSED(gpu);
+ LM_GGML_UNUSED(local_gpu);
  }
 
  static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
@@ -3290,13 +3420,17 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
  }
 
  static size_t llama_get_device_memory(const llama_model & model, int device) {
- #if defined(LM_GGML_USE_RPC)
- int dev_count = (int)llama_get_device_count(model);
+ #ifdef LM_GGML_USE_RPC
  int rpc_count = (int)model.rpc_servers.size();
- if (device >= dev_count - rpc_count) {
+ #else
+ int rpc_count = 0;
+ #endif
+ int local_device = device - rpc_count;
+ #if defined(LM_GGML_USE_RPC)
+ if (device < rpc_count) {
  size_t total;
  size_t free;
- const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+ const char * endpoint = model.rpc_servers[device].c_str();
  lm_ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
  return free;
  }
@@ -3304,28 +3438,28 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
  #if defined(LM_GGML_USE_CUDA)
  size_t total;
  size_t free;
- lm_ggml_backend_cuda_get_device_memory(device, &free, &total);
+ lm_ggml_backend_cuda_get_device_memory(local_device, &free, &total);
  return free;
  #elif defined(LM_GGML_USE_SYCL)
  size_t total;
  size_t free;
- lm_ggml_backend_sycl_get_device_memory(device, &free, &total);
+ lm_ggml_backend_sycl_get_device_memory(local_device, &free, &total);
  return free;
  #elif defined(LM_GGML_USE_VULKAN)
  size_t total;
  size_t free;
- lm_ggml_backend_vk_get_device_memory(device, &free, &total);
+ lm_ggml_backend_vk_get_device_memory(local_device, &free, &total);
  return free;
  #elif defined(LM_GGML_USE_CANN)
  size_t total;
  size_t free;
- lm_ggml_backend_cann_get_device_memory(device, &free, &total);
+ lm_ggml_backend_cann_get_device_memory(local_device, &free, &total);
  return free;
  #else
  return 1;
  #endif
  LM_GGML_UNUSED(model);
- LM_GGML_UNUSED(device);
+ LM_GGML_UNUSED(local_device);
  }
 
  //
@@ -3434,7 +3568,7 @@ static bool llama_kv_cache_find_slot(
  const uint32_t n_seq_tokens = batch.n_seq_tokens;
 
  if (cache.recurrent) {
- // For recurrent state architectures (like Mamba),
+ // For recurrent state architectures (like Mamba or RWKV),
  // each cache cell can store the state for a whole sequence.
  // A slot should be always be contiguous.
 
@@ -3683,7 +3817,7 @@ static bool llama_kv_cache_seq_rm(
  if (p0 < 0) p0 = 0;
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
- // models like Mamba can't have a state partially erased
+ // models like Mamba or RWKV can't have a state partially erased
  if (cache.recurrent) {
  if (seq_id >= (int64_t) cache.size) {
  // could be fatal
@@ -3697,7 +3831,8 @@ static bool llama_kv_cache_seq_rm(
  if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
  return false;
  }
- if (p0 <= cell.pos && p1 < cell.pos) {
+ // invalidate tails which will be cleared
+ if (p0 <= cell.pos && cell.pos < p1) {
  tail_id = -1;
  }
  }
@@ -3819,7 +3954,7 @@ static void llama_kv_cache_seq_add(
  if (p0 == p1) return;
 
  if (cache.recurrent) {
- // for Mamba-like models, only the pos needs to be shifted
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
  if (0 <= seq_id && seq_id < (int64_t) cache.size) {
  const int32_t tail_id = cache.cells[seq_id].tail;
  if (tail_id >= 0) {
@@ -3868,7 +4003,7 @@ static void llama_kv_cache_seq_div(
  if (p0 == p1) return;
 
  if (cache.recurrent) {
- // for Mamba-like models, only the pos needs to be changed
+ // for Mamba-like or RWKV models, only the pos needs to be changed
  if (0 <= seq_id && seq_id < (int64_t) cache.size) {
  const int32_t tail_id = cache.cells[seq_id].tail;
  if (tail_id >= 0) {
@@ -4322,6 +4457,8 @@ struct llama_model_loader {
  case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
  case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
  case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+ case LM_GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
+ case LM_GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
  case LM_GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
  case LM_GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
  case LM_GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
@@ -5015,6 +5152,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
@@ -5059,6 +5198,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_1B: return "1B";
  case MODEL_1_3B: return "1.3B";
  case MODEL_1_4B: return "1.4B";
+ case MODEL_1_6B: return "1.6B";
  case MODEL_2B: return "2B";
  case MODEL_2_8B: return "2.8B";
  case MODEL_3B: return "3B";
@@ -5105,6 +5245,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
  case LLAMA_VOCAB_TYPE_BPE: return "BPE";
  case LLAMA_VOCAB_TYPE_WPM: return "WPM";
  case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+ case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
  default: return "unknown";
  }
  }
@@ -5801,6 +5942,26 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_RWKV6:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+ ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+ ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+ ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
+
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1_6B; break;
+ case 32:
+ switch (hparams.n_embd) {
+ case 2560: model.type = e_model::MODEL_3B; break;
+ case 4096: model.type = e_model::MODEL_7B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 61: model.type = e_model::MODEL_14B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }
 
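Note on the hunk above: the new RWKV6 case infers the size label from the layer count and embedding width. The same mapping, written out as a standalone function for reference (illustrative only, not part of the diff; the function name is an assumption):

    // Illustrative sketch, not part of the package: the RWKV6 size mapping above.
    #include <cstdint>
    #include <string>

    static std::string rwkv6_model_type(uint32_t n_layer, uint32_t n_embd) {
        if (n_layer == 24) return "1.6B";
        if (n_layer == 61) return "14B";
        if (n_layer == 32) {
            if (n_embd == 2560) return "3B";
            if (n_embd == 4096) return "7B";
        }
        return "unknown";
    }

    int main() { return rwkv6_model_type(24, 2048) == "1.6B" ? 0 : 1; }
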
@@ -5930,6 +6091,15 @@ static void llm_load_vocab(
  }
  #endif
  }
+ } else if (tokenizer_model == "rwkv") {
+ vocab.type = LLAMA_VOCAB_TYPE_RWKV;
+
+ // default special tokens
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = -1;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
  } else {
  throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
  }
@@ -6061,6 +6231,12 @@ static void llm_load_vocab(
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  vocab.tokenizer_add_bos = false;
  vocab.tokenizer_add_eos = true;
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ vocab.tokenizer_add_space_prefix = false;
+ vocab.tokenizer_clean_spaces = false;
+ vocab.tokenizer_add_bos = false;
+ vocab.tokenizer_add_eos = false;
  } else {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  }
@@ -6088,6 +6264,7 @@ static void llm_load_vocab(
 
  const uint32_t n_vocab = lm_gguf_get_arr_n(ctx, token_idx);
 
+ vocab.n_vocab = n_vocab;
  vocab.id_to_token.resize(n_vocab);
 
  for (uint32_t i = 0; i < n_vocab; i++) {
@@ -6165,6 +6342,10 @@ static void llm_load_vocab(
  }
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
  vocab.linefeed_id = vocab.special_pad_id;
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
+ const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
+ LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+ vocab.linefeed_id = ids[0];
  } else {
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
  LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -6234,6 +6415,11 @@ static void llm_load_vocab(
  )
  ) {
  vocab.special_eot_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
  break;
  }
  }
@@ -6247,6 +6433,11 @@ static void llm_load_vocab(
  const auto & t = vocab.token_to_id.find("<|eom_id|>");
  if (t != vocab.token_to_id.end()) {
  vocab.special_eom_id = t->second;
+ if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t->first.c_str());
+ vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
  }
  }
  }
@@ -6482,8 +6673,6 @@ static bool llm_load_tensors(
  bool use_mlock,
  llama_progress_callback progress_callback,
  void * progress_callback_user_data) {
- model.t_start_us = lm_ggml_time_us();
-
  auto & hparams = model.hparams;
 
  model.split_mode = split_mode;
@@ -7955,23 +8144,23 @@ static bool llm_load_tensors(
  layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
 
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
+ layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
+ layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
+ layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
+ layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
 
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
- layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
+ layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
+ layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
+ layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  }
  } break;
  case LLM_ARCH_T5:
@@ -8211,6 +8400,68 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_RWKV6:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // Block 0, LN0
+ model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+ model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+ const int head_size = hparams.wkv_head_size;
+ const int attn_hidden_size = n_embd;
+ const int ffn_size = hparams.n_ff_arr[0];
+
+ for (int i = 0; i < n_layer; ++i) {
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
+
+ layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5});
+ layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5});
+
+ layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});
+
+ layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
+ layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
+ layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
+ layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size});
+ layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd});
+ layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd});
+ layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
+ layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd});
+
+ layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
+ layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
+ layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
+
+ layer.channel_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
+ layer.channel_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
+
+ layer.channel_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size});
+ layer.channel_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd});
+ layer.channel_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd});
+ }
+
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -8352,14 +8603,13 @@ static bool llm_load_tensors(
  }
  }
 
- // loading time will be recalculate after the first eval, so
- // we take page faults deferred by mmap() into consideration
- model.t_load_us = lm_ggml_time_us() - model.t_start_us;
  return true;
  }
 
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+ model.t_start_us = lm_ggml_time_us();
+
  try {
  llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8421,6 +8671,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  return -1;
  }
 
+ // loading time will be recalculate after the first eval, so
+ // we take page faults deferred by mmap() into consideration
+ model.t_load_us = lm_ggml_time_us() - model.t_start_us;
+
  return 0;
  }
 
@@ -8495,8 +8749,7 @@ static void llm_build_kv_store(
 
  LM_GGML_ASSERT(kv.size == n_ctx);
 
- struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
- (lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
+ struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head);
  cb(k_cache_view, "k_cache_view", il);
 
  // note: storing RoPE-ed version of K in the KV cache
@@ -8507,8 +8760,7 @@ static void llm_build_kv_store(
  struct lm_ggml_tensor * v_cache_view = nullptr;
 
  if (cparams.flash_attn) {
- v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
- (kv_head)*lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+ v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head);
  } else {
  // note: the V cache is transposed when not using flash attention
  v_cache_view = lm_ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
@@ -8995,8 +9247,7 @@ static struct lm_ggml_tensor * llm_build_kv(
 
  struct lm_ggml_tensor * cur;
 
- cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b,
- q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
+ cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
  cb(cur, "kqv_out", il);
 
  return cur;
@@ -9024,7 +9275,7 @@ static struct lm_ggml_tensor * llm_build_copy_mask_state(
  // FIXME: zero-out NANs?
  states = lm_ggml_mul(ctx, states, state_mask);
 
- // copy states which won't be changed further (between n_seqs and n_rs)
+ // copy states which won't be changed further (between n_seqs and n_kv)
  lm_ggml_build_forward_expand(graph,
  lm_ggml_cpy(ctx,
  lm_ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*lm_ggml_element_size(states)),
@@ -9170,6 +9421,171 @@ static struct lm_ggml_tensor * llm_build_mamba(
  return cur;
  }
 
+ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
+ struct llama_context & lctx,
+ struct lm_ggml_context * ctx,
+ const struct llama_layer * layer,
+ struct lm_ggml_tensor * cur,
+ struct lm_ggml_tensor * x_prev,
+ struct lm_ggml_tensor ** wkv_state) {
+ size_t n_embed = cur->ne[0];
+ size_t n_seq_tokens = cur->ne[1];
+ size_t n_seqs = cur->ne[2];
+
+ size_t head_size = layer->time_mix_first->ne[0];
+ size_t head_count = layer->time_mix_first->ne[1];
+
+ size_t n_tokens = n_seqs * n_seq_tokens;
+
+ struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
+
+ sx = lm_ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+
+ struct lm_ggml_tensor * xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
+
+ xxx = lm_ggml_reshape_4d(
+ ctx,
+ lm_ggml_tanh(
+ ctx,
+ lm_ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
+ ),
+ layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
+ );
+
+ xxx = lm_ggml_cont(ctx, lm_ggml_permute(ctx, xxx, 0, 1, 3, 2));
+
+ xxx = lm_ggml_mul_mat(
+ ctx,
+ lm_ggml_reshape_4d(
+ ctx,
+ layer->time_mix_w2,
+ layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5
+ ),
+ xxx
+ );
+
+ struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
+ struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
+ struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
+ struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
+ struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
+
+ struct lm_ggml_tensor * xw = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mw, layer->time_mix_lerp_w),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * xk = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mk, layer->time_mix_lerp_k),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * xv = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mv, layer->time_mix_lerp_v),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * xr = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mr, layer->time_mix_lerp_r),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * xg = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mg, layer->time_mix_lerp_g),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * r = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
+ struct lm_ggml_tensor * k = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
+ struct lm_ggml_tensor * v = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
+ struct lm_ggml_tensor * g = lm_ggml_silu(
+ ctx,
+ llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
+ );
+
+ struct lm_ggml_tensor * w = lm_ggml_mul_mat(
+ ctx,
+ layer->time_mix_decay_w2,
+ lm_ggml_tanh(
+ ctx,
+ lm_ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
+ )
+ );
+
+ w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
+ w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
+ w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
+
+ k = lm_ggml_transpose(ctx, k);
+ v = lm_ggml_transpose(ctx, v);
+ r = lm_ggml_transpose(ctx, r);
+
+ struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+ cur = lm_ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
+ *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
+
+ // group norm with head_count groups
+ cur = lm_ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
+ cur = lm_ggml_norm(ctx, cur, 64e-5f);
+
+ // Convert back to regular vectors.
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+ cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
+
+ cur = lm_ggml_mul(ctx, cur, g);
+ cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
+
+ return lm_ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
+ }
+
+ static struct lm_ggml_tensor * llm_build_rwkv6_channel_mix(
+ struct llama_context & lctx,
+ struct lm_ggml_context * ctx,
+ const struct llama_layer * layer,
+ struct lm_ggml_tensor * cur,
+ struct lm_ggml_tensor * x_prev) {
+ struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
+ struct lm_ggml_tensor * xk = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
+ struct lm_ggml_tensor * xr = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
+
+ struct lm_ggml_tensor * r = lm_ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
+ struct lm_ggml_tensor * k = lm_ggml_sqr(
+ ctx,
+ lm_ggml_relu(
+ ctx,
+ llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
+ )
+ );
+
+ return lm_ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
+ }
+
  struct llm_build_context {
  const llama_model & model;
  llama_context & lctx;
@@ -9478,8 +9894,8 @@ struct llm_build_context {
  struct lm_ggml_cgraph * append_pooling(struct lm_ggml_cgraph * gf) {
  // find result_norm tensor for input
  struct lm_ggml_tensor * inp = nullptr;
- for (int i = gf->n_nodes - 1; i >= 0; --i) {
- inp = gf->nodes[i];
+ for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+ inp = lm_ggml_graph_node(gf, i);
  if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
  break;
  } else {
@@ -13790,7 +14206,9 @@ struct llm_build_context {
  {
  // compute Q and K and RoPE them
  struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
- Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+ if (model.layers[il].wq_scale) {
+ Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+ }
  cb(Qcur, "Qcur", il);
  if (model.layers[il].bq) {
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -13799,7 +14217,9 @@ struct llm_build_context {
 
  // B1.K
  struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
- Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+ if (model.layers[il].wk_scale) {
+ Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+ }
  cb(Kcur, "Kcur", il);
  if (model.layers[il].bk) {
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -13808,7 +14228,9 @@ struct llm_build_context {
 
  // B1.V
  struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
- Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+ if (model.layers[il].wv_scale) {
+ Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+ }
  cb(Vcur, "Vcur", il);
  if (model.layers[il].bv) {
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -13839,7 +14261,9 @@ struct llm_build_context {
  cb(cur, "attn_sub_norm", il);
 
  cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
- cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+ if (model.layers[il].wo_scale) {
+ cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+ }
  if (model.layers[il].bo) {
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bo);
  }
@@ -13876,7 +14300,9 @@ struct llm_build_context {
  cb(cur, "ffn_sub_norm", il);
 
  cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
- cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+ if (model.layers[il].ffn_down_scale) {
+ cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+ }
  cb(cur, "ffn_down", il);
 
  cur = lm_ggml_add(ctx0, cur, ffn_inp);
@@ -14691,6 +15117,117 @@ struct llm_build_context {
 
  return gf;
  }
+
+ lm_ggml_cgraph * build_rwkv6() {
+ lm_ggml_cgraph *gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // Token shift state dimensions should be 2 * n_emb
+ LM_GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
+
+ const int64_t n_seqs = batch.n_seqs;
+ const int64_t n_seq_tokens = batch.n_seq_tokens;
+ const int64_t n_tokens = batch.n_tokens;
+ LM_GGML_ASSERT(n_seqs != 0);
+ LM_GGML_ASSERT(batch.equal_seqs);
+ LM_GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
+
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;
+ struct lm_ggml_tensor * state_copy = build_inp_s_copy();
+ struct lm_ggml_tensor * state_mask = build_inp_s_mask();
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+
+ for (int il = 0; il < n_layer; ++il) {
+ const llama_layer * layer = &model.layers[il];
+
+ // (ab)using the KV cache to store the states
+ struct lm_ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
+ gf, kv_self.k_l[il], state_copy, state_mask,
+ hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
+ struct lm_ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
+ gf, kv_self.v_l[il], state_copy, state_mask,
+ hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
+
+ cur = lm_ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+ token_shift = lm_ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);
+
+ struct lm_ggml_tensor * att_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+ struct lm_ggml_tensor * ffn_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * lm_ggml_element_size(token_shift));
+
+ struct lm_ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
+ struct lm_ggml_tensor * x_prev = lm_ggml_concat(
+ ctx0,
+ att_shift,
+ lm_ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
+ 1
+ );
+
+ cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
+ lm_ggml_build_forward_expand(gf, cur);
+ lm_ggml_build_forward_expand(
+ gf,
+ lm_ggml_cpy(
+ ctx0,
+ wkv_states,
+ lm_ggml_view_1d(
+ ctx0,
+ kv_self.v_l[il],
+ hparams.n_embd_v_s() * n_seqs,
+ hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
+ )
+ )
+ );
+
+ struct lm_ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
+ x_prev = lm_ggml_concat(
+ ctx0,
+ ffn_shift,
+ lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
+ 1
+ );
+ cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
+ lm_ggml_build_forward_expand(gf, cur);
+
+ struct lm_ggml_tensor * last_norm_att = lm_ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_att));
+ struct lm_ggml_tensor * last_norm_ffn = lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_ffn));
+
+ token_shift = lm_ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
+
+ lm_ggml_build_forward_expand(
+ gf,
+ lm_ggml_cpy(
+ ctx0,
+ lm_ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
+ lm_ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * lm_ggml_element_size(kv_self.k_l[il]))
+ )
+ );
+
+ if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
+ cur = lm_ggml_scale(ctx0, cur, 0.5F);
+ }
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = lm_ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+ cb(cur, "result_output", -1);
+ lm_ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };
 
  static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -14937,6 +15474,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
  {
  result = llm.build_exaone();
  } break;
+ case LLM_ARCH_RWKV6:
+ {
+ result = llm.build_rwkv6();
+ } break;
  default:
  LM_GGML_ABORT("fatal error");
  }
@@ -15296,7 +15837,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 
  // clear unused states
  for (int i = 0; i < n_kv; ++i) {
- uint32_t cell_id = i + kv_self.head;
+ const uint32_t cell_id = i + kv_self.head;
  llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
 
  data[i] = (float) (kv_cell.src >= 0);
@@ -15505,9 +16046,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
  }
 
  static void llama_graph_compute(
- llama_context & lctx,
- lm_ggml_cgraph * gf,
- int n_threads) {
+ llama_context & lctx,
+ lm_ggml_cgraph * gf,
+ int n_threads,
+ lm_ggml_threadpool * threadpool) {
  #ifdef LM_GGML_USE_METAL
  if (lm_ggml_backend_is_metal(lctx.backend_metal)) {
  lm_ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -15516,6 +16058,7 @@ static void llama_graph_compute(
 
  if (lctx.backend_cpu != nullptr) {
  lm_ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+ lm_ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
  lm_ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
  }
  #ifdef LM_GGML_USE_BLAS
@@ -15556,6 +16099,15 @@ static int llama_decode_internal(
 
  LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
 
+ if (batch_all.token) {
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
+ if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+ return -1;
+ }
+ }
+ }
+
  LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
  LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
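Note on the hunk above: the new guard rejects a batch as soon as any token id falls outside the vocabulary, rather than failing later inside the compute graph. A minimal standalone sketch of the same range check follows (illustrative only, not part of the diff; the helper name and the std::vector input are assumptions):

    // Illustrative sketch, not part of the package: pre-decode token range check.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static bool tokens_in_vocab(const std::vector<int32_t> & tokens, uint32_t n_vocab) {
        for (size_t i = 0; i < tokens.size(); ++i) {
            if (tokens[i] < 0 || (uint32_t) tokens[i] >= n_vocab) {
                std::fprintf(stderr, "invalid token[%zu] = %d\n", i, tokens[i]);
                return false; // mirrors the early "return -1" in the hunk above
            }
        }
        return true;
    }

    int main() {
        std::vector<int32_t> batch = {1, 42, 65536}; // hypothetical ids, last one out of range
        std::printf("ok = %d\n", tokens_in_vocab(batch, 32000) ? 1 : 0);
        return 0;
    }
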
@@ -15636,6 +16188,8 @@ static int llama_decode_internal(
15636
16188
  }
15637
16189
 
15638
16190
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
16191
+ lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
16192
+
15639
16193
  LM_GGML_ASSERT(n_threads > 0);
15640
16194
 
15641
16195
  // non-causal masks do not use the KV cache
@@ -15670,8 +16224,8 @@ static int llama_decode_internal(
15670
16224
  lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
15671
16225
 
15672
16226
  // the output is always the last tensor in the graph
15673
- struct lm_ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
15674
- struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
16227
+ struct lm_ggml_tensor * res = lm_ggml_graph_node(gf, -1);
16228
+ struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);
15675
16229
 
15676
16230
  if (lctx.n_outputs == 0) {
15677
16231
  // no output
@@ -15680,9 +16234,9 @@ static int llama_decode_internal(
15680
16234
  } else if (cparams.embeddings) {
15681
16235
  res = nullptr; // do not extract logits for embedding case
15682
16236
  embd = nullptr;
15683
- for (int i = gf->n_nodes - 1; i >= 0; --i) {
15684
- if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
15685
- embd = gf->nodes[i];
16237
+ for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
16238
+ if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
16239
+ embd = lm_ggml_graph_node(gf, i);
15686
16240
  break;
15687
16241
  }
15688
16242
  }
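Editor's note: the direct gf->nodes[...] field accesses above are replaced with the lm_ggml_graph_node()/lm_ggml_graph_n_nodes() accessors; negative indices count back from the end of the graph. A short sketch of the idiom used in this hunk:

    // Last two nodes of a built graph, plus a reverse search by tensor name.
    struct lm_ggml_tensor * res  = lm_ggml_graph_node(gf, -1);   // same node as index n_nodes - 1
    struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);
    for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
            embd = lm_ggml_graph_node(gf, i);
            break;
        }
    }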
@@ -15697,7 +16251,7 @@ static int llama_decode_internal(
15697
16251
 
15698
16252
  llama_set_inputs(lctx, ubatch);
15699
16253
 
15700
- llama_graph_compute(lctx, gf, n_threads);
16254
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
15701
16255
 
15702
16256
  // update the kv ring buffer
15703
16257
  {
@@ -15846,6 +16400,15 @@ static int llama_encode_internal(
15846
16400
 
15847
16401
  LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
15848
16402
 
16403
+ if (batch.token) {
16404
+ for (uint32_t i = 0; i < n_tokens; ++i) {
16405
+ if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
16406
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
16407
+ return -1;
16408
+ }
16409
+ }
16410
+ }
16411
+
15849
16412
  // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
15850
16413
  LM_GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
15851
16414
 
@@ -15874,7 +16437,9 @@ static int llama_encode_internal(
15874
16437
  lctx.inp_embd_enc = NULL;
15875
16438
  lctx.n_outputs = n_tokens;
15876
16439
 
15877
- const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
16440
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
16441
+ lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
16442
+
15878
16443
  LM_GGML_ASSERT(n_threads > 0);
15879
16444
 
15880
16445
  lm_ggml_backend_sched_reset(lctx.sched);
@@ -15888,15 +16453,15 @@ static int llama_encode_internal(
15888
16453
  // there are two cases here
15889
16454
  if (llama_model_has_decoder(&lctx.model)) {
15890
16455
  // first case is an encoder-decoder T5 model where embeddings are passed to decoder
15891
- embd = gf->nodes[gf->n_nodes - 1];
16456
+ embd = lm_ggml_graph_node(gf, -1);
15892
16457
  LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
15893
16458
  } else {
15894
16459
  // second case is an encoder-only T5 model
15895
16460
  if (cparams.embeddings) {
15896
16461
  // only output embeddings if required
15897
- embd = gf->nodes[gf->n_nodes - 1];
16462
+ embd = lm_ggml_graph_node(gf, -1);
15898
16463
  if (strcmp(embd->name, "result_embd_pooled") != 0) {
15899
- embd = gf->nodes[gf->n_nodes - 2];
16464
+ embd = lm_ggml_graph_node(gf, -2);
15900
16465
  }
15901
16466
  LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
15902
16467
  }
@@ -15906,7 +16471,7 @@ static int llama_encode_internal(
15906
16471
 
15907
16472
  llama_set_inputs(lctx, ubatch);
15908
16473
 
15909
- llama_graph_compute(lctx, gf, n_threads);
16474
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
15910
16475
 
15911
16476
  // extract embeddings
15912
16477
  if (embd) {
@@ -16188,7 +16753,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
16188
16753
 
16189
16754
  lm_ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
16190
16755
 
16191
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
16756
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
16192
16757
  #endif
16193
16758
 
16194
16759
  //const int64_t t_end = lm_ggml_time_us();
@@ -16214,7 +16779,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
16214
16779
 
16215
16780
  llama_set_k_shift(lctx);
16216
16781
 
16217
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
16782
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
16218
16783
 
16219
16784
  need_reserve = true;
16220
16785
  }
@@ -16425,6 +16990,9 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
16425
16990
  new_type == LM_GGML_TYPE_Q4_0_8_8) {
16426
16991
  new_type = LM_GGML_TYPE_Q4_0;
16427
16992
  }
16993
+ else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
16994
+ new_type = LM_GGML_TYPE_Q4_K;
16995
+ }
16428
16996
  }
16429
16997
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
16430
16998
  ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -16624,6 +17192,8 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
16624
17192
  }
16625
17193
  if (convert_incompatible_tensor) {
16626
17194
  switch (new_type) {
17195
+ case LM_GGML_TYPE_TQ1_0:
17196
+ case LM_GGML_TYPE_TQ2_0: new_type = LM_GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
16627
17197
  case LM_GGML_TYPE_IQ2_XXS:
16628
17198
  case LM_GGML_TYPE_IQ2_XS:
16629
17199
  case LM_GGML_TYPE_IQ2_S:
@@ -16729,6 +17299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16729
17299
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
16730
17300
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = LM_GGML_TYPE_Q5_K; break;
16731
17301
  case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = LM_GGML_TYPE_Q6_K; break;
17302
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = LM_GGML_TYPE_TQ1_0; break;
17303
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = LM_GGML_TYPE_TQ2_0; break;
16732
17304
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = LM_GGML_TYPE_IQ2_XXS; break;
16733
17305
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = LM_GGML_TYPE_IQ2_XS; break;
16734
17306
  case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = LM_GGML_TYPE_IQ2_XS; break;
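Editor's note: the new ternary formats plug into the regular quantization entry point through the LLAMA_FTYPE_MOSTLY_TQ1_0/TQ2_0 ftypes mapped above. A hedged sketch (the file names are placeholders):

    // Quantize an f16 GGUF to the ternary TQ1_0 format.
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_TQ1_0;   // or LLAMA_FTYPE_MOSTLY_TQ2_0
    if (llama_model_quantize("model-f16.gguf", "model-tq1_0.gguf", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
    }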
@@ -16833,7 +17405,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16833
17405
 
16834
17406
  // TODO: avoid hardcoded tensor names - use the TN_* constants
16835
17407
  if (name.find("attn_v.weight") != std::string::npos ||
16836
- name.find("attn_qkv.weight") != std::string::npos) {
17408
+ name.find("attn_qkv.weight") != std::string::npos ||
17409
+ name.find("attn_kv_b.weight")!= std::string::npos) {
16837
17410
  ++qs.n_attention_wv;
16838
17411
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
16839
17412
  qs.has_output = true;
@@ -16974,6 +17547,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16974
17547
  // NOTE: can't use LLM_TN here because the layer number is not known
16975
17548
  quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
16976
17549
 
17550
+ // do not quantize RWKV's time_mix_first tensors
17551
+ quantize &= name.find("time_mix_first.weight") == std::string::npos;
17552
+ quantize &= name.find("time_mix_w1.weight") == std::string::npos;
17553
+ quantize &= name.find("time_mix_w2.weight") == std::string::npos;
17554
+ quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
17555
+ quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
17556
+
16977
17557
  // do not quantize relative position bias (T5)
16978
17558
  quantize &= name.find("attn_rel_b.weight") == std::string::npos;
16979
17559
 
@@ -17357,7 +17937,6 @@ struct llama_model_params llama_model_default_params() {
17357
17937
 
17358
17938
  struct llama_context_params llama_context_default_params() {
17359
17939
  struct llama_context_params result = {
17360
- /*.seed =*/ LLAMA_DEFAULT_SEED,
17361
17940
  /*.n_ctx =*/ 512,
17362
17941
  /*.n_batch =*/ 2048,
17363
17942
  /*.n_ubatch =*/ 512,
@@ -17383,6 +17962,7 @@ struct llama_context_params llama_context_default_params() {
17383
17962
  /*.embeddings =*/ false,
17384
17963
  /*.offload_kqv =*/ true,
17385
17964
  /*.flash_attn =*/ false,
17965
+ /*.no_perf =*/ true,
17386
17966
  /*.abort_callback =*/ nullptr,
17387
17967
  /*.abort_callback_data =*/ nullptr,
17388
17968
  };
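Editor's note: with the seed field dropped from llama_context_params and no_perf defaulting to true, per-context timing is now opt-in. A minimal sketch of turning it back on at context creation:

    // Collect prompt/eval timings for this context (skipped by default in this release).
    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = false;
    llama_context * ctx = llama_new_context_with_model(model, cparams);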
@@ -17390,6 +17970,14 @@ struct llama_context_params llama_context_default_params() {
17390
17970
  return result;
17391
17971
  }
17392
17972
 
17973
+ struct llama_sampler_chain_params llama_sampler_chain_default_params() {
17974
+ struct llama_sampler_chain_params result = {
17975
+ /*.no_perf =*/ true,
17976
+ };
17977
+
17978
+ return result;
17979
+ }
17980
+
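Editor's note: llama_sampler_chain_default_params() is the entry point of the new sampler-chain API. A hedged sketch, assuming the llama_sampler_chain_init/_add and llama_sampler_init_* constructors that ship with the same sampling refactor (verify against llama.h in this release):

    // Build a small sampling chain: top-k -> temperature -> seeded final pick.
    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false;                                        // keep sampler timing counters
    llama_sampler * chain = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234)); // seed
    llama_token tok = llama_sampler_sample(chain, ctx, -1);        // sample from the last logits
    llama_sampler_free(chain);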
17393
17981
  struct llama_model_quantize_params llama_model_quantize_default_params() {
17394
17982
  struct llama_model_quantize_params result = {
17395
17983
  /*.nthread =*/ 0,
@@ -17461,6 +18049,19 @@ void llama_numa_init(enum lm_ggml_numa_strategy numa) {
17461
18049
  }
17462
18050
  }
17463
18051
 
18052
+ void llama_attach_threadpool(
18053
+ struct llama_context * ctx,
18054
+ lm_ggml_threadpool_t threadpool,
18055
+ lm_ggml_threadpool_t threadpool_batch) {
18056
+ ctx->threadpool = threadpool;
18057
+ ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
18058
+ }
18059
+
18060
+ void llama_detach_threadpool(struct llama_context * ctx) {
18061
+ ctx->threadpool = nullptr;
18062
+ ctx->threadpool_batch = nullptr;
18063
+ }
18064
+
17464
18065
  void llama_backend_free(void) {
17465
18066
  lm_ggml_quantize_free();
17466
18067
  }
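Editor's note: llama_attach_threadpool/llama_detach_threadpool let callers pin decoding to an explicit ggml threadpool; a null batch pool falls back to the single-token pool, as the implementation above shows. A hedged sketch, assuming the lm_ggml_threadpool_* helpers added alongside this change:

    // Create a dedicated threadpool and hand it to the context.
    struct lm_ggml_threadpool_params tpp = lm_ggml_threadpool_params_default(8);  // 8 workers (assumed helper)
    lm_ggml_threadpool_t tp = lm_ggml_threadpool_new(&tpp);
    llama_attach_threadpool(ctx, tp, /*threadpool_batch =*/ nullptr);  // nullptr -> reuse tp for batch decoding
    // ... llama_decode()/llama_encode() calls now run on tp ...
    llama_detach_threadpool(ctx);
    lm_ggml_threadpool_free(tp);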
@@ -17572,6 +18173,7 @@ struct llama_context * llama_new_context_with_model(
17572
18173
  cparams.embeddings = params.embeddings;
17573
18174
  cparams.offload_kqv = params.offload_kqv;
17574
18175
  cparams.flash_attn = params.flash_attn;
18176
+ cparams.no_perf = params.no_perf;
17575
18177
  cparams.pooling_type = params.pooling_type;
17576
18178
 
17577
18179
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -17630,10 +18232,6 @@ struct llama_context * llama_new_context_with_model(
17630
18232
  cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
17631
18233
  }
17632
18234
 
17633
- if (params.seed == LLAMA_DEFAULT_SEED) {
17634
- params.seed = time(NULL);
17635
- }
17636
-
17637
18235
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
17638
18236
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
17639
18237
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
@@ -17644,10 +18242,10 @@ struct llama_context * llama_new_context_with_model(
17644
18242
  ctx->abort_callback = params.abort_callback;
17645
18243
  ctx->abort_callback_data = params.abort_callback_data;
17646
18244
 
17647
- ctx->sampling.rng = std::mt19937(params.seed);
17648
- ctx->logits_all = params.logits_all;
18245
+ ctx->logits_all = params.logits_all;
18246
+
17649
18247
  // build worst-case graph for encoder if a model contains encoder
17650
- ctx->is_encoding = llama_model_has_encoder(model);
18248
+ ctx->is_encoding = llama_model_has_encoder(model);
17651
18249
 
17652
18250
  uint32_t kv_size = cparams.n_ctx;
17653
18251
  lm_ggml_type type_k = params.type_k;
@@ -17667,6 +18265,20 @@ struct llama_context * llama_new_context_with_model(
17667
18265
 
17668
18266
  if (!hparams.vocab_only) {
17669
18267
  // initialize backends
18268
+ #if defined(LM_GGML_USE_RPC)
18269
+ if (model->n_gpu_layers > 0) {
18270
+ for (const auto & endpoint : model->rpc_servers) {
18271
+ lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
18272
+ if (backend == nullptr) {
18273
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
18274
+ llama_free(ctx);
18275
+ return nullptr;
18276
+ }
18277
+ ctx->backends.push_back(backend);
18278
+ }
18279
+ }
18280
+ #endif
18281
+
17670
18282
  #if defined(LM_GGML_USE_METAL)
17671
18283
  if (model->n_gpu_layers > 0) {
17672
18284
  ctx->backend_metal = lm_ggml_backend_metal_init();
@@ -17791,19 +18403,6 @@ struct llama_context * llama_new_context_with_model(
17791
18403
  }
17792
18404
  #endif
17793
18405
 
17794
- #if defined(LM_GGML_USE_RPC)
17795
- if (model->n_gpu_layers > 0) {
17796
- for (const auto & endpoint : model->rpc_servers) {
17797
- lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
17798
- if (backend == nullptr) {
17799
- LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
17800
- llama_free(ctx);
17801
- return nullptr;
17802
- }
17803
- ctx->backends.push_back(backend);
17804
- }
17805
- }
17806
- #endif
17807
18406
  ctx->backend_cpu = lm_ggml_backend_cpu_init();
17808
18407
  if (ctx->backend_cpu == nullptr) {
17809
18408
  LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -17912,7 +18511,7 @@ struct llama_context * llama_new_context_with_model(
17912
18511
 
17913
18512
  // note: the number of splits during measure is higher than during inference due to the kv shift
17914
18513
  int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched);
17915
- LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
18514
+ LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, lm_ggml_graph_n_nodes(gf));
17916
18515
  LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
17917
18516
  }
17918
18517
  }
@@ -17924,14 +18523,6 @@ void llama_free(struct llama_context * ctx) {
17924
18523
  delete ctx;
17925
18524
  }
17926
18525
 
17927
- const struct llama_model * llama_get_model(const struct llama_context * ctx) {
17928
- return &ctx->model;
17929
- }
17930
-
17931
- const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
17932
- return &ctx->model.vocab;
17933
- }
17934
-
17935
18526
  uint32_t llama_n_ctx(const struct llama_context * ctx) {
17936
18527
  return ctx->cparams.n_ctx;
17937
18528
  }
@@ -17952,6 +18543,30 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
17952
18543
  return model->vocab.type;
17953
18544
  }
17954
18545
 
18546
+ int32_t llama_n_vocab(const struct llama_model * model) {
18547
+ return model->hparams.n_vocab;
18548
+ }
18549
+
18550
+ int32_t llama_n_ctx_train(const struct llama_model * model) {
18551
+ return model->hparams.n_ctx_train;
18552
+ }
18553
+
18554
+ int32_t llama_n_embd(const struct llama_model * model) {
18555
+ return model->hparams.n_embd;
18556
+ }
18557
+
18558
+ int32_t llama_n_layer(const struct llama_model * model) {
18559
+ return model->hparams.n_layer;
18560
+ }
18561
+
18562
+ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
18563
+ return &ctx->model;
18564
+ }
18565
+
18566
+ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
18567
+ return ctx->cparams.pooling_type;
18568
+ }
18569
+
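Editor's note: the getters added here are only relocated from further down in the file, not changed; they remain the supported way to query basic model/context shapes. A tiny usage sketch:

    const struct llama_model * model = llama_get_model(ctx);
    const int32_t n_vocab = llama_n_vocab(model);        // vocabulary size
    const int32_t n_embd  = llama_n_embd(model);         // embedding width
    const int32_t n_layer = llama_n_layer(model);        // transformer layers
    const enum llama_pooling_type pt = llama_pooling_type(ctx);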
17955
18570
  enum llama_rope_type llama_rope_type(const struct llama_model * model) {
17956
18571
  switch (model->arch) {
17957
18572
  // these models do not use RoPE
@@ -17965,6 +18580,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
17965
18580
  case LLM_ARCH_T5:
17966
18581
  case LLM_ARCH_T5ENCODER:
17967
18582
  case LLM_ARCH_JAIS:
18583
+ case LLM_ARCH_RWKV6:
17968
18584
  return LLAMA_ROPE_TYPE_NONE;
17969
18585
 
17970
18586
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18014,26 +18630,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
18014
18630
  return LLAMA_ROPE_TYPE_NONE;
18015
18631
  }
18016
18632
 
18017
- enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
18018
- return ctx->cparams.pooling_type;
18019
- }
18020
-
18021
- int32_t llama_n_vocab(const struct llama_model * model) {
18022
- return model->hparams.n_vocab;
18023
- }
18024
-
18025
- int32_t llama_n_ctx_train(const struct llama_model * model) {
18026
- return model->hparams.n_ctx_train;
18027
- }
18028
-
18029
- int32_t llama_n_embd(const struct llama_model * model) {
18030
- return model->hparams.n_embd;
18031
- }
18032
-
18033
- int32_t llama_n_layer(const struct llama_model * model) {
18034
- return model->hparams.n_layer;
18035
- }
18036
-
18037
18633
  float llama_rope_freq_scale_train(const struct llama_model * model) {
18038
18634
  return model->hparams.rope_freq_scale_train;
18039
18635
  }
@@ -18133,6 +18729,7 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) {
18133
18729
  bool llama_model_is_recurrent(const struct llama_model * model) {
18134
18730
  switch (model->arch) {
18135
18731
  case LLM_ARCH_MAMBA: return true;
18732
+ case LLM_ARCH_RWKV6: return true;
18136
18733
  default: return false;
18137
18734
  }
18138
18735
  }
@@ -18449,14 +19046,14 @@ struct llama_data_write {
18449
19046
  // TODO: add more model-specific info which should prevent loading the session file if not identical
18450
19047
  }
18451
19048
 
18452
- void write_rng(const std::mt19937 & rng) {
18453
- std::ostringstream rng_ss;
18454
- rng_ss << rng;
19049
+ //void write_rng(const std::mt19937 & rng) {
19050
+ // std::ostringstream rng_ss;
19051
+ // rng_ss << rng;
18455
19052
 
18456
- const std::string & rng_str = rng_ss.str();
19053
+ // const std::string & rng_str = rng_ss.str();
18457
19054
 
18458
- write_string(rng_str);
18459
- }
19055
+ // write_string(rng_str);
19056
+ //}
18460
19057
 
18461
19058
  void write_output_ids(struct llama_context * ctx) {
18462
19059
  llama_output_reorder(ctx);
@@ -18676,17 +19273,17 @@ struct llama_data_read {
18676
19273
  // TODO: add more info which needs to be identical but which is not verified otherwise
18677
19274
  }
18678
19275
 
18679
- void read_rng(std::mt19937 & rng) {
18680
- std::string rng_str;
18681
- read_string(rng_str);
19276
+ //void read_rng(std::mt19937 & rng) {
19277
+ // std::string rng_str;
19278
+ // read_string(rng_str);
18682
19279
 
18683
- std::istringstream rng_ss(rng_str);
18684
- rng_ss >> rng;
19280
+ // std::istringstream rng_ss(rng_str);
19281
+ // rng_ss >> rng;
18685
19282
 
18686
- if (rng_ss.fail()) {
18687
- throw std::runtime_error("failed to load RNG state");
18688
- }
18689
- }
19283
+ // if (rng_ss.fail()) {
19284
+ // throw std::runtime_error("failed to load RNG state");
19285
+ // }
19286
+ //}
18690
19287
 
18691
19288
  void read_output_ids(struct llama_context * ctx) {
18692
19289
  std::vector<int32_t> output_pos;
@@ -19116,8 +19713,6 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da
19116
19713
 
19117
19714
  data_ctx.write_model_info(ctx);
19118
19715
 
19119
- data_ctx.write_rng(ctx->sampling.rng);
19120
-
19121
19716
  // copy outputs
19122
19717
  data_ctx.write_output_ids(ctx);
19123
19718
  data_ctx.write_logits(ctx);
@@ -19155,9 +19750,6 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da
19155
19750
 
19156
19751
  data_ctx.read_model_info(ctx);
19157
19752
 
19158
- // set rng
19159
- data_ctx.read_rng(ctx->sampling.rng);
19160
-
19161
19753
  // set outputs
19162
19754
  data_ctx.read_output_ids(ctx);
19163
19755
  data_ctx.read_logits(ctx);
@@ -19377,16 +19969,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
19377
19969
  }
19378
19970
  }
19379
19971
 
19380
- void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
19972
+ void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
19381
19973
  ctx->cparams.n_threads = n_threads;
19382
19974
  ctx->cparams.n_threads_batch = n_threads_batch;
19383
19975
  }
19384
19976
 
19385
- uint32_t llama_n_threads(struct llama_context * ctx) {
19977
+ int32_t llama_n_threads(struct llama_context * ctx) {
19386
19978
  return ctx->cparams.n_threads;
19387
19979
  }
19388
19980
 
19389
- uint32_t llama_n_threads_batch(struct llama_context * ctx) {
19981
+ int32_t llama_n_threads_batch(struct llama_context * ctx) {
19390
19982
  return ctx->cparams.n_threads_batch;
19391
19983
  }
19392
19984
 
@@ -19500,10 +20092,14 @@ void llama_synchronize(struct llama_context * ctx) {
19500
20092
 
19501
20093
  // add the evaluation to the stats
19502
20094
  if (ctx->n_queued_tokens == 1) {
19503
- ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20095
+ if (!ctx->cparams.no_perf) {
20096
+ ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20097
+ }
19504
20098
  ctx->n_eval++;
19505
20099
  } else if (ctx->n_queued_tokens > 1) {
19506
- ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20100
+ if (!ctx->cparams.no_perf) {
20101
+ ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20102
+ }
19507
20103
  ctx->n_p_eval += ctx->n_queued_tokens;
19508
20104
  }
19509
20105
 
@@ -19560,8 +20156,9 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
19560
20156
  LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
19561
20157
  #ifndef NDEBUG
19562
20158
  LM_GGML_ABORT("fatal error");
19563
- #endif
20159
+ #else
19564
20160
  return nullptr;
20161
+ #endif
19565
20162
  }
19566
20163
  }
19567
20164
 
@@ -19609,8 +20206,9 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
19609
20206
  LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
19610
20207
  #ifndef NDEBUG
19611
20208
  LM_GGML_ABORT("fatal error");
19612
- #endif
20209
+ #else
19613
20210
  return nullptr;
20211
+ #endif
19614
20212
  }
19615
20213
  }
19616
20214
 
@@ -20044,128 +20642,18 @@ int32_t llama_chat_apply_template(
20044
20642
  }
20045
20643
 
20046
20644
  //
20047
- // grammar
20645
+ // sampling
20048
20646
  //
20049
20647
 
20050
- struct llama_grammar * llama_grammar_init(
20051
- const llama_grammar_element ** rules,
20052
- size_t n_rules,
20053
- size_t start_rule_index) {
20054
- return llama_grammar_init_impl(rules, n_rules, start_rule_index);
20055
- }
20056
-
20057
- void llama_grammar_free(struct llama_grammar * grammar) {
20058
- llama_grammar_free_impl(grammar);
20059
- }
20060
-
20061
- struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
20062
- return llama_grammar_copy_impl(grammar);
20063
- }
20064
-
20065
- void llama_grammar_sample(
20066
- const struct llama_grammar * grammar,
20067
- const struct llama_context * ctx,
20068
- llama_token_data_array * candidates) {
20069
- llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates);
20070
- }
20071
-
20072
- void llama_sample_grammar(
20073
- struct llama_context * ctx,
20074
- llama_token_data_array * candidates,
20075
- const struct llama_grammar * grammar) {
20076
- llama_grammar_sample(grammar, ctx, candidates);
20077
- }
20078
-
20079
- void llama_grammar_accept_token(
20080
- struct llama_grammar * grammar,
20081
- struct llama_context * ctx,
20082
- llama_token token) {
20083
- llama_grammar_accept_token_impl(grammar, &ctx->model.vocab, &ctx->sampling, token);
20648
+ // TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp
20649
+ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
20650
+ return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
20084
20651
  }
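Editor's note: the standalone llama_grammar_* entry points removed in this hunk are replaced by a single sampler constructor. A hedged sketch of constraining output with it (the GBNF string is a placeholder; the chaining call assumes the llama_sampler_chain_* API sketched earlier):

    // Grammar-constrained sampling via the new sampler object.
    const char * gbnf = "root ::= \"yes\" | \"no\"";                 // placeholder grammar
    llama_sampler * grammar = llama_sampler_init_grammar(model, gbnf, "root");
    llama_sampler_chain_add(chain, grammar);                         // the chain owns the sampler afterwards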
20085
20652
 
20086
20653
  //
20087
- // sampling
20654
+ // model split
20088
20655
  //
20089
20656
 
20090
- void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
20091
- llama_set_rng_seed_impl(&ctx->sampling, seed);
20092
- }
20093
-
20094
- void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
20095
- llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates);
20096
- }
20097
-
20098
- void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
20099
- llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep);
20100
- }
20101
-
20102
- void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
20103
- llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
20104
- }
20105
-
20106
- void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
20107
- llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
20108
- }
20109
-
20110
- void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates, float xtc_threshold, float xtc_probability, size_t min_keep, std::mt19937 rng){
20111
- llama_sample_xtc_impl(ctx ? &ctx-> sampling: nullptr, candidates, xtc_threshold, xtc_probability, min_keep, rng);
20112
- }
20113
-
20114
- void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
20115
- llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
20116
- }
20117
-
20118
- void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
20119
- llama_sample_typical_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
20120
- }
20121
-
20122
- void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
20123
- llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val);
20124
- }
20125
-
20126
- void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
20127
- llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
20128
- }
20129
-
20130
- void llama_sample_repetition_penalties(
20131
- struct llama_context * ctx,
20132
- llama_token_data_array * candidates,
20133
- const llama_token * last_tokens,
20134
- size_t penalty_last_n,
20135
- float penalty_repeat,
20136
- float penalty_freq,
20137
- float penalty_present) {
20138
- llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
20139
- }
20140
-
20141
- void llama_sample_apply_guidance(
20142
- struct llama_context * ctx,
20143
- float * logits,
20144
- float * logits_guidance,
20145
- float scale) {
20146
- llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale);
20147
- }
20148
-
20149
- llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
20150
- return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu);
20151
- }
20152
-
20153
- llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
20154
- return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu);
20155
- }
20156
-
20157
- llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
20158
- return llama_sample_token_greedy_impl(ctx ? &ctx->sampling : nullptr, candidates);
20159
- }
20160
-
20161
- llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
20162
- return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng);
20163
- }
20164
-
20165
- llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
20166
- return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng);
20167
- }
20168
-
20169
20657
  int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
20170
20658
  static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
20171
20659
  if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
@@ -20190,45 +20678,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
20190
20678
  return 0;
20191
20679
  }
20192
20680
 
20193
- struct llama_timings llama_get_timings(struct llama_context * ctx) {
20194
- struct llama_timings result = {
20195
- /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
20196
- /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
20197
- /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
20198
- /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us,
20199
- /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
20200
- /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
20201
-
20202
- /*.n_sample =*/ std::max(1, ctx->sampling.n_sample),
20203
- /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
20204
- /*.n_eval =*/ std::max(1, ctx->n_eval),
20205
- };
20206
-
20207
- return result;
20208
- }
20209
-
20210
- void llama_print_timings(struct llama_context * ctx) {
20211
- const llama_timings timings = llama_get_timings(ctx);
20212
-
20213
- LLAMA_LOG_INFO("\n");
20214
- LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
20215
- LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20216
- __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
20217
- LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
20218
- __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
20219
- LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20220
- __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
20221
- LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
20222
- }
20223
-
20224
- void llama_reset_timings(struct llama_context * ctx) {
20225
- ctx->t_start_us = lm_ggml_time_us();
20226
- ctx->t_eval_us = ctx->n_eval = 0;
20227
- ctx->t_p_eval_us = ctx->n_p_eval = 0;
20228
-
20229
- ctx->sampling.reset_timings();
20230
- }
20231
-
20232
20681
  const char * llama_print_system_info(void) {
20233
20682
  static std::string s;
20234
20683
 
@@ -20246,6 +20695,7 @@ const char * llama_print_system_info(void) {
20246
20695
  s += "ARM_FMA = " + std::to_string(lm_ggml_cpu_has_arm_fma()) + " | ";
20247
20696
  s += "F16C = " + std::to_string(lm_ggml_cpu_has_f16c()) + " | ";
20248
20697
  s += "FP16_VA = " + std::to_string(lm_ggml_cpu_has_fp16_va()) + " | ";
20698
+ s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v()) + " | ";
20249
20699
  s += "WASM_SIMD = " + std::to_string(lm_ggml_cpu_has_wasm_simd()) + " | ";
20250
20700
  s += "BLAS = " + std::to_string(lm_ggml_cpu_has_blas()) + " | ";
20251
20701
  s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | ";
@@ -20257,7 +20707,43 @@ const char * llama_print_system_info(void) {
20257
20707
  return s.c_str();
20258
20708
  }
20259
20709
 
20260
- void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
20710
+ struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
20711
+ struct llama_perf_context_data data = {};
20712
+
20713
+ if (ctx == nullptr) {
20714
+ return data;
20715
+ }
20716
+
20717
+ data.t_start_ms = 1e-3 * ctx->t_start_us;
20718
+ data.t_load_ms = 1e-3 * ctx->t_load_us;
20719
+ data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
20720
+ data.t_eval_ms = 1e-3 * ctx->t_eval_us;
20721
+ data.n_p_eval = std::max(1, ctx->n_p_eval);
20722
+ data.n_eval = std::max(1, ctx->n_eval);
20723
+
20724
+ return data;
20725
+ }
20726
+
20727
+ void llama_perf_context_print(const struct llama_context * ctx) {
20728
+ const auto data = llama_perf_context(ctx);
20729
+
20730
+ const double t_end_ms = 1e-3 * lm_ggml_time_us();
20731
+
20732
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
20733
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
20734
+ __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
20735
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20736
+ __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
20737
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
20738
+ }
20739
+
20740
+ void llama_perf_context_reset(struct llama_context * ctx) {
20741
+ ctx->t_start_us = lm_ggml_time_us();
20742
+ ctx->t_eval_us = ctx->n_eval = 0;
20743
+ ctx->t_p_eval_us = ctx->n_p_eval = 0;
20744
+ }
20745
+
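Editor's note: llama_perf_context, llama_perf_context_print, and llama_perf_context_reset replace the removed llama_get_timings family. A minimal sketch (remember that cparams.no_perf must be false for the counters to accumulate):

    // Print and reset the per-context performance counters.
    llama_perf_context_print(ctx);
    struct llama_perf_context_data perf = llama_perf_context(ctx);
    fprintf(stderr, "eval: %d tokens in %.2f ms\n", perf.n_eval, perf.t_eval_ms);
    llama_perf_context_reset(ctx);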
20746
+ void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
20261
20747
  fprintf(stream, "\n");
20262
20748
  fprintf(stream, "###########\n");
20263
20749
  fprintf(stream, "# Timings #\n");
@@ -20268,21 +20754,15 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
20268
20754
  1.0e-3 * ctx->t_eval_us / ctx->n_eval);
20269
20755
  fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
20270
20756
  1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
20271
- fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
20272
- 1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample);
20273
20757
  fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
20274
20758
  fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
20275
- fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->sampling.n_sample);
20276
20759
  fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
20277
20760
  fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
20278
20761
  fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
20279
- fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->sampling.t_sample_us);
20280
20762
  fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
20281
20763
  1.0e6 * ctx->n_eval / ctx->t_eval_us);
20282
20764
  fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
20283
20765
  1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
20284
- fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
20285
- 1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us);
20286
20766
  }
20287
20767
 
20288
20768
  // For internal test use
@@ -20334,3 +20814,20 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
20334
20814
  fputs(text, stderr);
20335
20815
  fflush(stderr);
20336
20816
  }
20817
+
20818
+ struct llama_token_timings llama_get_token_timings(const void * v_ctx) {
20819
+ const auto * ctx = (llama_context *) v_ctx;
20820
+ struct llama_token_timings result = {
20821
+ /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
20822
+ /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
20823
+ /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
20824
+ /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
20825
+ /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
20826
+
20827
+ /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
20828
+ /*.n_eval =*/ std::max(1, ctx->n_eval),
20829
+ };
20830
+
20831
+ return result;
20832
+ }
20833
+
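Editor's note: llama_get_token_timings is a fork-specific helper that exposes the same eval counters through an opaque context pointer. A minimal sketch of reading it (field names taken from the struct initializer above):

    // Fetch coarse token timings from an existing context.
    struct llama_token_timings tt = llama_get_token_timings(ctx);   // ctx passed as const void *
    fprintf(stderr, "prompt: %.2f ms (%d tokens), eval: %.2f ms (%d tokens)\n",
            tt.t_p_eval_ms, tt.n_p_eval, tt.t_eval_ms, tt.n_eval);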