cui-llama.rn 1.1.2 → 1.1.4

This diff shows the changes between publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
package/cpp/llama.cpp CHANGED
@@ -1,6 +1,5 @@
1
1
  #include "llama-impl.h"
2
2
  #include "llama-vocab.h"
3
- #include "llama-grammar.h"
4
3
  #include "llama-sampling.h"
5
4
 
6
5
  #include "unicode.h"
@@ -223,6 +222,7 @@ enum llm_arch {
223
222
  LLM_ARCH_JAIS,
224
223
  LLM_ARCH_NEMOTRON,
225
224
  LLM_ARCH_EXAONE,
225
+ LLM_ARCH_RWKV6,
226
226
  LLM_ARCH_UNKNOWN,
227
227
  };
228
228
 
@@ -270,6 +270,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
270
270
  { LLM_ARCH_JAIS, "jais" },
271
271
  { LLM_ARCH_NEMOTRON, "nemotron" },
272
272
  { LLM_ARCH_EXAONE, "exaone" },
273
+ { LLM_ARCH_RWKV6, "rwkv6" },
273
274
  { LLM_ARCH_UNKNOWN, "(unknown)" },
274
275
  };
275
276
 
@@ -306,6 +307,9 @@ enum llm_kv {
306
307
  LLM_KV_DECODER_START_TOKEN_ID,
307
308
  LLM_KV_ATTN_LOGIT_SOFTCAPPING,
308
309
  LLM_KV_FINAL_LOGIT_SOFTCAPPING,
310
+ LLM_KV_RESCALE_EVERY_N_LAYERS,
311
+ LLM_KV_TIME_MIX_EXTRA_DIM,
312
+ LLM_KV_TIME_DECAY_EXTRA_DIM,
309
313
 
310
314
  LLM_KV_ATTENTION_HEAD_COUNT,
311
315
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -341,6 +345,8 @@ enum llm_kv {
341
345
  LLM_KV_SSM_TIME_STEP_RANK,
342
346
  LLM_KV_SSM_DT_B_C_RMS,
343
347
 
348
+ LLM_KV_WKV_HEAD_SIZE,
349
+
344
350
  LLM_KV_TOKENIZER_MODEL,
345
351
  LLM_KV_TOKENIZER_PRE,
346
352
  LLM_KV_TOKENIZER_LIST,
@@ -400,11 +406,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
400
406
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
401
407
  { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
402
408
  { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
403
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
409
+ { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
404
410
  { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
405
411
  { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
406
412
  { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
407
413
  { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
414
+ { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
415
+ { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
416
+ { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
408
417
 
409
418
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
410
419
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -440,6 +449,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
440
449
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
441
450
  { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
442
451
 
452
+ { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
453
+
443
454
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
444
455
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
445
456
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -529,6 +540,29 @@ enum llm_tensor {
529
540
  LLM_TENSOR_SSM_A,
530
541
  LLM_TENSOR_SSM_D,
531
542
  LLM_TENSOR_SSM_OUT,
543
+ LLM_TENSOR_TIME_MIX_W1,
544
+ LLM_TENSOR_TIME_MIX_W2,
545
+ LLM_TENSOR_TIME_MIX_LERP_X,
546
+ LLM_TENSOR_TIME_MIX_LERP_W,
547
+ LLM_TENSOR_TIME_MIX_LERP_K,
548
+ LLM_TENSOR_TIME_MIX_LERP_V,
549
+ LLM_TENSOR_TIME_MIX_LERP_R,
550
+ LLM_TENSOR_TIME_MIX_LERP_G,
551
+ LLM_TENSOR_TIME_MIX_FIRST,
552
+ LLM_TENSOR_TIME_MIX_DECAY,
553
+ LLM_TENSOR_TIME_MIX_DECAY_W1,
554
+ LLM_TENSOR_TIME_MIX_DECAY_W2,
555
+ LLM_TENSOR_TIME_MIX_KEY,
556
+ LLM_TENSOR_TIME_MIX_VALUE,
557
+ LLM_TENSOR_TIME_MIX_RECEPTANCE,
558
+ LLM_TENSOR_TIME_MIX_GATE,
559
+ LLM_TENSOR_TIME_MIX_LN,
560
+ LLM_TENSOR_TIME_MIX_OUTPUT,
561
+ LLM_TENSOR_CHANNEL_MIX_LERP_K,
562
+ LLM_TENSOR_CHANNEL_MIX_LERP_R,
563
+ LLM_TENSOR_CHANNEL_MIX_KEY,
564
+ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
565
+ LLM_TENSOR_CHANNEL_MIX_VALUE,
532
566
  LLM_TENSOR_ATTN_Q_A,
533
567
  LLM_TENSOR_ATTN_Q_B,
534
568
  LLM_TENSOR_ATTN_KV_A_MQA,
@@ -1350,6 +1384,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1350
1384
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1351
1385
  },
1352
1386
  },
1387
+ {
1388
+ LLM_ARCH_RWKV6,
1389
+ {
1390
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1391
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
1392
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1393
+ { LLM_TENSOR_OUTPUT, "output" },
1394
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1395
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
1396
+ { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
1397
+ { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
1398
+ { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
1399
+ { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" },
1400
+ { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" },
1401
+ { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
1402
+ { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
1403
+ { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
1404
+ { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
1405
+ { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
1406
+ { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
1407
+ { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
1408
+ { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
1409
+ { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
1410
+ { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
1411
+ { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
1412
+ { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
1413
+ { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
1414
+ { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
1415
+ { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" },
1416
+ { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
1417
+ { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
1418
+ { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
1419
+ },
1420
+ },
1353
1421
  {
1354
1422
  LLM_ARCH_UNKNOWN,
1355
1423
  {
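Note on the hunks above: RWKV6 is registered the same way as every other architecture, with an enum value, its "rwkv6" name string, new per-architecture GGUF metadata keys (e.g. "%s.wkv.head_size"), and a table of per-layer tensor-name templates. As a minimal illustration of how such a template expands (the real lookup goes through llama.cpp's internal tn()/LLM_TN helpers, which are not shown in this diff), a hypothetical helper:

    #include <cstdio>
    #include <string>

    // Hypothetical stand-in for the tensor-name lookup: expand a per-layer
    // template such as "blk.%d.time_mix_w1" for block index `il`, then append
    // the usual ".weight" / ".bias" suffix.
    static std::string expand_tensor_name(const char * tmpl, int il, const char * suffix) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), tmpl, il);
        return std::string(buf) + "." + suffix;
    }

    // expand_tensor_name("blk.%d.time_mix_w1", 4, "weight") -> "blk.4.time_mix_w1.weight"
    // The metadata keys expand analogously: "%s.wkv.head_size" -> "rwkv6.wkv.head_size".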
@@ -2162,6 +2230,7 @@ enum e_model {
2162
2230
  MODEL_1B,
2163
2231
  MODEL_1_3B,
2164
2232
  MODEL_1_4B,
2233
+ MODEL_1_6B,
2165
2234
  MODEL_2B,
2166
2235
  MODEL_2_8B,
2167
2236
  MODEL_3B,
@@ -2239,6 +2308,12 @@ struct llama_hparams {
2239
2308
  float f_attn_logit_softcapping = 50.0f;
2240
2309
  float f_final_logit_softcapping = 30.0f;
2241
2310
 
2311
+ // for RWKV
2312
+ uint32_t rescale_every_n_layers = 0;
2313
+ uint32_t time_mix_extra_dim = 0;
2314
+ uint32_t time_decay_extra_dim = 0;
2315
+ uint32_t wkv_head_size = 0;
2316
+
2242
2317
  float rope_attn_factor = 1.0f;
2243
2318
  float rope_freq_base_train;
2244
2319
  float rope_freq_scale_train;
@@ -2302,6 +2377,11 @@ struct llama_hparams {
2302
2377
  if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
2303
2378
  if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
2304
2379
 
2380
+ if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
2381
+ if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
2382
+ if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
2383
+ if (this->wkv_head_size != other.wkv_head_size) return true;
2384
+
2305
2385
  if (this->dec_start_token_id != other.dec_start_token_id) return true;
2306
2386
 
2307
2387
  const float EPSILON = 1e-9f;
@@ -2365,15 +2445,25 @@ struct llama_hparams {
2365
2445
  }
2366
2446
 
2367
2447
  uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
2368
- // corresponds to Mamba's conv_states size
2369
- // TODO: maybe support other convolution strides than 1
2370
- // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
2371
- return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
2448
+ // corresponds to Mamba's conv_states size or RWKV's token_shift states size
2449
+ if (wkv_head_size != 0) {
2450
+ // for RWKV models
2451
+ return 2 * n_embd;
2452
+ } else {
2453
+ // TODO: maybe support other convolution strides than 1
2454
+ // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
2455
+ return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
2456
+ }
2372
2457
  }
2373
2458
 
2374
2459
  uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
2375
- // corresponds to Mamba's ssm_states size
2376
- return ssm_d_state * ssm_d_inner;
2460
+ if (wkv_head_size != 0) {
2461
+ // corresponds to RWKV's wkv_states size
2462
+ return n_embd * wkv_head_size;
2463
+ } else {
2464
+ // corresponds to Mamba's ssm_states size
2465
+ return ssm_d_state * ssm_d_inner;
2466
+ }
2377
2467
  }
2378
2468
  };
2379
2469
 
@@ -2384,8 +2474,8 @@ struct llama_cparams {
2384
2474
  uint32_t n_batch;
2385
2475
  uint32_t n_ubatch;
2386
2476
  uint32_t n_seq_max;
2387
- uint32_t n_threads; // number of threads to use for generation
2388
- uint32_t n_threads_batch; // number of threads to use for batch processing
2477
+ int n_threads; // number of threads to use for generation
2478
+ int n_threads_batch; // number of threads to use for batch processing
2389
2479
 
2390
2480
  float rope_freq_base;
2391
2481
  float rope_freq_scale;
@@ -2512,6 +2602,36 @@ struct llama_layer {
2512
2602
  struct lm_ggml_tensor * ssm_conv1d_b;
2513
2603
  struct lm_ggml_tensor * ssm_dt_b;
2514
2604
 
2605
+ // rwkv
2606
+ struct lm_ggml_tensor * time_mix_w1;
2607
+ struct lm_ggml_tensor * time_mix_w2;
2608
+ struct lm_ggml_tensor * time_mix_lerp_x;
2609
+ struct lm_ggml_tensor * time_mix_lerp_w;
2610
+ struct lm_ggml_tensor * time_mix_lerp_k;
2611
+ struct lm_ggml_tensor * time_mix_lerp_v;
2612
+ struct lm_ggml_tensor * time_mix_lerp_r;
2613
+ struct lm_ggml_tensor * time_mix_lerp_g;
2614
+
2615
+ struct lm_ggml_tensor * time_mix_first;
2616
+ struct lm_ggml_tensor * time_mix_decay;
2617
+ struct lm_ggml_tensor * time_mix_decay_w1;
2618
+ struct lm_ggml_tensor * time_mix_decay_w2;
2619
+ struct lm_ggml_tensor * time_mix_key;
2620
+ struct lm_ggml_tensor * time_mix_value;
2621
+ struct lm_ggml_tensor * time_mix_receptance;
2622
+ struct lm_ggml_tensor * time_mix_gate;
2623
+
2624
+ struct lm_ggml_tensor * time_mix_ln;
2625
+ struct lm_ggml_tensor * time_mix_ln_b;
2626
+ struct lm_ggml_tensor * time_mix_output;
2627
+
2628
+ struct lm_ggml_tensor * channel_mix_lerp_k;
2629
+ struct lm_ggml_tensor * channel_mix_lerp_r;
2630
+
2631
+ struct lm_ggml_tensor * channel_mix_key;
2632
+ struct lm_ggml_tensor * channel_mix_receptance;
2633
+ struct lm_ggml_tensor * channel_mix_value;
2634
+
2515
2635
  // long rope factors
2516
2636
  struct lm_ggml_tensor * rope_long = nullptr;
2517
2637
  struct lm_ggml_tensor * rope_short = nullptr;
@@ -3069,7 +3189,6 @@ struct llama_sbatch {
3069
3189
  struct llama_context {
3070
3190
  llama_context(const llama_model & model)
3071
3191
  : model(model)
3072
- , sampling(llama_n_vocab(&model))
3073
3192
  , t_start_us(model.t_start_us)
3074
3193
  , t_load_us(model.t_load_us) {}
3075
3194
 
@@ -3086,7 +3205,6 @@ struct llama_context {
3086
3205
  const struct llama_model & model;
3087
3206
 
3088
3207
  struct llama_cparams cparams;
3089
- struct llama_sampling sampling;
3090
3208
  struct llama_sbatch sbatch;
3091
3209
  struct llama_kv_cache kv_self;
3092
3210
  struct llama_control_vector cvec;
@@ -3102,18 +3220,21 @@ struct llama_context {
3102
3220
  #endif
3103
3221
  lm_ggml_backend_t backend_cpu = nullptr;
3104
3222
 
3223
+ lm_ggml_threadpool_t threadpool = nullptr;
3224
+ lm_ggml_threadpool_t threadpool_batch = nullptr;
3225
+
3105
3226
  bool has_evaluated_once = false;
3106
3227
 
3107
- int64_t t_start_us;
3108
- int64_t t_load_us;
3109
- int64_t t_p_eval_us = 0;
3110
- int64_t t_eval_us = 0;
3228
+ mutable int64_t t_start_us;
3229
+ mutable int64_t t_load_us;
3230
+ mutable int64_t t_p_eval_us = 0;
3231
+ mutable int64_t t_eval_us = 0;
3111
3232
 
3112
- int64_t t_compute_start_us = 0;
3113
- int64_t n_queued_tokens = 0;
3233
+ mutable int64_t t_compute_start_us = 0;
3234
+ mutable int64_t n_queued_tokens = 0;
3114
3235
 
3115
- int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
3116
- int32_t n_eval = 0; // number of eval calls
3236
+ mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
3237
+ mutable int32_t n_eval = 0; // number of eval calls
3117
3238
 
3118
3239
  // host buffer for the model output (logits and embeddings)
3119
3240
  lm_ggml_backend_buffer_t buf_output = nullptr;
@@ -3233,29 +3354,33 @@ static size_t llama_get_device_count(const llama_model & model) {
3233
3354
  static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
3234
3355
  lm_ggml_backend_buffer_type_t buft = nullptr;
3235
3356
 
3236
- #if defined(LM_GGML_USE_RPC)
3237
- int dev_count = (int)llama_get_device_count(model);
3357
+ #ifdef LM_GGML_USE_RPC
3238
3358
  int rpc_count = (int)model.rpc_servers.size();
3239
- if (gpu >= dev_count - rpc_count) {
3240
- const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
3359
+ #else
3360
+ int rpc_count = 0;
3361
+ #endif
3362
+ int local_gpu = gpu - rpc_count;
3363
+ #if defined(LM_GGML_USE_RPC)
3364
+ if (gpu < rpc_count) {
3365
+ const char * endpoint = model.rpc_servers[gpu].c_str();
3241
3366
  return lm_ggml_backend_rpc_buffer_type(endpoint);
3242
3367
  }
3243
3368
  #endif
3244
3369
  #if defined(LM_GGML_USE_METAL)
3245
3370
  buft = lm_ggml_backend_metal_buffer_type();
3246
3371
  #elif defined(LM_GGML_USE_CUDA)
3247
- buft = lm_ggml_backend_cuda_buffer_type(gpu);
3372
+ buft = lm_ggml_backend_cuda_buffer_type(local_gpu);
3248
3373
  #elif defined(LM_GGML_USE_VULKAN)
3249
- buft = lm_ggml_backend_vk_buffer_type(gpu);
3374
+ buft = lm_ggml_backend_vk_buffer_type(local_gpu);
3250
3375
  #elif defined(LM_GGML_USE_SYCL)
3251
- buft = lm_ggml_backend_sycl_buffer_type(gpu);
3376
+ buft = lm_ggml_backend_sycl_buffer_type(local_gpu);
3252
3377
  #elif defined(LM_GGML_USE_KOMPUTE)
3253
- buft = lm_ggml_backend_kompute_buffer_type(gpu);
3378
+ buft = lm_ggml_backend_kompute_buffer_type(local_gpu);
3254
3379
  if (buft == nullptr) {
3255
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
3380
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
3256
3381
  }
3257
3382
  #elif defined(LM_GGML_USE_CANN)
3258
- buft = lm_ggml_backend_cann_buffer_type(gpu);
3383
+ buft = lm_ggml_backend_cann_buffer_type(local_gpu);
3259
3384
  #endif
3260
3385
 
3261
3386
  if (buft == nullptr) {
@@ -3263,7 +3388,7 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const lla
3263
3388
  }
3264
3389
  return buft;
3265
3390
  LM_GGML_UNUSED(model);
3266
- LM_GGML_UNUSED(gpu);
3391
+ LM_GGML_UNUSED(local_gpu);
3267
3392
  }
3268
3393
 
3269
3394
  static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
@@ -3290,13 +3415,17 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
3290
3415
  }
3291
3416
 
3292
3417
  static size_t llama_get_device_memory(const llama_model & model, int device) {
3293
- #if defined(LM_GGML_USE_RPC)
3294
- int dev_count = (int)llama_get_device_count(model);
3418
+ #ifdef LM_GGML_USE_RPC
3295
3419
  int rpc_count = (int)model.rpc_servers.size();
3296
- if (device >= dev_count - rpc_count) {
3420
+ #else
3421
+ int rpc_count = 0;
3422
+ #endif
3423
+ int local_device = device - rpc_count;
3424
+ #if defined(LM_GGML_USE_RPC)
3425
+ if (device < rpc_count) {
3297
3426
  size_t total;
3298
3427
  size_t free;
3299
- const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
3428
+ const char * endpoint = model.rpc_servers[device].c_str();
3300
3429
  lm_ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
3301
3430
  return free;
3302
3431
  }
@@ -3304,28 +3433,28 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
3304
3433
  #if defined(LM_GGML_USE_CUDA)
3305
3434
  size_t total;
3306
3435
  size_t free;
3307
- lm_ggml_backend_cuda_get_device_memory(device, &free, &total);
3436
+ lm_ggml_backend_cuda_get_device_memory(local_device, &free, &total);
3308
3437
  return free;
3309
3438
  #elif defined(LM_GGML_USE_SYCL)
3310
3439
  size_t total;
3311
3440
  size_t free;
3312
- lm_ggml_backend_sycl_get_device_memory(device, &free, &total);
3441
+ lm_ggml_backend_sycl_get_device_memory(local_device, &free, &total);
3313
3442
  return free;
3314
3443
  #elif defined(LM_GGML_USE_VULKAN)
3315
3444
  size_t total;
3316
3445
  size_t free;
3317
- lm_ggml_backend_vk_get_device_memory(device, &free, &total);
3446
+ lm_ggml_backend_vk_get_device_memory(local_device, &free, &total);
3318
3447
  return free;
3319
3448
  #elif defined(LM_GGML_USE_CANN)
3320
3449
  size_t total;
3321
3450
  size_t free;
3322
- lm_ggml_backend_cann_get_device_memory(device, &free, &total);
3451
+ lm_ggml_backend_cann_get_device_memory(local_device, &free, &total);
3323
3452
  return free;
3324
3453
  #else
3325
3454
  return 1;
3326
3455
  #endif
3327
3456
  LM_GGML_UNUSED(model);
3328
- LM_GGML_UNUSED(device);
3457
+ LM_GGML_UNUSED(local_device);
3329
3458
  }
3330
3459
 
3331
3460
  //
@@ -3434,7 +3563,7 @@ static bool llama_kv_cache_find_slot(
3434
3563
  const uint32_t n_seq_tokens = batch.n_seq_tokens;
3435
3564
 
3436
3565
  if (cache.recurrent) {
3437
- // For recurrent state architectures (like Mamba),
3566
+ // For recurrent state architectures (like Mamba or RWKV),
3438
3567
  // each cache cell can store the state for a whole sequence.
3439
3568
  // A slot should be always be contiguous.
3440
3569
 
@@ -3683,7 +3812,7 @@ static bool llama_kv_cache_seq_rm(
3683
3812
  if (p0 < 0) p0 = 0;
3684
3813
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
3685
3814
 
3686
- // models like Mamba can't have a state partially erased
3815
+ // models like Mamba or RWKV can't have a state partially erased
3687
3816
  if (cache.recurrent) {
3688
3817
  if (seq_id >= (int64_t) cache.size) {
3689
3818
  // could be fatal
@@ -3697,7 +3826,8 @@ static bool llama_kv_cache_seq_rm(
3697
3826
  if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
3698
3827
  return false;
3699
3828
  }
3700
- if (p0 <= cell.pos && p1 < cell.pos) {
3829
+ // invalidate tails which will be cleared
3830
+ if (p0 <= cell.pos && cell.pos < p1) {
3701
3831
  tail_id = -1;
3702
3832
  }
3703
3833
  }
@@ -3819,7 +3949,7 @@ static void llama_kv_cache_seq_add(
3819
3949
  if (p0 == p1) return;
3820
3950
 
3821
3951
  if (cache.recurrent) {
3822
- // for Mamba-like models, only the pos needs to be shifted
3952
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
3823
3953
  if (0 <= seq_id && seq_id < (int64_t) cache.size) {
3824
3954
  const int32_t tail_id = cache.cells[seq_id].tail;
3825
3955
  if (tail_id >= 0) {
@@ -3868,7 +3998,7 @@ static void llama_kv_cache_seq_div(
3868
3998
  if (p0 == p1) return;
3869
3999
 
3870
4000
  if (cache.recurrent) {
3871
- // for Mamba-like models, only the pos needs to be changed
4001
+ // for Mamba-like or RWKV models, only the pos needs to be changed
3872
4002
  if (0 <= seq_id && seq_id < (int64_t) cache.size) {
3873
4003
  const int32_t tail_id = cache.cells[seq_id].tail;
3874
4004
  if (tail_id >= 0) {
@@ -4322,6 +4452,8 @@ struct llama_model_loader {
4322
4452
  case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
4323
4453
  case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
4324
4454
  case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
4455
+ case LM_GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
4456
+ case LM_GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
4325
4457
  case LM_GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
4326
4458
  case LM_GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
4327
4459
  case LM_GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
@@ -5015,6 +5147,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
5015
5147
  case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
5016
5148
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
5017
5149
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
5150
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
5151
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
5018
5152
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
5019
5153
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
5020
5154
  case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
@@ -5059,6 +5193,7 @@ static const char * llama_model_type_name(e_model type) {
5059
5193
  case MODEL_1B: return "1B";
5060
5194
  case MODEL_1_3B: return "1.3B";
5061
5195
  case MODEL_1_4B: return "1.4B";
5196
+ case MODEL_1_6B: return "1.6B";
5062
5197
  case MODEL_2B: return "2B";
5063
5198
  case MODEL_2_8B: return "2.8B";
5064
5199
  case MODEL_3B: return "3B";
@@ -5105,6 +5240,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
5105
5240
  case LLAMA_VOCAB_TYPE_BPE: return "BPE";
5106
5241
  case LLAMA_VOCAB_TYPE_WPM: return "WPM";
5107
5242
  case LLAMA_VOCAB_TYPE_UGM: return "UGM";
5243
+ case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
5108
5244
  default: return "unknown";
5109
5245
  }
5110
5246
  }
@@ -5801,6 +5937,26 @@ static void llm_load_hparams(
5801
5937
  default: model.type = e_model::MODEL_UNKNOWN;
5802
5938
  }
5803
5939
  } break;
5940
+ case LLM_ARCH_RWKV6:
5941
+ {
5942
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
5943
+ ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
5944
+ ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
5945
+ ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
5946
+ ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
5947
+
5948
+ switch (hparams.n_layer) {
5949
+ case 24: model.type = e_model::MODEL_1_6B; break;
5950
+ case 32:
5951
+ switch (hparams.n_embd) {
5952
+ case 2560: model.type = e_model::MODEL_3B; break;
5953
+ case 4096: model.type = e_model::MODEL_7B; break;
5954
+ default: model.type = e_model::MODEL_UNKNOWN;
5955
+ } break;
5956
+ case 61: model.type = e_model::MODEL_14B; break;
5957
+ default: model.type = e_model::MODEL_UNKNOWN;
5958
+ }
5959
+ } break;
5804
5960
  default: (void)0;
5805
5961
  }
5806
5962
 
@@ -5930,6 +6086,15 @@ static void llm_load_vocab(
5930
6086
  }
5931
6087
  #endif
5932
6088
  }
6089
+ } else if (tokenizer_model == "rwkv") {
6090
+ vocab.type = LLAMA_VOCAB_TYPE_RWKV;
6091
+
6092
+ // default special tokens
6093
+ vocab.special_bos_id = -1;
6094
+ vocab.special_eos_id = -1;
6095
+ vocab.special_unk_id = -1;
6096
+ vocab.special_sep_id = -1;
6097
+ vocab.special_pad_id = -1;
5933
6098
  } else {
5934
6099
  throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
5935
6100
  }
@@ -6061,6 +6226,12 @@ static void llm_load_vocab(
6061
6226
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
6062
6227
  vocab.tokenizer_add_bos = false;
6063
6228
  vocab.tokenizer_add_eos = true;
6229
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
6230
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
6231
+ vocab.tokenizer_add_space_prefix = false;
6232
+ vocab.tokenizer_clean_spaces = false;
6233
+ vocab.tokenizer_add_bos = false;
6234
+ vocab.tokenizer_add_eos = false;
6064
6235
  } else {
6065
6236
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
6066
6237
  }
@@ -6088,6 +6259,7 @@ static void llm_load_vocab(
6088
6259
 
6089
6260
  const uint32_t n_vocab = lm_gguf_get_arr_n(ctx, token_idx);
6090
6261
 
6262
+ vocab.n_vocab = n_vocab;
6091
6263
  vocab.id_to_token.resize(n_vocab);
6092
6264
 
6093
6265
  for (uint32_t i = 0; i < n_vocab; i++) {
@@ -6165,6 +6337,10 @@ static void llm_load_vocab(
6165
6337
  }
6166
6338
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
6167
6339
  vocab.linefeed_id = vocab.special_pad_id;
6340
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
6341
+ const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
6342
+ LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
6343
+ vocab.linefeed_id = ids[0];
6168
6344
  } else {
6169
6345
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
6170
6346
  LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -6234,6 +6410,11 @@ static void llm_load_vocab(
6234
6410
  )
6235
6411
  ) {
6236
6412
  vocab.special_eot_id = t.second;
6413
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6414
+ LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6415
+ __func__, t.first.c_str());
6416
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
6417
+ }
6237
6418
  break;
6238
6419
  }
6239
6420
  }
@@ -6247,6 +6428,11 @@ static void llm_load_vocab(
6247
6428
  const auto & t = vocab.token_to_id.find("<|eom_id|>");
6248
6429
  if (t != vocab.token_to_id.end()) {
6249
6430
  vocab.special_eom_id = t->second;
6431
+ if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6432
+ LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6433
+ __func__, t->first.c_str());
6434
+ vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
6435
+ }
6250
6436
  }
6251
6437
  }
6252
6438
  }
@@ -7955,23 +8141,23 @@ static bool llm_load_tensors(
7955
8141
  layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
7956
8142
 
7957
8143
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
7958
- layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
8144
+ layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7959
8145
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
7960
- layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
8146
+ layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7961
8147
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
7962
- layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
8148
+ layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7963
8149
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
7964
- layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
8150
+ layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7965
8151
 
7966
8152
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
7967
8153
  layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
7968
8154
 
7969
8155
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
7970
- layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
8156
+ layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7971
8157
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
7972
- layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
8158
+ layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7973
8159
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
7974
- layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
8160
+ layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7975
8161
  }
7976
8162
  } break;
7977
8163
  case LLM_ARCH_T5:
@@ -8211,6 +8397,68 @@ static bool llm_load_tensors(
8211
8397
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
8212
8398
  }
8213
8399
  } break;
8400
+ case LLM_ARCH_RWKV6:
8401
+ {
8402
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
8403
+
8404
+ // Block 0, LN0
8405
+ model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
8406
+ model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
8407
+
8408
+ // output
8409
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
8410
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
8411
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
8412
+
8413
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
8414
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
8415
+ const int head_size = hparams.wkv_head_size;
8416
+ const int attn_hidden_size = n_embd;
8417
+ const int ffn_size = hparams.n_ff_arr[0];
8418
+
8419
+ for (int i = 0; i < n_layer; ++i) {
8420
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
8421
+
8422
+ auto & layer = model.layers[i];
8423
+
8424
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
8425
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
8426
+
8427
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
8428
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
8429
+
8430
+ layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5});
8431
+ layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5});
8432
+
8433
+ layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1});
8434
+ layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1});
8435
+ layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
8436
+ layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1});
8437
+ layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
8438
+ layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});
8439
+
8440
+ layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
8441
+ layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
8442
+ layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
8443
+ layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size});
8444
+ layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd});
8445
+ layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd});
8446
+ layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
8447
+ layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd});
8448
+
8449
+ layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
8450
+ layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
8451
+ layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
8452
+
8453
+ layer.channel_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
8454
+ layer.channel_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
8455
+
8456
+ layer.channel_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size});
8457
+ layer.channel_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd});
8458
+ layer.channel_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd});
8459
+ }
8460
+
8461
+ } break;
8214
8462
  default:
8215
8463
  throw std::runtime_error("unknown architecture");
8216
8464
  }
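The shapes in the RWKV6 branch above follow the RWKV6 formulation: time_mix_w1 is {n_embd, 5 * time_mix_extra_dim} because a single matmul produces the five low-rank corrections (w, k, v, r, g) that the graph later splits with views, and time_mix_first stores one {head_size} row per head, i.e. {head_size, n_embd / head_size}. As a worked example with dimensions taken from the RWKV-6 1.6B reference configuration rather than from this diff (n_embd = 2048, wkv_head_size = 64, time_mix_extra_dim = 32, time_decay_extra_dim = 64): time_mix_w1 is 2048 x 160, time_mix_w2 is 32 x 2048 x 5, time_mix_first is 64 x 32, and time_mix_decay_w1 / time_mix_decay_w2 are 2048 x 64 and 64 x 2048.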
@@ -8495,8 +8743,7 @@ static void llm_build_kv_store(
8495
8743
 
8496
8744
  LM_GGML_ASSERT(kv.size == n_ctx);
8497
8745
 
8498
- struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
8499
- (lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
8746
+ struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head);
8500
8747
  cb(k_cache_view, "k_cache_view", il);
8501
8748
 
8502
8749
  // note: storing RoPE-ed version of K in the KV cache
@@ -8507,8 +8754,7 @@ static void llm_build_kv_store(
8507
8754
  struct lm_ggml_tensor * v_cache_view = nullptr;
8508
8755
 
8509
8756
  if (cparams.flash_attn) {
8510
- v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
8511
- (kv_head)*lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
8757
+ v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head);
8512
8758
  } else {
8513
8759
  // note: the V cache is transposed when not using flash attention
8514
8760
  v_cache_view = lm_ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
@@ -8995,8 +9241,7 @@ static struct lm_ggml_tensor * llm_build_kv(
8995
9241
 
8996
9242
  struct lm_ggml_tensor * cur;
8997
9243
 
8998
- cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b,
8999
- q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
9244
+ cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
9000
9245
  cb(cur, "kqv_out", il);
9001
9246
 
9002
9247
  return cur;
@@ -9170,6 +9415,171 @@ static struct lm_ggml_tensor * llm_build_mamba(
9170
9415
  return cur;
9171
9416
  }
9172
9417
 
9418
+ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
9419
+ struct llama_context & lctx,
9420
+ struct lm_ggml_context * ctx,
9421
+ const struct llama_layer * layer,
9422
+ struct lm_ggml_tensor * cur,
9423
+ struct lm_ggml_tensor * x_prev,
9424
+ struct lm_ggml_tensor ** wkv_state) {
9425
+ size_t n_embed = cur->ne[0];
9426
+ size_t n_seq_tokens = cur->ne[1];
9427
+ size_t n_seqs = cur->ne[2];
9428
+
9429
+ size_t head_size = layer->time_mix_first->ne[0];
9430
+ size_t head_count = layer->time_mix_first->ne[1];
9431
+
9432
+ size_t n_tokens = n_seqs * n_seq_tokens;
9433
+
9434
+ struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
9435
+
9436
+ sx = lm_ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
9437
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
9438
+
9439
+ struct lm_ggml_tensor * xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
9440
+
9441
+ xxx = lm_ggml_reshape_4d(
9442
+ ctx,
9443
+ lm_ggml_tanh(
9444
+ ctx,
9445
+ lm_ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
9446
+ ),
9447
+ layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
9448
+ );
9449
+
9450
+ xxx = lm_ggml_cont(ctx, lm_ggml_permute(ctx, xxx, 0, 1, 3, 2));
9451
+
9452
+ xxx = lm_ggml_mul_mat(
9453
+ ctx,
9454
+ lm_ggml_reshape_4d(
9455
+ ctx,
9456
+ layer->time_mix_w2,
9457
+ layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5
9458
+ ),
9459
+ xxx
9460
+ );
9461
+
9462
+ struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
9463
+ struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
9464
+ struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
9465
+ struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
9466
+ struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
9467
+
9468
+ struct lm_ggml_tensor * xw = lm_ggml_add(
9469
+ ctx,
9470
+ lm_ggml_mul(
9471
+ ctx,
9472
+ lm_ggml_add(ctx, mw, layer->time_mix_lerp_w),
9473
+ sx
9474
+ ),
9475
+ cur
9476
+ );
9477
+
9478
+ struct lm_ggml_tensor * xk = lm_ggml_add(
9479
+ ctx,
9480
+ lm_ggml_mul(
9481
+ ctx,
9482
+ lm_ggml_add(ctx, mk, layer->time_mix_lerp_k),
9483
+ sx
9484
+ ),
9485
+ cur
9486
+ );
9487
+
9488
+ struct lm_ggml_tensor * xv = lm_ggml_add(
9489
+ ctx,
9490
+ lm_ggml_mul(
9491
+ ctx,
9492
+ lm_ggml_add(ctx, mv, layer->time_mix_lerp_v),
9493
+ sx
9494
+ ),
9495
+ cur
9496
+ );
9497
+
9498
+ struct lm_ggml_tensor * xr = lm_ggml_add(
9499
+ ctx,
9500
+ lm_ggml_mul(
9501
+ ctx,
9502
+ lm_ggml_add(ctx, mr, layer->time_mix_lerp_r),
9503
+ sx
9504
+ ),
9505
+ cur
9506
+ );
9507
+
9508
+ struct lm_ggml_tensor * xg = lm_ggml_add(
9509
+ ctx,
9510
+ lm_ggml_mul(
9511
+ ctx,
9512
+ lm_ggml_add(ctx, mg, layer->time_mix_lerp_g),
9513
+ sx
9514
+ ),
9515
+ cur
9516
+ );
9517
+
9518
+ struct lm_ggml_tensor * r = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
9519
+ struct lm_ggml_tensor * k = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
9520
+ struct lm_ggml_tensor * v = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
9521
+ struct lm_ggml_tensor * g = lm_ggml_silu(
9522
+ ctx,
9523
+ llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
9524
+ );
9525
+
9526
+ struct lm_ggml_tensor * w = lm_ggml_mul_mat(
9527
+ ctx,
9528
+ layer->time_mix_decay_w2,
9529
+ lm_ggml_tanh(
9530
+ ctx,
9531
+ lm_ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
9532
+ )
9533
+ );
9534
+
9535
+ w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
9536
+ w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
9537
+ w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
9538
+
9539
+ k = lm_ggml_transpose(ctx, k);
9540
+ v = lm_ggml_transpose(ctx, v);
9541
+ r = lm_ggml_transpose(ctx, r);
9542
+
9543
+ struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
9544
+ cur = lm_ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
9545
+ *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
9546
+
9547
+ // group norm with head_count groups
9548
+ cur = lm_ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
9549
+ cur = lm_ggml_norm(ctx, cur, 64e-5f);
9550
+
9551
+ // Convert back to regular vectors.
9552
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
9553
+ cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
9554
+
9555
+ cur = lm_ggml_mul(ctx, cur, g);
9556
+ cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
9557
+
9558
+ return lm_ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
9559
+ }
9560
+
9561
+ static struct lm_ggml_tensor * llm_build_rwkv6_channel_mix(
9562
+ struct llama_context & lctx,
9563
+ struct lm_ggml_context * ctx,
9564
+ const struct llama_layer * layer,
9565
+ struct lm_ggml_tensor * cur,
9566
+ struct lm_ggml_tensor * x_prev) {
9567
+ struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
9568
+ struct lm_ggml_tensor * xk = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
9569
+ struct lm_ggml_tensor * xr = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
9570
+
9571
+ struct lm_ggml_tensor * r = lm_ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
9572
+ struct lm_ggml_tensor * k = lm_ggml_sqr(
9573
+ ctx,
9574
+ lm_ggml_relu(
9575
+ ctx,
9576
+ llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
9577
+ )
9578
+ );
9579
+
9580
+ return lm_ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
9581
+ }
9582
+
9173
9583
  struct llm_build_context {
9174
9584
  const llama_model & model;
9175
9585
  llama_context & lctx;
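For orientation, llm_build_rwkv6_time_mix above ends by handing r, k, v, the per-head bonus (time_mix_first, usually written u) and the per-token decay w, already mapped to exp(-exp(w)) and therefore in (0, 1), to the fused lm_ggml_rwkv_wkv op together with the running per-sequence state. A scalar sketch of the recurrence that op computes for a single head, written from the published RWKV-6 formulation rather than from the ggml kernel itself:

    // One wkv step for one head; `state` is a head_size x head_size matrix
    // carried across tokens for the sequence and updated in place.
    void rwkv6_wkv_step(int head_size,
                        const float * r, const float * k, const float * v,  // per-token, this head
                        const float * u, const float * w,                   // bonus and decayed decay
                        float * state, float * out) {
        for (int j = 0; j < head_size; ++j) {
            out[j] = 0.0f;
        }
        for (int i = 0; i < head_size; ++i) {
            for (int j = 0; j < head_size; ++j) {
                const float kv   = k[i] * v[j];
                const float prev = state[i*head_size + j];
                out[j] += r[i] * (u[i] * kv + prev);        // contribution to this token's output
                state[i*head_size + j] = prev * w[i] + kv;  // decay old state, add new outer product
            }
        }
    }

The channel-mix half is simpler: a squared-ReLU feed-forward gated by sigmoid(receptance), exactly as spelled out in llm_build_rwkv6_channel_mix above.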
@@ -13790,7 +14200,9 @@ struct llm_build_context {
13790
14200
  {
13791
14201
  // compute Q and K and RoPE them
13792
14202
  struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
13793
- Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
14203
+ if (model.layers[il].wq_scale) {
14204
+ Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
14205
+ }
13794
14206
  cb(Qcur, "Qcur", il);
13795
14207
  if (model.layers[il].bq) {
13796
14208
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -13799,7 +14211,9 @@ struct llm_build_context {
13799
14211
 
13800
14212
  // B1.K
13801
14213
  struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
13802
- Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
14214
+ if (model.layers[il].wk_scale) {
14215
+ Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
14216
+ }
13803
14217
  cb(Kcur, "Kcur", il);
13804
14218
  if (model.layers[il].bk) {
13805
14219
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -13808,7 +14222,9 @@ struct llm_build_context {
13808
14222
 
13809
14223
  // B1.V
13810
14224
  struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13811
- Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
14225
+ if (model.layers[il].wv_scale) {
14226
+ Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
14227
+ }
13812
14228
  cb(Vcur, "Vcur", il);
13813
14229
  if (model.layers[il].bv) {
13814
14230
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -13839,7 +14255,9 @@ struct llm_build_context {
13839
14255
  cb(cur, "attn_sub_norm", il);
13840
14256
 
13841
14257
  cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
13842
- cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
14258
+ if (model.layers[il].wo_scale) {
14259
+ cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
14260
+ }
13843
14261
  if (model.layers[il].bo) {
13844
14262
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bo);
13845
14263
  }
@@ -13876,7 +14294,9 @@ struct llm_build_context {
13876
14294
  cb(cur, "ffn_sub_norm", il);
13877
14295
 
13878
14296
  cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
13879
- cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
14297
+ if (model.layers[il].ffn_down_scale) {
14298
+ cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
14299
+ }
13880
14300
  cb(cur, "ffn_down", il);
13881
14301
 
13882
14302
  cur = lm_ggml_add(ctx0, cur, ffn_inp);
@@ -14691,6 +15111,117 @@ struct llm_build_context {
14691
15111
 
14692
15112
  return gf;
14693
15113
  }
15114
+
15115
+ lm_ggml_cgraph * build_rwkv6() {
15116
+ lm_ggml_cgraph *gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15117
+
15118
+ // Token shift state dimensions should be 2 * n_emb
15119
+ LM_GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
15120
+
15121
+ const int64_t n_seqs = batch.n_seqs;
15122
+ const int64_t n_seq_tokens = batch.n_seq_tokens;
15123
+ const int64_t n_tokens = batch.n_tokens;
15124
+ LM_GGML_ASSERT(n_seqs != 0);
15125
+ LM_GGML_ASSERT(batch.equal_seqs);
15126
+ LM_GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
15127
+
15128
+ struct lm_ggml_tensor * cur;
15129
+ struct lm_ggml_tensor * inpL;
15130
+ struct lm_ggml_tensor * state_copy = build_inp_s_copy();
15131
+ struct lm_ggml_tensor * state_mask = build_inp_s_mask();
15132
+
15133
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
15134
+ inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
15135
+
15136
+ for (int il = 0; il < n_layer; ++il) {
15137
+ const llama_layer * layer = &model.layers[il];
15138
+
15139
+ // (ab)using the KV cache to store the states
15140
+ struct lm_ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
15141
+ gf, kv_self.k_l[il], state_copy, state_mask,
15142
+ hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
15143
+ struct lm_ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
15144
+ gf, kv_self.v_l[il], state_copy, state_mask,
15145
+ hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
15146
+
15147
+ cur = lm_ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
15148
+ token_shift = lm_ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);
15149
+
15150
+ struct lm_ggml_tensor * att_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
15151
+ struct lm_ggml_tensor * ffn_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * lm_ggml_element_size(token_shift));
15152
+
15153
+ struct lm_ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
15154
+ struct lm_ggml_tensor * x_prev = lm_ggml_concat(
15155
+ ctx0,
15156
+ att_shift,
15157
+ lm_ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
15158
+ 1
15159
+ );
15160
+
15161
+ cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
15162
+ lm_ggml_build_forward_expand(gf, cur);
15163
+ lm_ggml_build_forward_expand(
15164
+ gf,
15165
+ lm_ggml_cpy(
15166
+ ctx0,
15167
+ wkv_states,
15168
+ lm_ggml_view_1d(
15169
+ ctx0,
15170
+ kv_self.v_l[il],
15171
+ hparams.n_embd_v_s() * n_seqs,
15172
+ hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
15173
+ )
15174
+ )
15175
+ );
15176
+
15177
+ struct lm_ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
15178
+ x_prev = lm_ggml_concat(
15179
+ ctx0,
15180
+ ffn_shift,
15181
+ lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
15182
+ 1
15183
+ );
15184
+ cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
15185
+ lm_ggml_build_forward_expand(gf, cur);
15186
+
15187
+ struct lm_ggml_tensor * last_norm_att = lm_ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_att));
15188
+ struct lm_ggml_tensor * last_norm_ffn = lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_ffn));
15189
+
15190
+ token_shift = lm_ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
15191
+
15192
+ lm_ggml_build_forward_expand(
15193
+ gf,
15194
+ lm_ggml_cpy(
15195
+ ctx0,
15196
+ lm_ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
15197
+ lm_ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * lm_ggml_element_size(kv_self.k_l[il]))
15198
+ )
15199
+ );
15200
+
15201
+ if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
15202
+ cur = lm_ggml_scale(ctx0, cur, 0.5F);
15203
+ }
15204
+
15205
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
15206
+ cb(cur, "l_out", il);
15207
+
15208
+ // input for next layer
15209
+ inpL = cur;
15210
+ }
15211
+
15212
+ cur = inpL;
15213
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
15214
+ cur = lm_ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
15215
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
15216
+
15217
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
15218
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
15219
+
15220
+ cb(cur, "result_output", -1);
15221
+ lm_ggml_build_forward_expand(gf, cur);
15222
+
15223
+ return gf;
15224
+ }
14694
15225
  };
14695
15226
 
14696
15227
  static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
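build_rwkv6 above reuses the recurrent KV cache as state storage: kv_self.k_l[il] holds the two token-shift vectors per sequence (hparams.n_embd_k_s() = 2 * n_embd for RWKV) and kv_self.v_l[il] holds the wkv state (hparams.n_embd_v_s() = n_embd * wkv_head_size). A small sketch of the resulting per-layer, per-sequence footprint, mirroring those two accessors; the concrete numbers are illustrative only and the F32 storage is an assumption:

    #include <cstdint>
    #include <cstdio>

    // Bytes of recurrent state kept per RWKV6 layer for one sequence,
    // assuming the states are stored as F32.
    static uint64_t rwkv6_state_bytes_per_layer(uint64_t n_embd, uint64_t head_size) {
        const uint64_t token_shift = 2 * n_embd;          // attention shift + ffn shift rows
        const uint64_t wkv_state   = n_embd * head_size;  // head_count * head_size * head_size
        return (token_shift + wkv_state) * sizeof(float);
    }

    int main() {
        // e.g. n_embd = 2048, head_size = 64 -> 135168 floats, about 528 KiB per layer per sequence
        std::printf("%llu bytes\n", (unsigned long long) rwkv6_state_bytes_per_layer(2048, 64));
    }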
@@ -14937,6 +15468,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
14937
15468
  {
14938
15469
  result = llm.build_exaone();
14939
15470
  } break;
15471
+ case LLM_ARCH_RWKV6:
15472
+ {
15473
+ result = llm.build_rwkv6();
15474
+ } break;
14940
15475
  default:
14941
15476
  LM_GGML_ABORT("fatal error");
14942
15477
  }
@@ -15505,9 +16040,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
15505
16040
  }
15506
16041
 
15507
16042
  static void llama_graph_compute(
15508
- llama_context & lctx,
15509
- lm_ggml_cgraph * gf,
15510
- int n_threads) {
16043
+ llama_context & lctx,
16044
+ lm_ggml_cgraph * gf,
16045
+ int n_threads,
16046
+ lm_ggml_threadpool * threadpool) {
15511
16047
  #ifdef LM_GGML_USE_METAL
15512
16048
  if (lm_ggml_backend_is_metal(lctx.backend_metal)) {
15513
16049
  lm_ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -15516,6 +16052,7 @@ static void llama_graph_compute(
15516
16052
 
15517
16053
  if (lctx.backend_cpu != nullptr) {
15518
16054
  lm_ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
16055
+ lm_ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
15519
16056
  lm_ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
15520
16057
  }
15521
16058
  #ifdef LM_GGML_USE_BLAS
@@ -15550,6 +16087,13 @@ static int llama_decode_internal(
15550
16087
  return -1;
15551
16088
  }
15552
16089
 
16090
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
16091
+ if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
16092
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
16093
+ return -1;
16094
+ }
16095
+ }
16096
+
15553
16097
  const auto & model = lctx.model;
15554
16098
  const auto & hparams = model.hparams;
15555
16099
  const auto & cparams = lctx.cparams;
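llama_decode_internal (here) and llama_encode_internal (further below) now validate every token id against vocab.n_vocab before building the graph and fail with -1 instead of indexing past the embedding table; this is also why llm_load_vocab now records vocab.n_vocab. A caller-side equivalent using only the public llama_n_vocab accessor, shown as a sketch:

    // Optional pre-flight check before llama_decode(); the library now performs
    // the same range validation internally and returns -1 when it fails.
    static bool tokens_in_range(const struct llama_model * model, const llama_token * tokens, int32_t n) {
        const int32_t n_vocab = llama_n_vocab(model);
        for (int32_t i = 0; i < n; ++i) {
            if (tokens[i] < 0 || tokens[i] >= n_vocab) {
                return false;
            }
        }
        return true;
    }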
@@ -15636,6 +16180,8 @@ static int llama_decode_internal(
15636
16180
  }
15637
16181
 
15638
16182
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
16183
+ lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
16184
+
15639
16185
  LM_GGML_ASSERT(n_threads > 0);
15640
16186
 
15641
16187
  // non-causal masks do not use the KV cache
@@ -15697,7 +16243,7 @@ static int llama_decode_internal(
15697
16243
 
15698
16244
  llama_set_inputs(lctx, ubatch);
15699
16245
 
15700
- llama_graph_compute(lctx, gf, n_threads);
16246
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
15701
16247
 
15702
16248
  // update the kv ring buffer
15703
16249
  {
@@ -15840,6 +16386,13 @@ static int llama_encode_internal(
15840
16386
  return -1;
15841
16387
  }
15842
16388
 
16389
+ for (uint32_t i = 0; i < n_tokens; ++i) {
16390
+ if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
16391
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
16392
+ return -1;
16393
+ }
16394
+ }
16395
+
15843
16396
  const auto & model = lctx.model;
15844
16397
  const auto & hparams = model.hparams;
15845
16398
  const auto & cparams = lctx.cparams;
@@ -15874,7 +16427,9 @@ static int llama_encode_internal(
15874
16427
  lctx.inp_embd_enc = NULL;
15875
16428
  lctx.n_outputs = n_tokens;
15876
16429
 
15877
- const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
16430
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
16431
+ lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
16432
+
15878
16433
  LM_GGML_ASSERT(n_threads > 0);
15879
16434
 
15880
16435
  lm_ggml_backend_sched_reset(lctx.sched);
@@ -15906,7 +16461,7 @@ static int llama_encode_internal(
15906
16461
 
15907
16462
  llama_set_inputs(lctx, ubatch);
15908
16463
 
15909
- llama_graph_compute(lctx, gf, n_threads);
16464
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
15910
16465
 
15911
16466
  // extract embeddings
15912
16467
  if (embd) {
@@ -16188,7 +16743,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
16188
16743
 
16189
16744
  lm_ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
16190
16745
 
16191
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
16746
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
16192
16747
  #endif
16193
16748
 
16194
16749
  //const int64_t t_end = lm_ggml_time_us();
@@ -16214,7 +16769,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
16214
16769
 
16215
16770
  llama_set_k_shift(lctx);
16216
16771
 
16217
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
16772
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
16218
16773
 
16219
16774
  need_reserve = true;
16220
16775
  }
@@ -16425,6 +16980,9 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
16425
16980
  new_type == LM_GGML_TYPE_Q4_0_8_8) {
16426
16981
  new_type = LM_GGML_TYPE_Q4_0;
16427
16982
  }
16983
+ else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
16984
+ new_type = LM_GGML_TYPE_Q4_K;
16985
+ }
16428
16986
  }
16429
16987
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
16430
16988
  ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -16624,6 +17182,8 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
16624
17182
  }
16625
17183
  if (convert_incompatible_tensor) {
16626
17184
  switch (new_type) {
17185
+ case LM_GGML_TYPE_TQ1_0:
17186
+ case LM_GGML_TYPE_TQ2_0: new_type = LM_GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
16627
17187
  case LM_GGML_TYPE_IQ2_XXS:
16628
17188
  case LM_GGML_TYPE_IQ2_XS:
16629
17189
  case LM_GGML_TYPE_IQ2_S:
@@ -16729,6 +17289,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16729
17289
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
16730
17290
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = LM_GGML_TYPE_Q5_K; break;
16731
17291
  case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = LM_GGML_TYPE_Q6_K; break;
17292
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = LM_GGML_TYPE_TQ1_0; break;
17293
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = LM_GGML_TYPE_TQ2_0; break;
16732
17294
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = LM_GGML_TYPE_IQ2_XXS; break;
16733
17295
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = LM_GGML_TYPE_IQ2_XS; break;
16734
17296
  case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = LM_GGML_TYPE_IQ2_XS; break;
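With the TQ1_0 / TQ2_0 entries wired into the ftype tables, the ternary formats can be requested through the normal quantization entry point. A minimal sketch, assuming the public llama_model_quantize API and placeholder file names:

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_TQ1_0;   // or LLAMA_FTYPE_MOSTLY_TQ2_0 (2.06 bpw)

    // input/output paths are placeholders
    if (llama_model_quantize("model-f16.gguf", "model-tq1_0.gguf", &qparams) != 0) {
        // quantization failed
    }

The companion hunks above also give these types sensible fallbacks (Q4_K or Q4_0) for tensors that cannot be stored in a ternary layout.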
@@ -16833,7 +17395,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16833
17395
 
16834
17396
  // TODO: avoid hardcoded tensor names - use the TN_* constants
16835
17397
  if (name.find("attn_v.weight") != std::string::npos ||
16836
- name.find("attn_qkv.weight") != std::string::npos) {
17398
+ name.find("attn_qkv.weight") != std::string::npos ||
17399
+ name.find("attn_kv_b.weight")!= std::string::npos) {
16837
17400
  ++qs.n_attention_wv;
16838
17401
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
16839
17402
  qs.has_output = true;
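This hunk widens the substring match that counts attention tensors (attn_kv_b.weight now contributes to n_attention_wv), and the hunk that follows uses the same technique to keep RWKV's time_mix_first/w1/w2 tensors unquantized. A small illustration of the idiom, with a hypothetical helper name:

    #include <string>

    // Hypothetical helper showing the substring test these decisions rely on.
    static bool name_has(const std::string & name, const char * needle) {
        return name.find(needle) != std::string::npos;
    }

    // counting attention tensors, now including attn_kv_b:
    //   if (name_has(name, "attn_v.weight") || name_has(name, "attn_qkv.weight") ||
    //       name_has(name, "attn_kv_b.weight")) { ++qs.n_attention_wv; }
    // keeping RWKV mixing tensors in their original precision (next hunk):
    //   quantize &= !name_has(name, "time_mix_first.weight");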
@@ -16974,6 +17537,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16974
17537
  // NOTE: can't use LLM_TN here because the layer number is not known
16975
17538
  quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
16976
17539
 
17540
+ // do not quantize RWKV's time_mix_first tensors
17541
+ quantize &= name.find("time_mix_first.weight") == std::string::npos;
17542
+ quantize &= name.find("time_mix_w1.weight") == std::string::npos;
17543
+ quantize &= name.find("time_mix_w2.weight") == std::string::npos;
17544
+
16977
17545
  // do not quantize relative position bias (T5)
16978
17546
  quantize &= name.find("attn_rel_b.weight") == std::string::npos;
16979
17547
 
@@ -17357,7 +17925,6 @@ struct llama_model_params llama_model_default_params() {
17357
17925
 
17358
17926
  struct llama_context_params llama_context_default_params() {
17359
17927
  struct llama_context_params result = {
17360
- /*.seed =*/ LLAMA_DEFAULT_SEED,
17361
17928
  /*.n_ctx =*/ 512,
17362
17929
  /*.n_batch =*/ 2048,
17363
17930
  /*.n_ubatch =*/ 512,
@@ -17390,6 +17957,14 @@ struct llama_context_params llama_context_default_params() {
17390
17957
  return result;
17391
17958
  }
17392
17959
 
17960
+ struct llama_sampler_chain_params llama_sampler_chain_default_params() {
17961
+ struct llama_sampler_chain_params result = {
17962
+ /*.no_perf =*/ true,
17963
+ };
17964
+
17965
+ return result;
17966
+ }
17967
+
17393
17968
  struct llama_model_quantize_params llama_model_quantize_default_params() {
17394
17969
  struct llama_model_quantize_params result = {
17395
17970
  /*.nthread =*/ 0,
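With the seed field gone from llama_context_params and the RNG no longer owned by the context (see the hunks above and below), randomness and sampling configuration move into sampler chains created from the new llama_sampler_chain_default_params(). A hedged sketch, assuming the chain constructors from this sampling refactor (llama_sampler_chain_init/_add, llama_sampler_init_top_k/_temp/_dist, llama_sampler_free) are exposed by the accompanying header:

    #include "llama.h"

    // Sketch: per-chain sampling setup; the seed is now per sampler, not per context.
    static struct llama_sampler * make_chain(void) {
        llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
        sparams.no_perf = false;                                        // keep sampler timing counters

        struct llama_sampler * chain = llama_sampler_chain_init(sparams);
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
        llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(1234));  // explicit seed
        return chain;                                                   // release with llama_sampler_free()
    }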
@@ -17461,6 +18036,19 @@ void llama_numa_init(enum lm_ggml_numa_strategy numa) {
17461
18036
  }
17462
18037
  }
17463
18038
 
18039
+ void llama_attach_threadpool(
18040
+ struct llama_context * ctx,
18041
+ lm_ggml_threadpool_t threadpool,
18042
+ lm_ggml_threadpool_t threadpool_batch) {
18043
+ ctx->threadpool = threadpool;
18044
+ ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
18045
+ }
18046
+
18047
+ void llama_detach_threadpool(struct llama_context * ctx) {
18048
+ ctx->threadpool = nullptr;
18049
+ ctx->threadpool_batch = nullptr;
18050
+ }
18051
+
17464
18052
  void llama_backend_free(void) {
17465
18053
  lm_ggml_quantize_free();
17466
18054
  }
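llama_attach_threadpool / llama_detach_threadpool are new, and the earlier hunks route the attached pool into llama_graph_compute on the encode, defrag and K-shift paths. A sketch of wiring an explicit pool into a context; the lm_ggml_threadpool_params_default / _new / _free helpers are assumed to be this revision's (prefixed) ggml threadpool API:

    #include "llama.h"

    // Sketch: drive a context from an explicitly created threadpool.
    static void run_with_threadpool(struct llama_context * ctx, int n_threads) {
        struct lm_ggml_threadpool_params tpp = lm_ggml_threadpool_params_default(n_threads);
        lm_ggml_threadpool_t tp = lm_ggml_threadpool_new(&tpp);

        llama_attach_threadpool(ctx, tp, nullptr);  // null batch pool -> same pool is used for batches

        // ... llama_decode() / llama_encode() calls here run on the attached pool ...

        llama_detach_threadpool(ctx);
        lm_ggml_threadpool_free(tp);
    }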
@@ -17630,10 +18218,6 @@ struct llama_context * llama_new_context_with_model(
17630
18218
  cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
17631
18219
  }
17632
18220
 
17633
- if (params.seed == LLAMA_DEFAULT_SEED) {
17634
- params.seed = time(NULL);
17635
- }
17636
-
17637
18221
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
17638
18222
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
17639
18223
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
@@ -17644,10 +18228,10 @@ struct llama_context * llama_new_context_with_model(
17644
18228
  ctx->abort_callback = params.abort_callback;
17645
18229
  ctx->abort_callback_data = params.abort_callback_data;
17646
18230
 
17647
- ctx->sampling.rng = std::mt19937(params.seed);
17648
- ctx->logits_all = params.logits_all;
18231
+ ctx->logits_all = params.logits_all;
18232
+
17649
18233
  // build worst-case graph for encoder if a model contains encoder
17650
- ctx->is_encoding = llama_model_has_encoder(model);
18234
+ ctx->is_encoding = llama_model_has_encoder(model);
17651
18235
 
17652
18236
  uint32_t kv_size = cparams.n_ctx;
17653
18237
  lm_ggml_type type_k = params.type_k;
@@ -17667,6 +18251,20 @@ struct llama_context * llama_new_context_with_model(
17667
18251
 
17668
18252
  if (!hparams.vocab_only) {
17669
18253
  // initialize backends
18254
+ #if defined(LM_GGML_USE_RPC)
18255
+ if (model->n_gpu_layers > 0) {
18256
+ for (const auto & endpoint : model->rpc_servers) {
18257
+ lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
18258
+ if (backend == nullptr) {
18259
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
18260
+ llama_free(ctx);
18261
+ return nullptr;
18262
+ }
18263
+ ctx->backends.push_back(backend);
18264
+ }
18265
+ }
18266
+ #endif
18267
+
17670
18268
  #if defined(LM_GGML_USE_METAL)
17671
18269
  if (model->n_gpu_layers > 0) {
17672
18270
  ctx->backend_metal = lm_ggml_backend_metal_init();
@@ -17791,19 +18389,6 @@ struct llama_context * llama_new_context_with_model(
17791
18389
  }
17792
18390
  #endif
17793
18391
 
17794
- #if defined(LM_GGML_USE_RPC)
17795
- if (model->n_gpu_layers > 0) {
17796
- for (const auto & endpoint : model->rpc_servers) {
17797
- lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
17798
- if (backend == nullptr) {
17799
- LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
17800
- llama_free(ctx);
17801
- return nullptr;
17802
- }
17803
- ctx->backends.push_back(backend);
17804
- }
17805
- }
17806
- #endif
17807
18392
  ctx->backend_cpu = lm_ggml_backend_cpu_init();
17808
18393
  if (ctx->backend_cpu == nullptr) {
17809
18394
  LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -17924,14 +18509,6 @@ void llama_free(struct llama_context * ctx) {
17924
18509
  delete ctx;
17925
18510
  }
17926
18511
 
17927
- const struct llama_model * llama_get_model(const struct llama_context * ctx) {
17928
- return &ctx->model;
17929
- }
17930
-
17931
- const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
17932
- return &ctx->model.vocab;
17933
- }
17934
-
17935
18512
  uint32_t llama_n_ctx(const struct llama_context * ctx) {
17936
18513
  return ctx->cparams.n_ctx;
17937
18514
  }
@@ -17952,6 +18529,30 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
17952
18529
  return model->vocab.type;
17953
18530
  }
17954
18531
 
18532
+ int32_t llama_n_vocab(const struct llama_model * model) {
18533
+ return model->hparams.n_vocab;
18534
+ }
18535
+
18536
+ int32_t llama_n_ctx_train(const struct llama_model * model) {
18537
+ return model->hparams.n_ctx_train;
18538
+ }
18539
+
18540
+ int32_t llama_n_embd(const struct llama_model * model) {
18541
+ return model->hparams.n_embd;
18542
+ }
18543
+
18544
+ int32_t llama_n_layer(const struct llama_model * model) {
18545
+ return model->hparams.n_layer;
18546
+ }
18547
+
18548
+ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
18549
+ return &ctx->model;
18550
+ }
18551
+
18552
+ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
18553
+ return ctx->cparams.pooling_type;
18554
+ }
18555
+
17955
18556
  enum llama_rope_type llama_rope_type(const struct llama_model * model) {
17956
18557
  switch (model->arch) {
17957
18558
  // these models do not use RoPE
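The model introspection getters (llama_n_vocab, llama_n_ctx_train, llama_n_embd, llama_n_layer, llama_get_model, llama_pooling_type) are only relocated here, not changed; together they cover basic model/context inspection without touching internals:

    #include <cstdio>
    #include "llama.h"

    // Sketch: query basic model properties through the getters defined above.
    static void log_model_shape(const struct llama_context * ctx) {
        const struct llama_model * model = llama_get_model(ctx);

        printf("vocab=%d ctx_train=%d embd=%d layers=%d pooling=%d\n",
               llama_n_vocab(model),
               llama_n_ctx_train(model),
               llama_n_embd(model),
               llama_n_layer(model),
               (int) llama_pooling_type(ctx));
    }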
@@ -17965,6 +18566,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
17965
18566
  case LLM_ARCH_T5:
17966
18567
  case LLM_ARCH_T5ENCODER:
17967
18568
  case LLM_ARCH_JAIS:
18569
+ case LLM_ARCH_RWKV6:
17968
18570
  return LLAMA_ROPE_TYPE_NONE;
17969
18571
 
17970
18572
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18014,26 +18616,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
18014
18616
  return LLAMA_ROPE_TYPE_NONE;
18015
18617
  }
18016
18618
 
18017
- enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
18018
- return ctx->cparams.pooling_type;
18019
- }
18020
-
18021
- int32_t llama_n_vocab(const struct llama_model * model) {
18022
- return model->hparams.n_vocab;
18023
- }
18024
-
18025
- int32_t llama_n_ctx_train(const struct llama_model * model) {
18026
- return model->hparams.n_ctx_train;
18027
- }
18028
-
18029
- int32_t llama_n_embd(const struct llama_model * model) {
18030
- return model->hparams.n_embd;
18031
- }
18032
-
18033
- int32_t llama_n_layer(const struct llama_model * model) {
18034
- return model->hparams.n_layer;
18035
- }
18036
-
18037
18619
  float llama_rope_freq_scale_train(const struct llama_model * model) {
18038
18620
  return model->hparams.rope_freq_scale_train;
18039
18621
  }
@@ -18133,6 +18715,7 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) {
18133
18715
  bool llama_model_is_recurrent(const struct llama_model * model) {
18134
18716
  switch (model->arch) {
18135
18717
  case LLM_ARCH_MAMBA: return true;
18718
+ case LLM_ARCH_RWKV6: return true;
18136
18719
  default: return false;
18137
18720
  }
18138
18721
  }
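RWKV6 now reports as recurrent alongside Mamba, which matters for callers that assume a positional KV cache (context shifting, partial cache reuse, and similar tricks). A minimal guard:

    #include "llama.h"

    // Sketch: check the recurrence flag before attempting KV-cache-style context manipulation.
    static bool can_shift_context(const struct llama_model * model) {
        // recurrent models (Mamba, and now RWKV6) carry rolling state instead of a positional KV cache
        return !llama_model_is_recurrent(model);
    }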
@@ -18449,14 +19032,14 @@ struct llama_data_write {
18449
19032
  // TODO: add more model-specific info which should prevent loading the session file if not identical
18450
19033
  }
18451
19034
 
18452
- void write_rng(const std::mt19937 & rng) {
18453
- std::ostringstream rng_ss;
18454
- rng_ss << rng;
19035
+ //void write_rng(const std::mt19937 & rng) {
19036
+ // std::ostringstream rng_ss;
19037
+ // rng_ss << rng;
18455
19038
 
18456
- const std::string & rng_str = rng_ss.str();
19039
+ // const std::string & rng_str = rng_ss.str();
18457
19040
 
18458
- write_string(rng_str);
18459
- }
19041
+ // write_string(rng_str);
19042
+ //}
18460
19043
 
18461
19044
  void write_output_ids(struct llama_context * ctx) {
18462
19045
  llama_output_reorder(ctx);
@@ -18676,17 +19259,17 @@ struct llama_data_read {
18676
19259
  // TODO: add more info which needs to be identical but which is not verified otherwise
18677
19260
  }
18678
19261
 
18679
- void read_rng(std::mt19937 & rng) {
18680
- std::string rng_str;
18681
- read_string(rng_str);
19262
+ //void read_rng(std::mt19937 & rng) {
19263
+ // std::string rng_str;
19264
+ // read_string(rng_str);
18682
19265
 
18683
- std::istringstream rng_ss(rng_str);
18684
- rng_ss >> rng;
19266
+ // std::istringstream rng_ss(rng_str);
19267
+ // rng_ss >> rng;
18685
19268
 
18686
- if (rng_ss.fail()) {
18687
- throw std::runtime_error("failed to load RNG state");
18688
- }
18689
- }
19269
+ // if (rng_ss.fail()) {
19270
+ // throw std::runtime_error("failed to load RNG state");
19271
+ // }
19272
+ //}
18690
19273
 
18691
19274
  void read_output_ids(struct llama_context * ctx) {
18692
19275
  std::vector<int32_t> output_pos;
@@ -19116,8 +19699,6 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da
19116
19699
 
19117
19700
  data_ctx.write_model_info(ctx);
19118
19701
 
19119
- data_ctx.write_rng(ctx->sampling.rng);
19120
-
19121
19702
  // copy outputs
19122
19703
  data_ctx.write_output_ids(ctx);
19123
19704
  data_ctx.write_logits(ctx);
@@ -19155,9 +19736,6 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da
19155
19736
 
19156
19737
  data_ctx.read_model_info(ctx);
19157
19738
 
19158
- // set rng
19159
- data_ctx.read_rng(ctx->sampling.rng);
19160
-
19161
19739
  // set outputs
19162
19740
  data_ctx.read_output_ids(ctx);
19163
19741
  data_ctx.read_logits(ctx);
@@ -19377,16 +19955,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
19377
19955
  }
19378
19956
  }
19379
19957
 
19380
- void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
19958
+ void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
19381
19959
  ctx->cparams.n_threads = n_threads;
19382
19960
  ctx->cparams.n_threads_batch = n_threads_batch;
19383
19961
  }
19384
19962
 
19385
- uint32_t llama_n_threads(struct llama_context * ctx) {
19963
+ int32_t llama_n_threads(struct llama_context * ctx) {
19386
19964
  return ctx->cparams.n_threads;
19387
19965
  }
19388
19966
 
19389
- uint32_t llama_n_threads_batch(struct llama_context * ctx) {
19967
+ int32_t llama_n_threads_batch(struct llama_context * ctx) {
19390
19968
  return ctx->cparams.n_threads_batch;
19391
19969
  }
19392
19970
 
@@ -19560,8 +20138,9 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
19560
20138
  LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
19561
20139
  #ifndef NDEBUG
19562
20140
  LM_GGML_ABORT("fatal error");
19563
- #endif
20141
+ #else
19564
20142
  return nullptr;
20143
+ #endif
19565
20144
  }
19566
20145
  }
19567
20146
 
@@ -19609,8 +20188,9 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
19609
20188
  LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
19610
20189
  #ifndef NDEBUG
19611
20190
  LM_GGML_ABORT("fatal error");
19612
- #endif
20191
+ #else
19613
20192
  return nullptr;
20193
+ #endif
19614
20194
  }
19615
20195
  }
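The flipped preprocessor branches above keep the release-mode return out of the debug path: on an invalid index, llama_get_logits_ith and llama_get_embeddings_ith abort in debug builds and return nullptr in release builds, so callers should check the pointer:

    #include <algorithm>
    #include <cmath>
    #include "llama.h"

    // Sketch: defensive use of the ith accessor after the #else fix above.
    static float max_logit_at(struct llama_context * ctx, int32_t i, int32_t n_vocab) {
        const float * logits = llama_get_logits_ith(ctx, i);
        if (logits == nullptr) {   // invalid id, e.g. logits were not requested for this token
            return -INFINITY;
        }
        float best = logits[0];
        for (int32_t t = 1; t < n_vocab; ++t) {
            best = std::max(best, logits[t]);
        }
        return best;
    }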
19616
20196
 
@@ -20044,128 +20624,18 @@ int32_t llama_chat_apply_template(
20044
20624
  }
20045
20625
 
20046
20626
  //
20047
- // grammar
20627
+ // sampling
20048
20628
  //
20049
20629
 
20050
- struct llama_grammar * llama_grammar_init(
20051
- const llama_grammar_element ** rules,
20052
- size_t n_rules,
20053
- size_t start_rule_index) {
20054
- return llama_grammar_init_impl(rules, n_rules, start_rule_index);
20055
- }
20056
-
20057
- void llama_grammar_free(struct llama_grammar * grammar) {
20058
- llama_grammar_free_impl(grammar);
20059
- }
20060
-
20061
- struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
20062
- return llama_grammar_copy_impl(grammar);
20063
- }
20064
-
20065
- void llama_grammar_sample(
20066
- const struct llama_grammar * grammar,
20067
- const struct llama_context * ctx,
20068
- llama_token_data_array * candidates) {
20069
- llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates);
20070
- }
20071
-
20072
- void llama_sample_grammar(
20073
- struct llama_context * ctx,
20074
- llama_token_data_array * candidates,
20075
- const struct llama_grammar * grammar) {
20076
- llama_grammar_sample(grammar, ctx, candidates);
20077
- }
20078
-
20079
- void llama_grammar_accept_token(
20080
- struct llama_grammar * grammar,
20081
- struct llama_context * ctx,
20082
- llama_token token) {
20083
- llama_grammar_accept_token_impl(grammar, &ctx->model.vocab, &ctx->sampling, token);
20630
+ // TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp
20631
+ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
20632
+ return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
20084
20633
  }
20085
20634
 
20086
20635
  //
20087
- // sampling
20636
+ // model split
20088
20637
  //
20089
20638
 
20090
- void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
20091
- llama_set_rng_seed_impl(&ctx->sampling, seed);
20092
- }
20093
-
20094
- void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
20095
- llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates);
20096
- }
20097
-
20098
- void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
20099
- llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep);
20100
- }
20101
-
20102
- void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
20103
- llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
20104
- }
20105
-
20106
- void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
20107
- llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
20108
- }
20109
-
20110
- void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates, float xtc_threshold, float xtc_probability, size_t min_keep, std::mt19937 rng){
20111
- llama_sample_xtc_impl(ctx ? &ctx-> sampling: nullptr, candidates, xtc_threshold, xtc_probability, min_keep, rng);
20112
- }
20113
-
20114
- void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
20115
- llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
20116
- }
20117
-
20118
- void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
20119
- llama_sample_typical_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
20120
- }
20121
-
20122
- void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
20123
- llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val);
20124
- }
20125
-
20126
- void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
20127
- llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
20128
- }
20129
-
20130
- void llama_sample_repetition_penalties(
20131
- struct llama_context * ctx,
20132
- llama_token_data_array * candidates,
20133
- const llama_token * last_tokens,
20134
- size_t penalty_last_n,
20135
- float penalty_repeat,
20136
- float penalty_freq,
20137
- float penalty_present) {
20138
- llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
20139
- }
20140
-
20141
- void llama_sample_apply_guidance(
20142
- struct llama_context * ctx,
20143
- float * logits,
20144
- float * logits_guidance,
20145
- float scale) {
20146
- llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale);
20147
- }
20148
-
20149
- llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
20150
- return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu);
20151
- }
20152
-
20153
- llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
20154
- return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu);
20155
- }
20156
-
20157
- llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
20158
- return llama_sample_token_greedy_impl(ctx ? &ctx->sampling : nullptr, candidates);
20159
- }
20160
-
20161
- llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
20162
- return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng);
20163
- }
20164
-
20165
- llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
20166
- return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng);
20167
- }
20168
-
20169
20639
  int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
20170
20640
  static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
20171
20641
  if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
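The grammar object API (llama_grammar_init/_free/_copy, llama_grammar_sample, llama_grammar_accept_token) and the free-standing llama_sample_* functions are removed; grammar-constrained decoding now goes through llama_sampler_init_grammar plus a sampler chain. A hedged migration sketch (the GBNF text is a placeholder, and the chain helpers are assumptions consistent with this refactor):

    #include "llama.h"

    // Sketch: grammar-constrained sampling via the new sampler object.
    // `gbnf` is a GBNF grammar string whose entry rule is named "root".
    static struct llama_sampler * make_grammar_chain(const struct llama_model * model, const char * gbnf) {
        struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_grammar(model, gbnf, "root"));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(42));   // explicit seed
        return chain;
        // per token, llama_sampler_sample(chain, ctx, -1) is expected to both apply the
        // grammar constraint and accept the chosen token into the grammar state
    }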
@@ -20190,45 +20660,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
20190
20660
  return 0;
20191
20661
  }
20192
20662
 
20193
- struct llama_timings llama_get_timings(struct llama_context * ctx) {
20194
- struct llama_timings result = {
20195
- /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
20196
- /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
20197
- /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
20198
- /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us,
20199
- /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
20200
- /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
20201
-
20202
- /*.n_sample =*/ std::max(1, ctx->sampling.n_sample),
20203
- /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
20204
- /*.n_eval =*/ std::max(1, ctx->n_eval),
20205
- };
20206
-
20207
- return result;
20208
- }
20209
-
20210
- void llama_print_timings(struct llama_context * ctx) {
20211
- const llama_timings timings = llama_get_timings(ctx);
20212
-
20213
- LLAMA_LOG_INFO("\n");
20214
- LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
20215
- LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20216
- __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
20217
- LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
20218
- __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
20219
- LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20220
- __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
20221
- LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
20222
- }
20223
-
20224
- void llama_reset_timings(struct llama_context * ctx) {
20225
- ctx->t_start_us = lm_ggml_time_us();
20226
- ctx->t_eval_us = ctx->n_eval = 0;
20227
- ctx->t_p_eval_us = ctx->n_p_eval = 0;
20228
-
20229
- ctx->sampling.reset_timings();
20230
- }
20231
-
20232
20663
  const char * llama_print_system_info(void) {
20233
20664
  static std::string s;
20234
20665
 
@@ -20257,7 +20688,68 @@ const char * llama_print_system_info(void) {
20257
20688
  return s.c_str();
20258
20689
  }
20259
20690
 
20260
- void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
20691
+ void llama_perf_print(const void * ctx, enum llama_perf_type type) {
20692
+ switch (type) {
20693
+ case LLAMA_PERF_TYPE_CONTEXT:
20694
+ {
20695
+ const auto * p = (const struct llama_context *) ctx;
20696
+
20697
+ const double t_start_ms = 1e-3 * p->t_start_us;
20698
+ const double t_end_ms = 1.00 * lm_ggml_time_ms();
20699
+ const double t_load_ms = 1e-3 * p->t_load_us;
20700
+ const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
20701
+ const double t_eval_ms = 1e-3 * p->t_eval_us;
20702
+
20703
+ const int32_t n_p_eval = std::max(0, p->n_p_eval);
20704
+ const int32_t n_eval = std::max(1, p->n_eval);
20705
+
20706
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
20707
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
20708
+ __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
20709
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20710
+ __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
20711
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
20712
+ } break;
20713
+ case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
20714
+ {
20715
+ const auto * smpl = (const struct llama_sampler *) ctx;
20716
+ const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
20717
+
20718
+ const double t_sampler_ms = 1e-3 * p->t_sample_us;
20719
+
20720
+ const int32_t n_sampler = std::max(0, p->n_sample);
20721
+
20722
+ LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20723
+ __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
20724
+ } break;
20725
+ default:
20726
+ LM_GGML_ABORT("invalid perf type");
20727
+ }
20728
+ }
20729
+
20730
+ void llama_perf_reset(void * ctx, enum llama_perf_type type) {
20731
+ switch (type) {
20732
+ case LLAMA_PERF_TYPE_CONTEXT:
20733
+ {
20734
+ auto * p = (struct llama_context *) ctx;
20735
+
20736
+ p->t_start_us = lm_ggml_time_us();
20737
+ p->t_eval_us = p->n_eval = 0;
20738
+ p->t_p_eval_us = p->n_p_eval = 0;
20739
+ } break;
20740
+ case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
20741
+ {
20742
+ auto * smpl = (struct llama_sampler *) ctx;
20743
+ auto * p = (struct llama_sampler_chain *) smpl->ctx;
20744
+
20745
+ p->t_sample_us = p->n_sample = 0;
20746
+ } break;
20747
+ default:
20748
+ LM_GGML_ABORT("invalid perf type");
20749
+ }
20750
+ }
20751
+
20752
+ void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
20261
20753
  fprintf(stream, "\n");
20262
20754
  fprintf(stream, "###########\n");
20263
20755
  fprintf(stream, "# Timings #\n");
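llama_get_timings / llama_print_timings / llama_reset_timings are replaced by the type-tagged llama_perf_print / llama_perf_reset above, with context timings and sampler-chain timings handled separately (and llama_dump_timing_info_yaml renamed to llama_perf_dump_yaml). Minimal usage:

    #include "llama.h"

    // Sketch: report and reset timings with the new perf entry points.
    // `chain` must be a sampler chain, since LLAMA_PERF_TYPE_SAMPLER_CHAIN reads its counters.
    static void report_perf(struct llama_context * ctx, struct llama_sampler * chain) {
        llama_perf_print(ctx,   LLAMA_PERF_TYPE_CONTEXT);        // load / prompt-eval / eval timings
        llama_perf_print(chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);  // sampling timings
        llama_perf_reset(ctx,   LLAMA_PERF_TYPE_CONTEXT);        // zero the counters for the next run
    }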
@@ -20268,21 +20760,15 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
20268
20760
  1.0e-3 * ctx->t_eval_us / ctx->n_eval);
20269
20761
  fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
20270
20762
  1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
20271
- fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
20272
- 1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample);
20273
20763
  fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
20274
20764
  fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
20275
- fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->sampling.n_sample);
20276
20765
  fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
20277
20766
  fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
20278
20767
  fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
20279
- fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->sampling.t_sample_us);
20280
20768
  fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
20281
20769
  1.0e6 * ctx->n_eval / ctx->t_eval_us);
20282
20770
  fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
20283
20771
  1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
20284
- fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
20285
- 1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us);
20286
20772
  }
20287
20773
 
20288
20774
  // For internal test use
@@ -20334,3 +20820,20 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
20334
20820
  fputs(text, stderr);
20335
20821
  fflush(stderr);
20336
20822
  }
20823
+
20824
+ struct llama_token_timings llama_get_token_timings(const void * v_ctx) {
20825
+ const auto * ctx = (llama_context *) v_ctx;
20826
+ struct llama_token_timings result = {
20827
+ /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
20828
+ /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
20829
+ /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
20830
+ /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
20831
+ /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
20832
+
20833
+ /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
20834
+ /*.n_eval =*/ std::max(1, ctx->n_eval),
20835
+ };
20836
+
20837
+ return result;
20838
+ }
20839
+
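llama_get_token_timings at the end is an addition specific to this package rather than upstream llama.cpp; it exposes the context's timing counters as a plain struct (with n_eval clamped to at least 1, as above). A small helper using the fields shown, assuming the struct is declared in this package's public header:

    #include <algorithm>
    #include "llama.h"

    // Sketch: derive generation speed from the fork-specific token timings struct added above.
    static double eval_tokens_per_second(const struct llama_context * ctx) {
        const struct llama_token_timings t = llama_get_token_timings(ctx);
        const double ms = std::max(t.t_eval_ms, 1e-9);  // guard against a zero elapsed time
        return 1e3 * t.n_eval / ms;
    }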