cui-llama.rn 1.1.4 → 1.1.5

package/cpp/llama.cpp CHANGED
@@ -2167,6 +2167,10 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buf
     if (host_buffer) {
         buft = lm_ggml_backend_sycl_host_buffer_type();
     }
+#elif defined(LM_GGML_USE_CANN)
+    if (host_buffer) {
+        buft = lm_ggml_backend_cann_host_buffer_type();
+    }
 #elif defined(LM_GGML_USE_CPU_HBM)
     buft = lm_ggml_backend_cpu_hbm_buffer_type();
 #elif defined(LM_GGML_USE_VULKAN)
@@ -2493,6 +2497,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
 
     enum llama_pooling_type pooling_type;
 
@@ -6668,8 +6673,6 @@ static bool llm_load_tensors(
         bool use_mlock,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
-    model.t_start_us = lm_ggml_time_us();
-
     auto & hparams = model.hparams;
 
     model.split_mode = split_mode;
@@ -8600,14 +8603,13 @@ static bool llm_load_tensors(
         }
     }
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
     return true;
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = lm_ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8669,6 +8671,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }
 
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
+
     return 0;
 }
 
@@ -9269,7 +9275,7 @@ static struct lm_ggml_tensor * llm_build_copy_mask_state(
     // FIXME: zero-out NANs?
     states = lm_ggml_mul(ctx, states, state_mask);
 
-    // copy states which won't be changed further (between n_seqs and n_rs)
+    // copy states which won't be changed further (between n_seqs and n_kv)
     lm_ggml_build_forward_expand(graph,
         lm_ggml_cpy(ctx,
             lm_ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*lm_ggml_element_size(states)),
@@ -9888,8 +9894,8 @@ struct llm_build_context {
     struct lm_ggml_cgraph * append_pooling(struct lm_ggml_cgraph * gf) {
         // find result_norm tensor for input
         struct lm_ggml_tensor * inp = nullptr;
-        for (int i = gf->n_nodes - 1; i >= 0; --i) {
-            inp = gf->nodes[i];
+        for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = lm_ggml_graph_node(gf, i);
             if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
                 break;
             } else {
@@ -15831,7 +15837,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 
             // clear unused states
             for (int i = 0; i < n_kv; ++i) {
-                uint32_t cell_id = i + kv_self.head;
+                const uint32_t cell_id = i + kv_self.head;
                 llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
 
                 data[i] = (float) (kv_cell.src >= 0);
@@ -16087,19 +16093,21 @@ static int llama_decode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
-            return -1;
-        }
-    }
-
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
     LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
 
+    if (batch_all.token) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+                return -1;
+            }
+        }
+    }
+
     LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
     LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16216,8 +16224,8 @@ static int llama_decode_internal(
         lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
 
         // the output is always the last tensor in the graph
-        struct lm_ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-        struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+        struct lm_ggml_tensor * res  = lm_ggml_graph_node(gf, -1);
+        struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);
 
         if (lctx.n_outputs == 0) {
             // no output
@@ -16226,9 +16234,9 @@ static int llama_decode_internal(
         } else if (cparams.embeddings) {
             res  = nullptr; // do not extract logits for embedding case
             embd = nullptr;
-            for (int i = gf->n_nodes - 1; i >= 0; --i) {
-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                    embd = gf->nodes[i];
+            for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                    embd = lm_ggml_graph_node(gf, i);
                     break;
                 }
             }
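
This hunk, like the ones above and below it, swaps direct `gf->nodes` / `gf->n_nodes` access for the graph accessor functions exposed by the bundled ggml. A minimal sketch of the same lookup pattern, using only the `lm_`-prefixed accessors that appear in this diff; the header path and the helper name `find_node_by_name` are assumptions, not part of the package:

```cpp
#include <cstring>
#include "ggml.h" // assumed: the package's lm_-prefixed ggml header

// Sketch: walk a built graph from the back and return the first tensor whose
// name matches, mirroring the loops rewritten in this diff. Negative indices
// (e.g. lm_ggml_graph_node(gf, -1)) address nodes from the end of the graph.
static struct lm_ggml_tensor * find_node_by_name(struct lm_ggml_cgraph * gf, const char * name) {
    for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        struct lm_ggml_tensor * node = lm_ggml_graph_node(gf, i);
        if (strcmp(node->name, name) == 0) {
            return node;
        }
    }
    return nullptr;
}
```
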
@@ -16386,19 +16394,21 @@ static int llama_encode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
-            return -1;
-        }
-    }
-
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
     LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
     // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
     LM_GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
 
@@ -16443,15 +16453,15 @@ static int llama_encode_internal(
         // there are two cases here
         if (llama_model_has_decoder(&lctx.model)) {
             // first case is an encoder-decoder T5 model where embeddings are passed to decoder
-            embd = gf->nodes[gf->n_nodes - 1];
+            embd = lm_ggml_graph_node(gf, -1);
             LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
         } else {
             // second case is an encoder-only T5 model
             if (cparams.embeddings) {
                 // only output embeddings if required
-                embd = gf->nodes[gf->n_nodes - 1];
+                embd = lm_ggml_graph_node(gf, -1);
                 if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                    embd = gf->nodes[gf->n_nodes - 2];
+                    embd = lm_ggml_graph_node(gf, -2);
                 }
                 LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
             }
@@ -17541,6 +17551,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
         quantize &= name.find("time_mix_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
 
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -17950,6 +17962,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
+        /*.no_perf             =*/ true,
         /*.abort_callback      =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -18160,6 +18173,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings   = params.embeddings;
     cparams.offload_kqv  = params.offload_kqv;
     cparams.flash_attn   = params.flash_attn;
+    cparams.no_perf      = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx        = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -18497,7 +18511,7 @@ struct llama_context * llama_new_context_with_model(
 
             // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
+            LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, lm_ggml_graph_n_nodes(gf));
             LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
         }
     }
@@ -20078,10 +20092,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
@@ -20677,6 +20695,7 @@ const char * llama_print_system_info(void) {
     s += "ARM_FMA = " + std::to_string(lm_ggml_cpu_has_arm_fma()) + " | ";
     s += "F16C = " + std::to_string(lm_ggml_cpu_has_f16c()) + " | ";
     s += "FP16_VA = " + std::to_string(lm_ggml_cpu_has_fp16_va()) + " | ";
+    s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v()) + " | ";
     s += "WASM_SIMD = " + std::to_string(lm_ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = " + std::to_string(lm_ggml_cpu_has_blas()) + " | ";
     s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | ";
@@ -20688,65 +20707,40 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                const auto * p = (const struct llama_context *) ctx;
-
-                const double t_start_ms  = 1e-3 * p->t_start_us;
-                const double t_end_ms    = 1.00 * lm_ggml_time_ms();
-                const double t_load_ms   = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms   = 1e-3 * p->t_eval_us;
-
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval   = std::max(1, p->n_eval);
-
-                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
-                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
-                LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p    = (const struct llama_sampler_chain *) smpl->ctx;
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};
 
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+    if (ctx == nullptr) {
+        return data;
+    }
 
-                const int32_t n_sampler = std::max(0, p->n_sample);
+    data.t_start_ms  = 1e-3 * ctx->t_start_us;
+    data.t_load_ms   = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms   = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval    = std::max(1, ctx->n_p_eval);
+    data.n_eval      = std::max(1, ctx->n_eval);
 
-                LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
-            } break;
-        default:
-            LM_GGML_ABORT("invalid perf type");
-    }
+    return data;
 }
 
-void llama_perf_reset(void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                auto * p = (struct llama_context *) ctx;
+void llama_perf_context_print(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);
 
-                p->t_start_us  = lm_ggml_time_us();
-                p->t_eval_us   = p->n_eval = 0;
-                p->t_p_eval_us = p->n_p_eval = 0;
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                auto * smpl = (struct llama_sampler *) ctx;
-                auto * p    = (struct llama_sampler_chain *) smpl->ctx;
+    const double t_end_ms = 1e-3 * lm_ggml_time_us();
 
-                p->t_sample_us = p->n_sample = 0;
-            } break;
-        default:
-            LM_GGML_ABORT("invalid perf type");
-    }
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+}
+
+void llama_perf_context_reset(struct llama_context * ctx) {
+    ctx->t_start_us  = lm_ggml_time_us();
+    ctx->t_eval_us   = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
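
The replacement above splits the old `llama_perf_print(ctx, type)` / `llama_perf_reset(ctx, type)` entry points into typed per-subsystem accessors. A short usage sketch against the new API declared later in `llama.h`; the helper name `report_perf` and the `printf` output format are assumptions, not part of the package:

```cpp
#include <cstdio>
#include "llama.h"

// Sketch: read and then reset the new perf counters. llama_perf_sampler() is
// only valid for samplers created via llama_sampler_chain_init().
static void report_perf(llama_context * ctx, llama_sampler * chain) {
    const llama_perf_context_data cdata = llama_perf_context(ctx);
    const llama_perf_sampler_data sdata = llama_perf_sampler(chain);

    printf("prompt eval: %.2f ms / %d tokens\n",  cdata.t_p_eval_ms, cdata.n_p_eval);
    printf("eval:        %.2f ms / %d runs\n",    cdata.t_eval_ms,   cdata.n_eval);
    printf("sampling:    %.2f ms / %d samples\n", sdata.t_sample_ms, sdata.n_sample);

    llama_perf_context_reset(ctx);   // or llama_perf_context_print(ctx) for the built-in log format
    llama_perf_sampler_reset(chain);
}
```
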
package/cpp/llama.h CHANGED
@@ -344,7 +344,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        //bool no_perf;   // whether to measure performance timings, TODO: implement
+        bool no_perf;     // whether to measure performance timings
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
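
`no_perf` is now a real field rather than a commented-out TODO, and `llama_context_default_params()` sets it to `true` (timings off). A sketch of opting back in; the wrapper name `make_timed_context` is an assumption:

```cpp
#include "llama.h"

// Sketch: timings are disabled by default in this version, so a caller that
// wants llama_perf_context() data has to clear no_perf explicitly.
static llama_context * make_timed_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = false; // keep accumulating t_eval_us / t_p_eval_us in llama_synchronize()
    return llama_new_context_with_model(model, cparams);
}
```
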
@@ -1057,6 +1057,9 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
     LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
 
+    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
+    LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
+
     // available samplers:
 
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
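
`llama_sampler_chain_remove()` transfers ownership of the removed sampler back to the caller. A minimal sketch; the helper name is an assumption:

```cpp
#include "llama.h"

// Sketch: detach the last sampler from a chain. Once removed, the chain no
// longer frees it, so the caller is responsible for releasing it.
static void drop_last_sampler(llama_sampler * chain) {
    const int n = llama_sampler_chain_n(chain);
    if (n > 0) {
        llama_sampler * removed = llama_sampler_chain_remove(chain, n - 1);
        llama_sampler_free(removed); // we own it now
    }
}
```
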
@@ -1131,15 +1134,20 @@ extern "C" {
             int32_t n_logit_bias,
             const llama_logit_bias * logit_bias);
 
-    // Shorthand for:
+
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+
+    /// @details Sample and accept a token from the idx-th output of the last evaluation
     //
+    // Shorthand for:
     // const auto * logits = llama_get_logits_ith(ctx, idx);
     // llama_token_data_array cur_p = { ... init from logits ... };
     // llama_sampler_apply(smpl, &cur_p);
-    // return cur_p.data[cur_p.selected].id;
-    //
-    // At this point, this is mostly a convenience function.
-    //
+    // auto token = cur_p.data[cur_p.selected].id;
+    // llama_sampler_accept(smpl, token);
+    // return token;
+    // Returns the sampled token
     LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
 
     // TODO: extend in the future
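
The revised comment spells out that `llama_sampler_sample()` now also accepts the token it returns, and `llama_sampler_get_seed()` exposes the seed actually in use. A sketch of the typical call; the helper name and the logging are assumptions:

```cpp
#include <cstdio>
#include "llama.h"

// Sketch: sample from the idx-th output of the last llama_decode() call using
// the documented shorthand; the accept step now happens inside the call.
static llama_token sample_at(llama_sampler * smpl, llama_context * ctx, int32_t idx) {
    printf("sampler seed: %u\n", llama_sampler_get_seed(smpl)); // LLAMA_DEFAULT_SEED if not applicable
    return llama_sampler_sample(smpl, ctx, idx);
}
```
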
@@ -1172,13 +1180,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
-    enum llama_perf_type {
-        LLAMA_PERF_TYPE_CONTEXT       = 0,
-        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    struct llama_perf_context_data {
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
     };
 
-    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+    struct llama_perf_sampler_data {
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
+
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
package/cpp/rn-llama.hpp CHANGED
@@ -7,6 +7,7 @@
 #include "llama.h"
 
 #include <android/log.h>
+#include "sampling.h"
 #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
 #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
@@ -334,7 +335,7 @@ struct llama_rn_context
     {
         // number of tokens to keep when resetting context
         n_remain = params.n_predict;
-        llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(ctx);
         is_predicting = true;
     }
 
package/cpp/sampling.cpp CHANGED
@@ -2,6 +2,9 @@
 
 #include "common.h"
 
+#include <cmath>
+#include <unordered_map>
+
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
 template<typename T>
@@ -139,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;
 
     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -257,10 +260,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance
 
     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx);
     }
 }
 
@@ -310,6 +313,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return cur_p.data[cur_p.selected].id;
 }
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
 // helpers
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
@@ -432,7 +439,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
 }
 
 std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),     GPT_SAMPLER_TYPE_TOP_K },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),     GPT_SAMPLER_TYPE_TFS_Z },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
package/cpp/sampling.h CHANGED
@@ -2,65 +2,11 @@
 
 #include "llama.h"
 
+#include "common.h"
+
 #include <string>
 #include <vector>
 
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-    GPT_SAMPLER_TYPE_XTC         = 7,
-};
-
-// sampling parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev = 64;                // number of previous tokens to remember
-    int32_t n_probs = 0;                // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep = 0;               // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k = 40;                 // <= 0 to use vocab size
-    float top_p = 0.95f;                // 1.0 = disabled
-    float min_p = 0.05f;                // 0.0 = disabled
-    float tfs_z = 1.00f;                // 1.0 = disabled
-    float xtc_t = 0.0f;                 // 0.0 = disabled
-    float xtc_p = 0.0f;
-    float typ_p = 1.00f;                // typical_p, 1.0 = disabled
-    float temp = 0.80f;                 // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float dynatemp_range = 0.00f;       // 0.0 = disabled
-    float dynatemp_exponent = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n = 64;        // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float penalty_repeat = 1.00f;       // 1.0 = disabled
-    float penalty_freq = 0.00f;         // 0.0 = disabled
-    float penalty_present = 0.00f;      // 0.0 = disabled
-    int32_t mirostat = 0;               // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float mirostat_tau = 5.00f;         // target entropy
-    float mirostat_eta = 0.10f;         // learning rate
-    bool penalize_nl = false;           // consider newlines as a repeatable token
-    bool ignore_eos = false;
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_XTC,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
-};
-
 // gpt_sampler extends llama_sampler with additional functionality:
 //
 // - grammar support
@@ -114,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
 // helpers
 
 // access the internal list of current candidate tokens
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "cui-llama.rn",
-  "version": "1.1.4",
+  "version": "1.1.5",
   "description": "Fork of llama.rn for ChatterUI",
   "main": "lib/commonjs/index",
   "module": "lib/module/index",