cui-llama.rn 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +2 -2
- package/cpp/common.cpp +35 -1946
- package/cpp/common.h +91 -128
- package/cpp/ggml-impl.h +32 -0
- package/cpp/ggml-metal.m +5 -6
- package/cpp/ggml-quants.c +242 -48
- package/cpp/ggml.c +89 -35
- package/cpp/ggml.h +25 -63
- package/cpp/llama-sampling.cpp +218 -94
- package/cpp/llama.cpp +80 -86
- package/cpp/llama.h +36 -11
- package/cpp/rn-llama.hpp +2 -1
- package/cpp/sampling.cpp +11 -4
- package/cpp/sampling.h +4 -56
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -2167,6 +2167,10 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buf
     if (host_buffer) {
         buft = lm_ggml_backend_sycl_host_buffer_type();
     }
+#elif defined(LM_GGML_USE_CANN)
+    if (host_buffer) {
+        buft = lm_ggml_backend_cann_host_buffer_type();
+    }
 #elif defined(LM_GGML_USE_CPU_HBM)
     buft = lm_ggml_backend_cpu_hbm_buffer_type();
 #elif defined(LM_GGML_USE_VULKAN)
@@ -2493,6 +2497,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
 
     enum llama_pooling_type pooling_type;
 
@@ -6668,8 +6673,6 @@ static bool llm_load_tensors(
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
-    model.t_start_us = lm_ggml_time_us();
-
     auto & hparams = model.hparams;
 
     model.split_mode = split_mode;
@@ -8600,14 +8603,13 @@ static bool llm_load_tensors(
         }
     }
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
     return true;
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = lm_ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8669,6 +8671,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }
 
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
+
     return 0;
 }
 
@@ -9269,7 +9275,7 @@ static struct lm_ggml_tensor * llm_build_copy_mask_state(
     // FIXME: zero-out NANs?
     states = lm_ggml_mul(ctx, states, state_mask);
 
-    // copy states which won't be changed further (between n_seqs and
+    // copy states which won't be changed further (between n_seqs and n_kv)
     lm_ggml_build_forward_expand(graph,
         lm_ggml_cpy(ctx,
             lm_ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*lm_ggml_element_size(states)),
@@ -9888,8 +9894,8 @@ struct llm_build_context {
     struct lm_ggml_cgraph * append_pooling(struct lm_ggml_cgraph * gf) {
         // find result_norm tensor for input
         struct lm_ggml_tensor * inp = nullptr;
-        for (int i = gf->n_nodes - 1; i >= 0; --i) {
-            inp = gf->nodes[i];
+        for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = lm_ggml_graph_node(gf, i);
             if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
                 break;
             } else {
|
|
15831
15837
|
|
15832
15838
|
// clear unused states
|
15833
15839
|
for (int i = 0; i < n_kv; ++i) {
|
15834
|
-
uint32_t
|
15840
|
+
const uint32_t cell_id = i + kv_self.head;
|
15835
15841
|
llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
|
15836
15842
|
|
15837
15843
|
data[i] = (float) (kv_cell.src >= 0);
|
@@ -16087,19 +16093,21 @@ static int llama_decode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
-            return -1;
-        }
-    }
-
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
     LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
 
+    if (batch_all.token) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+                return -1;
+            }
+        }
+    }
+
     LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
     LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16216,8 +16224,8 @@ static int llama_decode_internal(
         lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
 
         // the output is always the last tensor in the graph
-        struct lm_ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-        struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+        struct lm_ggml_tensor * res  = lm_ggml_graph_node(gf, -1);
+        struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);
 
         if (lctx.n_outputs == 0) {
             // no output
@@ -16226,9 +16234,9 @@ static int llama_decode_internal(
         } else if (cparams.embeddings) {
             res  = nullptr; // do not extract logits for embedding case
            embd = nullptr;
-            for (int i = gf->n_nodes - 1; i >= 0; --i) {
-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                    embd = gf->nodes[i];
+            for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                    embd = lm_ggml_graph_node(gf, i);
                     break;
                 }
             }
@@ -16386,19 +16394,21 @@ static int llama_encode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
-            return -1;
-        }
-    }
-
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
     LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
     // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
     LM_GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
 
@@ -16443,15 +16453,15 @@ static int llama_encode_internal(
         // there are two cases here
         if (llama_model_has_decoder(&lctx.model)) {
             // first case is an encoder-decoder T5 model where embeddings are passed to decoder
-            embd = gf->nodes[gf->n_nodes - 1];
+            embd = lm_ggml_graph_node(gf, -1);
             LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
         } else {
             // second case is an encoder-only T5 model
             if (cparams.embeddings) {
                 // only output embeddings if required
-                embd = gf->nodes[gf->n_nodes - 1];
+                embd = lm_ggml_graph_node(gf, -1);
                 if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                    embd = gf->nodes[gf->n_nodes - 2];
+                    embd = lm_ggml_graph_node(gf, -2);
                 }
                 LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
             }
@@ -17541,6 +17551,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
         quantize &= name.find("time_mix_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
 
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -17950,6 +17962,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings =*/ false,
         /*.offload_kqv =*/ true,
         /*.flash_attn =*/ false,
+        /*.no_perf =*/ true,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -18160,6 +18173,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings   = params.embeddings;
     cparams.offload_kqv  = params.offload_kqv;
     cparams.flash_attn   = params.flash_attn;
+    cparams.no_perf      = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx        = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -18497,7 +18511,7 @@ struct llama_context * llama_new_context_with_model(
 
             // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
+            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, lm_ggml_graph_n_nodes(gf));
             LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
         }
     }
@@ -20078,10 +20092,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
@@ -20677,6 +20695,7 @@ const char * llama_print_system_info(void) {
     s += "ARM_FMA = "    + std::to_string(lm_ggml_cpu_has_arm_fma())    + " | ";
     s += "F16C = "       + std::to_string(lm_ggml_cpu_has_f16c())       + " | ";
     s += "FP16_VA = "    + std::to_string(lm_ggml_cpu_has_fp16_va())    + " | ";
+    s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v())    + " | ";
     s += "WASM_SIMD = "  + std::to_string(lm_ggml_cpu_has_wasm_simd())  + " | ";
     s += "BLAS = "       + std::to_string(lm_ggml_cpu_has_blas())       + " | ";
     s += "SSE3 = "       + std::to_string(lm_ggml_cpu_has_sse3())       + " | ";
@@ -20688,65 +20707,40 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                const auto * p = (const struct llama_context *) ctx;
-
-                const double t_start_ms  = 1e-3 * p->t_start_us;
-                const double t_end_ms    = 1.00 * lm_ggml_time_ms();
-                const double t_load_ms   = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms   = 1e-3 * p->t_eval_us;
-
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval   = std::max(1, p->n_eval);
-
-                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
-                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
-                LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p    = (const struct llama_sampler_chain *) smpl->ctx;
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};
 
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+    if (ctx == nullptr) {
+        return data;
+    }
 
-                const int32_t n_sampler = std::max(0, p->n_sample);
+    data.t_start_ms  = 1e-3 * ctx->t_start_us;
+    data.t_load_ms   = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms   = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval    = std::max(1, ctx->n_p_eval);
+    data.n_eval      = std::max(1, ctx->n_eval);
 
-                LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
-            } break;
-        default:
-            LM_GGML_ABORT("invalid perf type");
-    }
+    return data;
 }
 
-void llama_perf_reset(void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                auto * p = (struct llama_context *) ctx;
+void llama_perf_context_print(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);
 
-                p->t_start_us  = lm_ggml_time_us();
-                p->t_eval_us   = p->n_eval = 0;
-                p->t_p_eval_us = p->n_p_eval = 0;
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                auto * smpl = (struct llama_sampler *) ctx;
-                auto * p    = (struct llama_sampler_chain *) smpl->ctx;
+    const double t_end_ms = 1e-3 * lm_ggml_time_us();
 
-                p->t_sample_us = p->n_sample = 0;
-            } break;
-        default:
-            LM_GGML_ABORT("invalid perf type");
-    }
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+}
+
+void llama_perf_context_reset(struct llama_context * ctx) {
+    ctx->t_start_us  = lm_ggml_time_us();
+    ctx->t_eval_us   = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
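The hunk above replaces the type-switched llama_perf_print / llama_perf_reset pair with per-object helpers. A minimal sketch of how the new context-side API is meant to be driven from caller code, assuming an already created llama_context * ctx (the variable name is illustrative; the functions and fields are the ones added above, and fprintf needs <cstdio>):

    struct llama_perf_context_data pd = llama_perf_context(ctx);                 // snapshot the counters
    fprintf(stderr, "prompt eval: %.2f ms over %d tokens\n", pd.t_p_eval_ms, pd.n_p_eval);
    llama_perf_context_print(ctx);                                               // or let llama.cpp format the full report
    llama_perf_context_reset(ctx);                                               // zero the counters before the next run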
package/cpp/llama.h
CHANGED
@@ -344,7 +344,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-
+        bool no_perf;     // whether to measure performance timings
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
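Because llama_context_default_params() initializes no_perf to true (see the llama.cpp hunks above), timing collection is now opt-in. A hedged sketch of enabling it at context creation, assuming a loaded llama_model * model:

    llama_context_params cp = llama_context_default_params();
    cp.no_perf = false;   // collect t_eval_us / t_p_eval_us during llama_synchronize
    llama_context * ctx = llama_new_context_with_model(model, cp);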
@@ -1057,6 +1057,9 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
     LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
 
+    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
+    LLAMA_API struct llama_sampler * llama_sampler_chain_remove(   struct llama_sampler * chain, int32_t i);
+
     // available samplers:
 
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
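As the new comment states, llama_sampler_chain_remove transfers ownership to the caller. A small sketch of detaching and disposing of the i-th link yourself; chain and i are assumed to exist, and llama_sampler_free is assumed to be the usual sampler destructor in this header:

    struct llama_sampler * detached = llama_sampler_chain_remove(chain, i);
    // the chain will no longer free it, so the caller must
    llama_sampler_free(detached);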
@@ -1131,15 +1134,20 @@ extern "C" {
                              int32_t   n_logit_bias,
               const llama_logit_bias * logit_bias);
 
-
+
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+
+    /// @details Sample and accept a token from the idx-th output of the last evaluation
     //
+    // Shorthand for:
     //    const auto * logits = llama_get_logits_ith(ctx, idx);
     //    llama_token_data_array cur_p = { ... init from logits ... };
     //    llama_sampler_apply(smpl, &cur_p);
-    //
-    //
-    //
-    //
+    //    auto token = cur_p.data[cur_p.selected].id;
+    //    llama_sampler_accept(smpl, token);
+    //    return token;
+    // Returns the sampled token
     LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
 
     // TODO: extend in the future
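A sketch of the documented shorthand from caller code, assuming a sampler chain smpl and a context ctx that has just been decoded; passing idx = -1 refers to the last output, as elsewhere in this header:

    const uint32_t seed = llama_sampler_get_seed(smpl);          // LLAMA_DEFAULT_SEED if no seeded sampler is present
    const llama_token tok = llama_sampler_sample(smpl, ctx, -1); // samples and accepts in one call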
@@ -1172,13 +1180,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
-    enum llama_perf_type {
-        LLAMA_PERF_TYPE_CONTEXT       = 0,
-        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    struct llama_perf_context_data {
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
     };
 
-    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+    struct llama_perf_sampler_data {
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
+
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
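The sampler-side helpers mirror the context ones but, per the NOTE above, are only valid for samplers built with llama_sampler_chain_init. A hedged sketch, with chain assumed to be such a chain (fprintf needs <cstdio>):

    struct llama_perf_sampler_data sd = llama_perf_sampler(chain);
    fprintf(stderr, "sampling: %.2f ms over %d samples\n", sd.t_sample_ms, sd.n_sample);
    llama_perf_sampler_reset(chain);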
package/cpp/rn-llama.hpp
CHANGED
@@ -7,6 +7,7 @@
 #include "llama.h"
 
 #include <android/log.h>
+#include "sampling.h"
 #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
 #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
@@ -334,7 +335,7 @@ struct llama_rn_context
     {
         // number of tokens to keep when resetting context
         n_remain = params.n_predict;
-        llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(ctx);
         is_predicting = true;
     }
 
package/cpp/sampling.cpp
CHANGED
@@ -2,6 +2,9 @@
 
 #include "common.h"
 
+#include <cmath>
+#include <unordered_map>
+
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
 template<typename T>
@@ -139,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;
 
     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -257,10 +260,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance
 
     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx);
     }
 }
 
@@ -310,6 +313,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return cur_p.data[cur_p.selected].id;
 }
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
 // helpers
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
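gpt_sampler_get_seed simply forwards to llama_sampler_get_seed on the wrapped chain, which is useful for logging the effective seed when gpt_sampler_params leaves it at LLAMA_DEFAULT_SEED. A sketch, with model and sparams (a gpt_sampler_params instance) assumed to exist:

    struct gpt_sampler * gsmpl = gpt_sampler_init(model, sparams);
    const uint32_t seed = gpt_sampler_get_seed(gsmpl); // seed actually used by the underlying chain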
@@ -432,7 +439,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
 }
 
 std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),     GPT_SAMPLER_TYPE_TOP_K },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),     GPT_SAMPLER_TYPE_TFS_Z },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
package/cpp/sampling.h
CHANGED
@@ -2,65 +2,11 @@
 
 #include "llama.h"
 
+#include "common.h"
+
 #include <string>
 #include <vector>
 
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-    GPT_SAMPLER_TYPE_XTC         = 7,
-};
-
-// sampling parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   xtc_t             = 0.0f;  // 0.0 = disabled
-    float   xtc_p             = 0.0f;
-    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
-    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range    = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.00f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = false; // consider newlines as a repeatable token
-    bool    ignore_eos        = false;
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_XTC,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
-};
-
 // gpt_sampler extends llama_sampler with additional functionality:
 //
 // - grammar support
@@ -114,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
 // helpers
 
 // access the internal list of current candidate tokens