@fugood/llama.node 1.4.15 → 1.6.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +1 -5
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +76 -61
- package/src/LlamaContext.cpp +20 -32
- package/src/llama.cpp/common/CMakeLists.txt +12 -0
- package/src/llama.cpp/common/arg.cpp +20 -0
- package/src/llama.cpp/common/chat-parser.cpp +3 -3
- package/src/llama.cpp/common/chat-parser.h +4 -4
- package/src/llama.cpp/common/chat.cpp +289 -34
- package/src/llama.cpp/common/chat.h +32 -20
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +31 -25
- package/src/llama.cpp/common/download.cpp +19 -14
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/json-partial.h +1 -0
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +5 -1
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
|
@@ -1513,12 +1513,9 @@ static void llama_sampler_top_p_backend_apply(
|
|
|
1513
1513
|
mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
|
|
1514
1514
|
mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
|
|
1515
1515
|
|
|
1516
|
-
//
|
|
1517
|
-
//
|
|
1518
|
-
|
|
1519
|
-
// others will be 0 (meaning that will not effect the logits).
|
|
1520
|
-
const float large_val = 1e9f;
|
|
1521
|
-
struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
|
|
1516
|
+
// Apply -INFINITY bias for masked-out tokens
|
|
1517
|
+
// log(1) = 0 (keep), log(0) = -INF (discard)
|
|
1518
|
+
struct ggml_tensor * top_p_bias = ggml_log(ctx, mask);
|
|
1522
1519
|
ggml_set_name(top_p_bias, "top_p_bias");
|
|
1523
1520
|
|
|
1524
1521
|
data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
|
|
@@ -1673,15 +1670,11 @@ static void llama_sampler_min_p_backend_apply(
|
|
|
1673
1670
|
struct ggml_tensor * mask = ggml_step(ctx, sub);
|
|
1674
1671
|
ggml_set_name(mask, "min_p_mask");
|
|
1675
1672
|
|
|
1676
|
-
//
|
|
1677
|
-
//
|
|
1678
|
-
|
|
1679
|
-
// others will be 0 (meaning that will not effect the logits).
|
|
1680
|
-
const float large_val = 1e9f;
|
|
1681
|
-
struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
|
|
1673
|
+
// Apply -INFINITY bias for masked-out tokens
|
|
1674
|
+
// log(1) = 0 (keep), log(0) = -INF (discard)
|
|
1675
|
+
struct ggml_tensor * min_p_bias = ggml_log(ctx, mask);
|
|
1682
1676
|
ggml_set_name(min_p_bias, "min_p_bias");
|
|
1683
1677
|
|
|
1684
|
-
// Add the min_p bias to the logits.
|
|
1685
1678
|
data->logits = ggml_add(ctx, data->logits, min_p_bias);
|
|
1686
1679
|
ggml_set_name(data->logits, "min_p_logits");
|
|
1687
1680
|
|
|
@@ -3293,6 +3286,170 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
|
|
|
3293
3286
|
return result;
|
|
3294
3287
|
}
|
|
3295
3288
|
|
|
3289
|
+
// adaptive-p sampler state
|
|
3290
|
+
//
|
|
3291
|
+
// maintains an exponential moving average of the *ORIGINAL* probabilities
|
|
3292
|
+
// of selected tokens, used to compute an adapted target at each sampling step.
|
|
3293
|
+
//
|
|
3294
|
+
// see llama.h for a full description of the sampler
|
|
3295
|
+
//
|
|
3296
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/17927
|
|
3297
|
+
//
|
|
3298
|
+
struct llama_sampler_adaptive_p {
|
|
3299
|
+
const float target; // target probability (0.0 - 1.0; negative = disabled)
|
|
3300
|
+
const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99)
|
|
3301
|
+
const uint32_t seed; // original RNG seed
|
|
3302
|
+
uint32_t seed_cur; // actual RNG seed
|
|
3303
|
+
std::mt19937 rng; // RNG state
|
|
3304
|
+
float weighted_sum; // sum(p_i * decay^i)
|
|
3305
|
+
float total_weight; // sum(decay^i), converges to 1/(1-decay)
|
|
3306
|
+
std::vector<float> original_probs; // pre-transform probs, cached for EMA update
|
|
3307
|
+
llama_token pending_token_id; // token ID of selected token
|
|
3308
|
+
int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs
|
|
3309
|
+
};
|
|
3310
|
+
|
|
3311
|
+
// adaptive probability transformation constants
|
|
3312
|
+
static constexpr float DISTRIBUTION_WIDTH = 0.3f;
|
|
3313
|
+
static constexpr float PEAK_LOGIT_VALUE = 5.0f;
|
|
3314
|
+
static constexpr float SHARPNESS = 10.0f;
|
|
3315
|
+
static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH;
|
|
3316
|
+
|
|
3317
|
+
static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
|
|
3318
|
+
return "adaptive-p";
|
|
3319
|
+
}
|
|
3320
|
+
|
|
3321
|
+
static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
3322
|
+
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
|
|
3323
|
+
|
|
3324
|
+
llama_sampler_softmax_impl(cur_p, false);
|
|
3325
|
+
|
|
3326
|
+
if (ctx->target < 0.0f) {
|
|
3327
|
+
// at negative target values, adaptive-p is no-op
|
|
3328
|
+
// we simply sample from the existing distribution
|
|
3329
|
+
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
|
|
3330
|
+
return;
|
|
3331
|
+
}
|
|
3332
|
+
|
|
3333
|
+
// store the original probabilities
|
|
3334
|
+
ctx->original_probs.resize(cur_p->size);
|
|
3335
|
+
for (size_t i = 0; i < cur_p->size; ++i) {
|
|
3336
|
+
ctx->original_probs[i] = cur_p->data[i].p;
|
|
3337
|
+
}
|
|
3338
|
+
|
|
3339
|
+
// using the EMA, compute the adapted target probability for the current sampling step
|
|
3340
|
+
auto target = std::clamp(ctx->target, 0.0f, 1.0f);
|
|
3341
|
+
float adapted_target = std::clamp(
|
|
3342
|
+
ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
|
|
3343
|
+
0.0f, 1.0f
|
|
3344
|
+
);
|
|
3345
|
+
|
|
3346
|
+
// adaptive probability transform
|
|
3347
|
+
//
|
|
3348
|
+
// quadratic near target for fine differentiation, transitioning to linear decay in the
|
|
3349
|
+
// tails. unbounded negative logits ensure proper suppression of far-from-target tokens
|
|
3350
|
+
// after the softmax.
|
|
3351
|
+
//
|
|
3352
|
+
for (size_t i = 0; i < cur_p->size; ++i) {
|
|
3353
|
+
if (cur_p->data[i].logit == -INFINITY) {
|
|
3354
|
+
// don't transform logits that are -INFINITY
|
|
3355
|
+
// (as masked out by e.g. min-p and top-p when using backend sampling)
|
|
3356
|
+
continue;
|
|
3357
|
+
}
|
|
3358
|
+
float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
|
|
3359
|
+
cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
|
|
3360
|
+
}
|
|
3361
|
+
|
|
3362
|
+
// softmax and sample from the transformed distribution
|
|
3363
|
+
llama_sampler_softmax_impl(cur_p, false);
|
|
3364
|
+
const int idx = llama_sample_dist(cur_p, ctx->rng);
|
|
3365
|
+
cur_p->selected = idx;
|
|
3366
|
+
|
|
3367
|
+
// store the selected token ID for acceptance later
|
|
3368
|
+
ctx->pending_token_id = cur_p->data[idx].id;
|
|
3369
|
+
ctx->pending_token_idx = idx;
|
|
3370
|
+
}
|
|
3371
|
+
|
|
3372
|
+
static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) {
|
|
3373
|
+
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
|
|
3374
|
+
if (ctx->pending_token_id == token) {
|
|
3375
|
+
GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL);
|
|
3376
|
+
GGML_ASSERT(ctx->pending_token_idx != -1);
|
|
3377
|
+
// update EMA with the original probability of the selected token
|
|
3378
|
+
ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum;
|
|
3379
|
+
ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
|
|
3380
|
+
}
|
|
3381
|
+
ctx->pending_token_id = LLAMA_TOKEN_NULL;
|
|
3382
|
+
ctx->pending_token_idx = -1;
|
|
3383
|
+
}
|
|
3384
|
+
|
|
3385
|
+
static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
|
|
3386
|
+
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
|
|
3387
|
+
// ctx->target and ctx->decay never change after init, so it's safe to keep them as is.
|
|
3388
|
+
// original_probs is completely overwritten on every call to _apply.
|
|
3389
|
+
// so we only need to reset the EMA state and pending token.
|
|
3390
|
+
ctx->weighted_sum = ctx->target / (1.0f - ctx->decay);
|
|
3391
|
+
ctx->total_weight = 1.0f / (1.0f - ctx->decay);
|
|
3392
|
+
ctx->pending_token_id = LLAMA_TOKEN_NULL;
|
|
3393
|
+
ctx->pending_token_idx = -1;
|
|
3394
|
+
ctx->seed_cur = get_rng_seed(ctx->seed);
|
|
3395
|
+
ctx->rng.seed(ctx->seed_cur);
|
|
3396
|
+
}
|
|
3397
|
+
|
|
3398
|
+
static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
|
|
3399
|
+
const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
|
|
3400
|
+
auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
|
|
3401
|
+
auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
|
|
3402
|
+
|
|
3403
|
+
// copy everything (target, decay, seed, and RNG are already set)
|
|
3404
|
+
result_ctx->weighted_sum = ctx->weighted_sum;
|
|
3405
|
+
result_ctx->total_weight = ctx->total_weight;
|
|
3406
|
+
result_ctx->pending_token_id = ctx->pending_token_id;
|
|
3407
|
+
result_ctx->pending_token_idx = ctx->pending_token_idx;
|
|
3408
|
+
|
|
3409
|
+
return result;
|
|
3410
|
+
}
|
|
3411
|
+
|
|
3412
|
+
static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
|
|
3413
|
+
delete (llama_sampler_adaptive_p *) smpl->ctx;
|
|
3414
|
+
}
|
|
3415
|
+
|
|
3416
|
+
static struct llama_sampler_i llama_sampler_adaptive_p_i = {
|
|
3417
|
+
/* .name = */ llama_sampler_adaptive_p_name,
|
|
3418
|
+
/* .accept = */ llama_sampler_adaptive_p_accept,
|
|
3419
|
+
/* .apply = */ llama_sampler_adaptive_p_apply,
|
|
3420
|
+
/* .reset = */ llama_sampler_adaptive_p_reset,
|
|
3421
|
+
/* .clone = */ llama_sampler_adaptive_p_clone,
|
|
3422
|
+
/* .free = */ llama_sampler_adaptive_p_free,
|
|
3423
|
+
/* .backend_init = */ nullptr,
|
|
3424
|
+
/* .backend_accept = */ nullptr,
|
|
3425
|
+
/* .backend_apply = */ nullptr,
|
|
3426
|
+
/* .backend_set_input = */ nullptr,
|
|
3427
|
+
};
|
|
3428
|
+
|
|
3429
|
+
struct llama_sampler * llama_sampler_init_adaptive_p(
|
|
3430
|
+
float target,
|
|
3431
|
+
float decay,
|
|
3432
|
+
uint32_t seed
|
|
3433
|
+
) {
|
|
3434
|
+
auto seed_cur = get_rng_seed(seed);
|
|
3435
|
+
float clamped_decay = std::clamp(decay, 0.0f, 0.99f);
|
|
3436
|
+
return llama_sampler_init(
|
|
3437
|
+
/* .iface = */ &llama_sampler_adaptive_p_i,
|
|
3438
|
+
/* .ctx = */ new llama_sampler_adaptive_p {
|
|
3439
|
+
/* .target = */ target,
|
|
3440
|
+
/* .decay = */ clamped_decay,
|
|
3441
|
+
/* .seed = */ seed,
|
|
3442
|
+
/* .seed_cur = */ seed_cur,
|
|
3443
|
+
/* .rng = */ std::mt19937(seed_cur),
|
|
3444
|
+
/* .weighted_sum = */ target / (1.0f - clamped_decay),
|
|
3445
|
+
/* .total_weight = */ 1.0f / (1.0f - clamped_decay),
|
|
3446
|
+
/* .original_probs = */ {},
|
|
3447
|
+
/* .pending_token_id = */ LLAMA_TOKEN_NULL,
|
|
3448
|
+
/* .pending_token_idx = */ -1
|
|
3449
|
+
}
|
|
3450
|
+
);
|
|
3451
|
+
}
|
|
3452
|
+
|
|
3296
3453
|
// logit-bias
|
|
3297
3454
|
|
|
3298
3455
|
struct llama_sampler_logit_bias : public llama_sampler_backend {
|