@fugood/llama.node 1.4.15 → 1.6.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/lib/binding.ts +1 -5
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +2 -2
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +76 -61
  6. package/src/LlamaContext.cpp +20 -32
  7. package/src/llama.cpp/common/CMakeLists.txt +12 -0
  8. package/src/llama.cpp/common/arg.cpp +20 -0
  9. package/src/llama.cpp/common/chat-parser.cpp +3 -3
  10. package/src/llama.cpp/common/chat-parser.h +4 -4
  11. package/src/llama.cpp/common/chat.cpp +289 -34
  12. package/src/llama.cpp/common/chat.h +32 -20
  13. package/src/llama.cpp/common/common.cpp +0 -1
  14. package/src/llama.cpp/common/common.h +31 -25
  15. package/src/llama.cpp/common/download.cpp +19 -14
  16. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  17. package/src/llama.cpp/common/jinja/caps.h +24 -0
  18. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  19. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  20. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  21. package/src/llama.cpp/common/jinja/parser.h +21 -0
  22. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  23. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  24. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  25. package/src/llama.cpp/common/jinja/string.h +58 -0
  26. package/src/llama.cpp/common/jinja/utils.h +49 -0
  27. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  28. package/src/llama.cpp/common/jinja/value.h +464 -0
  29. package/src/llama.cpp/common/json-partial.h +1 -0
  30. package/src/llama.cpp/common/sampling.cpp +52 -19
  31. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  34. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  36. package/src/llama.cpp/include/llama-cpp.h +3 -1
  37. package/src/llama.cpp/include/llama.h +29 -2
  38. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  39. package/src/llama.cpp/src/llama-adapter.h +1 -3
  40. package/src/llama.cpp/src/llama-context.cpp +232 -144
  41. package/src/llama.cpp/src/llama-context.h +10 -0
  42. package/src/llama.cpp/src/llama-cparams.h +2 -0
  43. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  44. package/src/llama.cpp/src/llama-hparams.h +38 -1
  45. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  46. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  47. package/src/llama.cpp/src/llama-mmap.cpp +5 -1
  48. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  49. package/src/llama.cpp/src/llama-model.cpp +5 -1
  50. package/src/llama.cpp/src/llama-model.h +3 -2
  51. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
@@ -1513,12 +1513,9 @@ static void llama_sampler_top_p_backend_apply(
1513
1513
  mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
1514
1514
  mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
1515
1515
 
1516
- // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
1517
- // top_p_bias = (mask * 1e9f) - 1e9f.
1518
- // So entries in the mask that we want to discard will become -1e9f, and
1519
- // others will be 0 (meaning that will not effect the logits).
1520
- const float large_val = 1e9f;
1521
- struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
1516
+ // Apply -INFINITY bias for masked-out tokens
1517
+ // log(1) = 0 (keep), log(0) = -INF (discard)
1518
+ struct ggml_tensor * top_p_bias = ggml_log(ctx, mask);
1522
1519
  ggml_set_name(top_p_bias, "top_p_bias");
1523
1520
 
1524
1521
  data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
@@ -1673,15 +1670,11 @@ static void llama_sampler_min_p_backend_apply(
1673
1670
  struct ggml_tensor * mask = ggml_step(ctx, sub);
1674
1671
  ggml_set_name(mask, "min_p_mask");
1675
1672
 
1676
- // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
1677
- // min_p_bias = (mask * 1e9f) - 1e9f.
1678
- // So entries in the mask that we want to discard will become -1e9f, and
1679
- // others will be 0 (meaning that will not effect the logits).
1680
- const float large_val = 1e9f;
1681
- struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
1673
+ // Apply -INFINITY bias for masked-out tokens
1674
+ // log(1) = 0 (keep), log(0) = -INF (discard)
1675
+ struct ggml_tensor * min_p_bias = ggml_log(ctx, mask);
1682
1676
  ggml_set_name(min_p_bias, "min_p_bias");
1683
1677
 
1684
- // Add the min_p bias to the logits.
1685
1678
  data->logits = ggml_add(ctx, data->logits, min_p_bias);
1686
1679
  ggml_set_name(data->logits, "min_p_logits");
1687
1680
 
@@ -3293,6 +3286,170 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
3293
3286
  return result;
3294
3287
  }
3295
3288
 
3289
+ // adaptive-p sampler state
3290
+ //
3291
+ // maintains an exponential moving average of the *ORIGINAL* probabilities
3292
+ // of selected tokens, used to compute an adapted target at each sampling step.
3293
+ //
3294
+ // see llama.h for a full description of the sampler
3295
+ //
3296
+ // ref: https://github.com/ggml-org/llama.cpp/pull/17927
3297
+ //
3298
+ struct llama_sampler_adaptive_p {
3299
+ const float target; // target probability (0.0 - 1.0; negative = disabled)
3300
+ const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99)
3301
+ const uint32_t seed; // original RNG seed
3302
+ uint32_t seed_cur; // actual RNG seed
3303
+ std::mt19937 rng; // RNG state
3304
+ float weighted_sum; // sum(p_i * decay^i)
3305
+ float total_weight; // sum(decay^i), converges to 1/(1-decay)
3306
+ std::vector<float> original_probs; // pre-transform probs, cached for EMA update
3307
+ llama_token pending_token_id; // token ID of selected token
3308
+ int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs
3309
+ };
3310
+
3311
+ // adaptive probability transformation constants
3312
+ static constexpr float DISTRIBUTION_WIDTH = 0.3f;
3313
+ static constexpr float PEAK_LOGIT_VALUE = 5.0f;
3314
+ static constexpr float SHARPNESS = 10.0f;
3315
+ static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH;
3316
+
3317
+ static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
3318
+ return "adaptive-p";
3319
+ }
3320
+
3321
+ static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
3322
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
3323
+
3324
+ llama_sampler_softmax_impl(cur_p, false);
3325
+
3326
+ if (ctx->target < 0.0f) {
3327
+ // at negative target values, adaptive-p is no-op
3328
+ // we simply sample from the existing distribution
3329
+ cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
3330
+ return;
3331
+ }
3332
+
3333
+ // store the original probabilities
3334
+ ctx->original_probs.resize(cur_p->size);
3335
+ for (size_t i = 0; i < cur_p->size; ++i) {
3336
+ ctx->original_probs[i] = cur_p->data[i].p;
3337
+ }
3338
+
3339
+ // using the EMA, compute the adapted target probability for the current sampling step
3340
+ auto target = std::clamp(ctx->target, 0.0f, 1.0f);
3341
+ float adapted_target = std::clamp(
3342
+ ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
3343
+ 0.0f, 1.0f
3344
+ );
3345
+
3346
+ // adaptive probability transform
3347
+ //
3348
+ // quadratic near target for fine differentiation, transitioning to linear decay in the
3349
+ // tails. unbounded negative logits ensure proper suppression of far-from-target tokens
3350
+ // after the softmax.
3351
+ //
3352
+ for (size_t i = 0; i < cur_p->size; ++i) {
3353
+ if (cur_p->data[i].logit == -INFINITY) {
3354
+ // don't transform logits that are -INFINITY
3355
+ // (as masked out by e.g. min-p and top-p when using backend sampling)
3356
+ continue;
3357
+ }
3358
+ float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
3359
+ cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
3360
+ }
3361
+
3362
+ // softmax and sample from the transformed distribution
3363
+ llama_sampler_softmax_impl(cur_p, false);
3364
+ const int idx = llama_sample_dist(cur_p, ctx->rng);
3365
+ cur_p->selected = idx;
3366
+
3367
+ // store the selected token ID for acceptance later
3368
+ ctx->pending_token_id = cur_p->data[idx].id;
3369
+ ctx->pending_token_idx = idx;
3370
+ }
3371
+
3372
+ static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) {
3373
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
3374
+ if (ctx->pending_token_id == token) {
3375
+ GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL);
3376
+ GGML_ASSERT(ctx->pending_token_idx != -1);
3377
+ // update EMA with the original probability of the selected token
3378
+ ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum;
3379
+ ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
3380
+ }
3381
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
3382
+ ctx->pending_token_idx = -1;
3383
+ }
3384
+
3385
+ static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
3386
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
3387
+ // ctx->target and ctx->decay never change after init, so it's safe to keep them as is.
3388
+ // original_probs is completely overwritten on every call to _apply.
3389
+ // so we only need to reset the EMA state and pending token.
3390
+ ctx->weighted_sum = ctx->target / (1.0f - ctx->decay);
3391
+ ctx->total_weight = 1.0f / (1.0f - ctx->decay);
3392
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
3393
+ ctx->pending_token_idx = -1;
3394
+ ctx->seed_cur = get_rng_seed(ctx->seed);
3395
+ ctx->rng.seed(ctx->seed_cur);
3396
+ }
3397
+
3398
+ static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
3399
+ const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
3400
+ auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
3401
+ auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
3402
+
3403
+ // copy everything (target, decay, seed, and RNG are already set)
3404
+ result_ctx->weighted_sum = ctx->weighted_sum;
3405
+ result_ctx->total_weight = ctx->total_weight;
3406
+ result_ctx->pending_token_id = ctx->pending_token_id;
3407
+ result_ctx->pending_token_idx = ctx->pending_token_idx;
3408
+
3409
+ return result;
3410
+ }
3411
+
3412
+ static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
3413
+ delete (llama_sampler_adaptive_p *) smpl->ctx;
3414
+ }
3415
+
3416
+ static struct llama_sampler_i llama_sampler_adaptive_p_i = {
3417
+ /* .name = */ llama_sampler_adaptive_p_name,
3418
+ /* .accept = */ llama_sampler_adaptive_p_accept,
3419
+ /* .apply = */ llama_sampler_adaptive_p_apply,
3420
+ /* .reset = */ llama_sampler_adaptive_p_reset,
3421
+ /* .clone = */ llama_sampler_adaptive_p_clone,
3422
+ /* .free = */ llama_sampler_adaptive_p_free,
3423
+ /* .backend_init = */ nullptr,
3424
+ /* .backend_accept = */ nullptr,
3425
+ /* .backend_apply = */ nullptr,
3426
+ /* .backend_set_input = */ nullptr,
3427
+ };
3428
+
3429
+ struct llama_sampler * llama_sampler_init_adaptive_p(
3430
+ float target,
3431
+ float decay,
3432
+ uint32_t seed
3433
+ ) {
3434
+ auto seed_cur = get_rng_seed(seed);
3435
+ float clamped_decay = std::clamp(decay, 0.0f, 0.99f);
3436
+ return llama_sampler_init(
3437
+ /* .iface = */ &llama_sampler_adaptive_p_i,
3438
+ /* .ctx = */ new llama_sampler_adaptive_p {
3439
+ /* .target = */ target,
3440
+ /* .decay = */ clamped_decay,
3441
+ /* .seed = */ seed,
3442
+ /* .seed_cur = */ seed_cur,
3443
+ /* .rng = */ std::mt19937(seed_cur),
3444
+ /* .weighted_sum = */ target / (1.0f - clamped_decay),
3445
+ /* .total_weight = */ 1.0f / (1.0f - clamped_decay),
3446
+ /* .original_probs = */ {},
3447
+ /* .pending_token_id = */ LLAMA_TOKEN_NULL,
3448
+ /* .pending_token_idx = */ -1
3449
+ }
3450
+ );
3451
+ }
3452
+
3296
3453
  // logit-bias
3297
3454
 
3298
3455
  struct llama_sampler_logit_bias : public llama_sampler_backend {