@fugood/llama.node 1.4.15 → 1.6.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/lib/binding.ts +1 -5
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +2 -2
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +76 -61
  6. package/src/LlamaContext.cpp +20 -32
  7. package/src/llama.cpp/common/CMakeLists.txt +12 -0
  8. package/src/llama.cpp/common/arg.cpp +20 -0
  9. package/src/llama.cpp/common/chat-parser.cpp +3 -3
  10. package/src/llama.cpp/common/chat-parser.h +4 -4
  11. package/src/llama.cpp/common/chat.cpp +289 -34
  12. package/src/llama.cpp/common/chat.h +32 -20
  13. package/src/llama.cpp/common/common.cpp +0 -1
  14. package/src/llama.cpp/common/common.h +31 -25
  15. package/src/llama.cpp/common/download.cpp +19 -14
  16. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  17. package/src/llama.cpp/common/jinja/caps.h +24 -0
  18. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  19. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  20. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  21. package/src/llama.cpp/common/jinja/parser.h +21 -0
  22. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  23. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  24. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  25. package/src/llama.cpp/common/jinja/string.h +58 -0
  26. package/src/llama.cpp/common/jinja/utils.h +49 -0
  27. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  28. package/src/llama.cpp/common/jinja/value.h +464 -0
  29. package/src/llama.cpp/common/json-partial.h +1 -0
  30. package/src/llama.cpp/common/sampling.cpp +52 -19
  31. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  34. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  36. package/src/llama.cpp/include/llama-cpp.h +3 -1
  37. package/src/llama.cpp/include/llama.h +29 -2
  38. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  39. package/src/llama.cpp/src/llama-adapter.h +1 -3
  40. package/src/llama.cpp/src/llama-context.cpp +232 -144
  41. package/src/llama.cpp/src/llama-context.h +10 -0
  42. package/src/llama.cpp/src/llama-cparams.h +2 -0
  43. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  44. package/src/llama.cpp/src/llama-hparams.h +38 -1
  45. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  46. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  47. package/src/llama.cpp/src/llama-mmap.cpp +5 -1
  48. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  49. package/src/llama.cpp/src/llama-model.cpp +5 -1
  50. package/src/llama.cpp/src/llama-model.h +3 -2
  51. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
@@ -1513,12 +1513,9 @@ static void llama_sampler_top_p_backend_apply(
1513
1513
  mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
1514
1514
  mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
1515
1515
 
1516
- // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
1517
- // top_p_bias = (mask * 1e9f) - 1e9f.
1518
- // So entries in the mask that we want to discard will become -1e9f, and
1519
- // others will be 0 (meaning that will not effect the logits).
1520
- const float large_val = 1e9f;
1521
- struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
1516
+ // Apply -INFINITY bias for masked-out tokens
1517
+ // log(1) = 0 (keep), log(0) = -INF (discard)
1518
+ struct ggml_tensor * top_p_bias = ggml_log(ctx, mask);
1522
1519
  ggml_set_name(top_p_bias, "top_p_bias");
1523
1520
 
1524
1521
  data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
@@ -1673,15 +1670,11 @@ static void llama_sampler_min_p_backend_apply(
1673
1670
  struct ggml_tensor * mask = ggml_step(ctx, sub);
1674
1671
  ggml_set_name(mask, "min_p_mask");
1675
1672
 
1676
- // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
1677
- // min_p_bias = (mask * 1e9f) - 1e9f.
1678
- // So entries in the mask that we want to discard will become -1e9f, and
1679
- // others will be 0 (meaning that will not effect the logits).
1680
- const float large_val = 1e9f;
1681
- struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
1673
+ // Apply -INFINITY bias for masked-out tokens
1674
+ // log(1) = 0 (keep), log(0) = -INF (discard)
1675
+ struct ggml_tensor * min_p_bias = ggml_log(ctx, mask);
1682
1676
  ggml_set_name(min_p_bias, "min_p_bias");
1683
1677
 
1684
- // Add the min_p bias to the logits.
1685
1678
  data->logits = ggml_add(ctx, data->logits, min_p_bias);
1686
1679
  ggml_set_name(data->logits, "min_p_logits");
1687
1680
 
@@ -3293,6 +3286,170 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
3293
3286
  return result;
3294
3287
  }
3295
3288
 
3289
+ // adaptive-p sampler state
3290
+ //
3291
+ // maintains an exponential moving average of the *ORIGINAL* probabilities
3292
+ // of selected tokens, used to compute an adapted target at each sampling step.
3293
+ //
3294
+ // see llama.h for a full description of the sampler
3295
+ //
3296
+ // ref: https://github.com/ggml-org/llama.cpp/pull/17927
3297
+ //
3298
+ struct llama_sampler_adaptive_p {
3299
+ const float target; // target probability (0.0 - 1.0; negative = disabled)
3300
+ const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99)
3301
+ const uint32_t seed; // original RNG seed
3302
+ uint32_t seed_cur; // actual RNG seed
3303
+ std::mt19937 rng; // RNG state
3304
+ float weighted_sum; // sum(p_i * decay^i)
3305
+ float total_weight; // sum(decay^i), converges to 1/(1-decay)
3306
+ std::vector<float> original_probs; // pre-transform probs, cached for EMA update
3307
+ llama_token pending_token_id; // token ID of selected token
3308
+ int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs
3309
+ };
3310
+
3311
+ // adaptive probability transformation constants
3312
+ static constexpr float DISTRIBUTION_WIDTH = 0.3f;
3313
+ static constexpr float PEAK_LOGIT_VALUE = 5.0f;
3314
+ static constexpr float SHARPNESS = 10.0f;
3315
+ static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH;
3316
+
3317
+ static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
3318
+ return "adaptive-p";
3319
+ }
3320
+
3321
+ static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
3322
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
3323
+
3324
+ llama_sampler_softmax_impl(cur_p, false);
3325
+
3326
+ if (ctx->target < 0.0f) {
3327
+ // at negative target values, adaptive-p is no-op
3328
+ // we simply sample from the existing distribution
3329
+ cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
3330
+ return;
3331
+ }
3332
+
3333
+ // store the original probabilities
3334
+ ctx->original_probs.resize(cur_p->size);
3335
+ for (size_t i = 0; i < cur_p->size; ++i) {
3336
+ ctx->original_probs[i] = cur_p->data[i].p;
3337
+ }
3338
+
3339
+ // using the EMA, compute the adapted target probability for the current sampling step
3340
+ auto target = std::clamp(ctx->target, 0.0f, 1.0f);
3341
+ float adapted_target = std::clamp(
3342
+ ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
3343
+ 0.0f, 1.0f
3344
+ );
3345
+
3346
+ // adaptive probability transform
3347
+ //
3348
+ // quadratic near target for fine differentiation, transitioning to linear decay in the
3349
+ // tails. unbounded negative logits ensure proper suppression of far-from-target tokens
3350
+ // after the softmax.
3351
+ //
3352
+ for (size_t i = 0; i < cur_p->size; ++i) {
3353
+ if (cur_p->data[i].logit == -INFINITY) {
3354
+ // don't transform logits that are -INFINITY
3355
+ // (as masked out by e.g. min-p and top-p when using backend sampling)
3356
+ continue;
3357
+ }
3358
+ float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
3359
+ cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
3360
+ }
3361
+
3362
+ // softmax and sample from the transformed distribution
3363
+ llama_sampler_softmax_impl(cur_p, false);
3364
+ const int idx = llama_sample_dist(cur_p, ctx->rng);
3365
+ cur_p->selected = idx;
3366
+
3367
+ // store the selected token ID for acceptance later
3368
+ ctx->pending_token_id = cur_p->data[idx].id;
3369
+ ctx->pending_token_idx = idx;
3370
+ }
3371
+
3372
+ static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) {
3373
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
3374
+ if (ctx->pending_token_id == token) {
3375
+ GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL);
3376
+ GGML_ASSERT(ctx->pending_token_idx != -1);
3377
+ // update EMA with the original probability of the selected token
3378
+ ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum;
3379
+ ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
3380
+ }
3381
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
3382
+ ctx->pending_token_idx = -1;
3383
+ }
3384
+
3385
+ static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
3386
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
3387
+ // ctx->target and ctx->decay never change after init, so it's safe to keep them as is.
3388
+ // original_probs is completely overwritten on every call to _apply.
3389
+ // so we only need to reset the EMA state and pending token.
3390
+ ctx->weighted_sum = ctx->target / (1.0f - ctx->decay);
3391
+ ctx->total_weight = 1.0f / (1.0f - ctx->decay);
3392
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
3393
+ ctx->pending_token_idx = -1;
3394
+ ctx->seed_cur = get_rng_seed(ctx->seed);
3395
+ ctx->rng.seed(ctx->seed_cur);
3396
+ }
3397
+
3398
+ static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
3399
+ const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
3400
+ auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
3401
+ auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
3402
+
3403
+ // copy everything (target, decay, seed, and RNG are already set)
3404
+ result_ctx->weighted_sum = ctx->weighted_sum;
3405
+ result_ctx->total_weight = ctx->total_weight;
3406
+ result_ctx->pending_token_id = ctx->pending_token_id;
3407
+ result_ctx->pending_token_idx = ctx->pending_token_idx;
3408
+
3409
+ return result;
3410
+ }
3411
+
3412
+ static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
3413
+ delete (llama_sampler_adaptive_p *) smpl->ctx;
3414
+ }
3415
+
3416
+ static struct llama_sampler_i llama_sampler_adaptive_p_i = {
3417
+ /* .name = */ llama_sampler_adaptive_p_name,
3418
+ /* .accept = */ llama_sampler_adaptive_p_accept,
3419
+ /* .apply = */ llama_sampler_adaptive_p_apply,
3420
+ /* .reset = */ llama_sampler_adaptive_p_reset,
3421
+ /* .clone = */ llama_sampler_adaptive_p_clone,
3422
+ /* .free = */ llama_sampler_adaptive_p_free,
3423
+ /* .backend_init = */ nullptr,
3424
+ /* .backend_accept = */ nullptr,
3425
+ /* .backend_apply = */ nullptr,
3426
+ /* .backend_set_input = */ nullptr,
3427
+ };
3428
+
3429
+ struct llama_sampler * llama_sampler_init_adaptive_p(
3430
+ float target,
3431
+ float decay,
3432
+ uint32_t seed
3433
+ ) {
3434
+ auto seed_cur = get_rng_seed(seed);
3435
+ float clamped_decay = std::clamp(decay, 0.0f, 0.99f);
3436
+ return llama_sampler_init(
3437
+ /* .iface = */ &llama_sampler_adaptive_p_i,
3438
+ /* .ctx = */ new llama_sampler_adaptive_p {
3439
+ /* .target = */ target,
3440
+ /* .decay = */ clamped_decay,
3441
+ /* .seed = */ seed,
3442
+ /* .seed_cur = */ seed_cur,
3443
+ /* .rng = */ std::mt19937(seed_cur),
3444
+ /* .weighted_sum = */ target / (1.0f - clamped_decay),
3445
+ /* .total_weight = */ 1.0f / (1.0f - clamped_decay),
3446
+ /* .original_probs = */ {},
3447
+ /* .pending_token_id = */ LLAMA_TOKEN_NULL,
3448
+ /* .pending_token_idx = */ -1
3449
+ }
3450
+ );
3451
+ }
3452
+
3296
3453
  // logit-bias
3297
3454
 
3298
3455
  struct llama_sampler_logit_bias : public llama_sampler_backend {