cui-llama.rn 1.0.3 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/README.md +35 -39
  2. package/android/src/main/CMakeLists.txt +12 -2
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +29 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +33 -1
  5. package/android/src/main/jni.cpp +62 -8
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +5 -0
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +5 -0
  8. package/cpp/common.cpp +3237 -3231
  9. package/cpp/common.h +469 -468
  10. package/cpp/ggml-aarch64.c +2193 -2193
  11. package/cpp/ggml-aarch64.h +39 -39
  12. package/cpp/ggml-alloc.c +1036 -1042
  13. package/cpp/ggml-backend-impl.h +153 -153
  14. package/cpp/ggml-backend.c +2240 -2234
  15. package/cpp/ggml-backend.h +238 -238
  16. package/cpp/ggml-common.h +1833 -1829
  17. package/cpp/ggml-impl.h +755 -655
  18. package/cpp/ggml-metal.h +65 -65
  19. package/cpp/ggml-metal.m +3269 -3269
  20. package/cpp/ggml-quants.c +14872 -14860
  21. package/cpp/ggml-quants.h +132 -132
  22. package/cpp/ggml.c +22055 -22044
  23. package/cpp/ggml.h +2453 -2447
  24. package/cpp/llama-grammar.cpp +539 -0
  25. package/cpp/llama-grammar.h +39 -0
  26. package/cpp/llama-impl.h +26 -0
  27. package/cpp/llama-sampling.cpp +635 -0
  28. package/cpp/llama-sampling.h +56 -0
  29. package/cpp/llama-vocab.cpp +1721 -0
  30. package/cpp/llama-vocab.h +130 -0
  31. package/cpp/llama.cpp +19171 -21892
  32. package/cpp/llama.h +1240 -1217
  33. package/cpp/log.h +737 -737
  34. package/cpp/rn-llama.hpp +207 -29
  35. package/cpp/sampling.cpp +460 -460
  36. package/cpp/sgemm.cpp +1027 -1027
  37. package/cpp/sgemm.h +14 -14
  38. package/cpp/unicode.cpp +6 -0
  39. package/cpp/unicode.h +3 -0
  40. package/ios/RNLlama.mm +15 -6
  41. package/ios/RNLlamaContext.h +2 -8
  42. package/ios/RNLlamaContext.mm +41 -34
  43. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  44. package/lib/commonjs/chat.js +37 -0
  45. package/lib/commonjs/chat.js.map +1 -0
  46. package/lib/commonjs/index.js +14 -1
  47. package/lib/commonjs/index.js.map +1 -1
  48. package/lib/module/NativeRNLlama.js.map +1 -1
  49. package/lib/module/chat.js +31 -0
  50. package/lib/module/chat.js.map +1 -0
  51. package/lib/module/index.js +14 -1
  52. package/lib/module/index.js.map +1 -1
  53. package/lib/typescript/NativeRNLlama.d.ts +5 -1
  54. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  55. package/lib/typescript/chat.d.ts +10 -0
  56. package/lib/typescript/chat.d.ts.map +1 -0
  57. package/lib/typescript/index.d.ts +9 -2
  58. package/lib/typescript/index.d.ts.map +1 -1
  59. package/package.json +1 -1
  60. package/src/NativeRNLlama.ts +10 -1
  61. package/src/chat.ts +44 -0
  62. package/src/index.ts +31 -4
@@ -0,0 +1,635 @@
1
+ #include "llama-sampling.h"
2
+
3
+ #include <algorithm>
4
+ #include <cstring>
5
+ #include <ctime>
6
+ #include <cfloat>
7
+ #include <numeric>
8
+ #include <unordered_map>
9
+
10
// In-place log-softmax over `array` of length `size`:
// array[i] <- log(exp(array[i]) / sum_j exp(array[j])), using the usual
// max-subtraction trick for numerical stability.
static void llama_log_softmax(float * array, size_t size) {
    const float max_l = *std::max_element(array, array + size);

    // exponentiate (shifted by the max) and accumulate the normalizer
    float sum = 0.f;
    for (size_t i = 0; i < size; ++i) {
        const float e = expf(array[i] - max_l);
        array[i] = e;
        sum += e;
    }

    // convert back to log-probabilities
    for (size_t i = 0; i < size; ++i) {
        array[i] = logf(array[i] / sum);
    }
}
23
+
24
+ void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed) {
25
+ if (seed == LLAMA_DEFAULT_SEED) {
26
+ seed = time(NULL);
27
+ }
28
+
29
+ smpl->rng.seed(seed);
30
+ }
31
+
32
+ void llama_sample_softmax_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
33
+ LM_GGML_ASSERT(candidates->size > 0);
34
+
35
+ const int64_t t_start_sample_us = lm_ggml_time_us();
36
+
37
+ // Sort the logits in descending order
38
+ if (!candidates->sorted) {
39
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
40
+ return a.logit > b.logit;
41
+ });
42
+ candidates->sorted = true;
43
+ }
44
+
45
+ float max_l = candidates->data[0].logit;
46
+ float cum_sum = 0.0f;
47
+ for (size_t i = 0; i < candidates->size; ++i) {
48
+ float p = expf(candidates->data[i].logit - max_l);
49
+ candidates->data[i].p = p;
50
+ cum_sum += p;
51
+ }
52
+ for (size_t i = 0; i < candidates->size; ++i) {
53
+ candidates->data[i].p /= cum_sum;
54
+ }
55
+
56
+ if (smpl) {
57
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
58
+ }
59
+ }
60
+
61
+ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
62
+ // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
63
+ // if (k >= (int32_t)candidates->size) {
64
+ // return;
65
+ // }
66
+
67
+ const int64_t t_start_sample_us = lm_ggml_time_us();
68
+
69
+ if (k <= 0) {
70
+ k = candidates->size;
71
+ }
72
+
73
+ k = std::max(k, (int) min_keep);
74
+ k = std::min(k, (int) candidates->size);
75
+
76
+ // Sort scores in descending order
77
+ if (!candidates->sorted) {
78
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) {
79
+ return a.logit > b.logit;
80
+ };
81
+ if (k <= 128) {
82
+ std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
83
+ } else {
84
+ constexpr int nbuckets = 128;
85
+ constexpr float bucket_low = -10.0f;
86
+ constexpr float bucket_high = 10.0f;
87
+ constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
88
+ constexpr float bucker_inter = -bucket_low * bucket_scale;
89
+
90
+ std::vector<int> bucket_idx(candidates->size);
91
+ std::vector<int> histo(nbuckets, 0);
92
+
93
+ for (int i = 0; i < (int)candidates->size; ++i) {
94
+ const float val = candidates->data[i].logit;
95
+ int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
96
+ ib = std::max(0, std::min(nbuckets-1, ib));
97
+ bucket_idx[i] = ib;
98
+ ++histo[ib];
99
+ }
100
+ int nhave = 0;
101
+ int ib = nbuckets - 1;
102
+ for ( ; ib >= 0; --ib) {
103
+ nhave += histo[ib];
104
+ if (nhave >= k) break;
105
+ }
106
+ std::vector<llama_token_data> tmp_tokens(nhave);
107
+ auto ptr = tmp_tokens.data();
108
+ std::vector<llama_token_data*> bucket_ptrs;
109
+ bucket_ptrs.reserve(nbuckets - ib);
110
+ for (int j = nbuckets - 1; j >= ib; --j) {
111
+ bucket_ptrs.push_back(ptr);
112
+ ptr += histo[j];
113
+ }
114
+ for (int i = 0; i < (int)candidates->size; ++i) {
115
+ int j = bucket_idx[i];
116
+ if (j >= ib) {
117
+ *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
118
+ }
119
+ }
120
+
121
+ ptr = tmp_tokens.data();
122
+ int ndone = 0;
123
+ for (int j = nbuckets-1; j > ib; --j) {
124
+ std::sort(ptr, ptr + histo[j], comp);
125
+ ptr += histo[j];
126
+ ndone += histo[j];
127
+ }
128
+ std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
129
+
130
+ std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
131
+
132
+ }
133
+ candidates->sorted = true;
134
+ }
135
+ candidates->size = k;
136
+
137
+ if (smpl) {
138
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
139
+ }
140
+ }
141
+
142
+ void llama_sample_top_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
143
+ if (p >= 1.0f) {
144
+ return;
145
+ }
146
+
147
+ llama_sample_softmax_impl(smpl, candidates);
148
+
149
+ const int64_t t_start_sample_us = lm_ggml_time_us();
150
+
151
+ // Compute the cumulative probabilities
152
+ float cum_sum = 0.0f;
153
+ size_t last_idx = candidates->size;
154
+
155
+ for (size_t i = 0; i < candidates->size; ++i) {
156
+ cum_sum += candidates->data[i].p;
157
+
158
+ // Check if the running sum is at least p or if we have kept at least min_keep tokens
159
+ // we set the last index to i+1 to indicate that the current iterate should be included in the set
160
+ if (cum_sum >= p && i + 1 >= min_keep) {
161
+ last_idx = i + 1;
162
+ break;
163
+ }
164
+ }
165
+
166
+ // Resize the output vector to keep only the top-p tokens
167
+ candidates->size = last_idx;
168
+
169
+ if (smpl) {
170
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
171
+ }
172
+ }
173
+
174
+ void llama_sample_min_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
175
+ if (p <= 0.0f || !candidates->size) {
176
+ return;
177
+ }
178
+
179
+ const int64_t t_start_sample_us = lm_ggml_time_us();
180
+
181
+ bool min_p_applied = false;
182
+
183
+ // if the candidates aren't sorted, try the unsorted implementation first
184
+ if (!candidates->sorted) {
185
+ std::vector<llama_token_data> filtered_tokens;
186
+
187
+ float max_logit = -FLT_MAX;
188
+ for (size_t i = 0; i < candidates->size; ++i) {
189
+ max_logit = std::max(max_logit, candidates->data[i].logit);
190
+ }
191
+ const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
192
+
193
+ for (size_t i = 0; i < candidates->size; ++i) {
194
+ if (candidates->data[i].logit >= min_logit) {
195
+ filtered_tokens.push_back(candidates->data[i]);
196
+ }
197
+ }
198
+
199
+ // if we have enough values the operation was a success
200
+ if (filtered_tokens.size() >= min_keep) {
201
+ memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
202
+ candidates->size = filtered_tokens.size();
203
+ min_p_applied = true;
204
+ }
205
+ }
206
+
207
+ // if the candidates are sorted or the unsorted implementation failed, use this implementation
208
+ if (!min_p_applied) {
209
+ // Sort the logits in descending order
210
+ if (!candidates->sorted) {
211
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
212
+ return a.logit > b.logit;
213
+ });
214
+ candidates->sorted = true;
215
+ }
216
+
217
+ const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
218
+ size_t i = 1; // first token always matches
219
+
220
+ for (; i < candidates->size; ++i) {
221
+ if (candidates->data[i].logit < min_logit && i >= min_keep) {
222
+ break; // prob too small
223
+ }
224
+ }
225
+
226
+ // Resize the output vector to keep only the matching tokens
227
+ candidates->size = i;
228
+ }
229
+
230
+ if (smpl) {
231
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
232
+ }
233
+ }
234
+
235
+ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) {
236
+ if (z >= 1.0f || candidates->size <= 2) {
237
+ return;
238
+ }
239
+
240
+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
241
+ const int64_t t_start_sample_us = lm_ggml_time_us();
242
+
243
+ // Compute the first and second derivatives
244
+ std::vector<float> first_derivatives(candidates->size - 1);
245
+ std::vector<float> second_derivatives(candidates->size - 2);
246
+
247
+ for (size_t i = 0; i < first_derivatives.size(); ++i) {
248
+ first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
249
+ }
250
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
251
+ second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
252
+ }
253
+
254
+ // Calculate absolute value of second derivatives
255
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
256
+ second_derivatives[i] = std::abs(second_derivatives[i]);
257
+ }
258
+
259
+ // Normalize the second derivatives
260
+ {
261
+ const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
262
+
263
+ if (second_derivatives_sum > 1e-6f) {
264
+ for (float & value : second_derivatives) {
265
+ value /= second_derivatives_sum;
266
+ }
267
+ } else {
268
+ for (float & value : second_derivatives) {
269
+ value = 1.0f / second_derivatives.size();
270
+ }
271
+ }
272
+ }
273
+
274
+ float cum_sum = 0.0f;
275
+ size_t last_idx = candidates->size;
276
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
277
+ cum_sum += second_derivatives[i];
278
+
279
+ // Check if the running sum is greater than z or if we have kept at least min_keep tokens
280
+ if (cum_sum > z && i >= min_keep) {
281
+ last_idx = i;
282
+ break;
283
+ }
284
+ }
285
+
286
+ // Resize the output vector to keep only the tokens above the tail location
287
+ candidates->size = last_idx;
288
+
289
+ if (smpl) {
290
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
291
+ }
292
+ }
293
+
294
+ void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
295
+ // Reference implementation:
296
+ // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
297
+ if (p >= 1.0f) {
298
+ return;
299
+ }
300
+
301
+ // Compute the softmax of logits and calculate entropy
302
+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
303
+
304
+ const int64_t t_start_sample_us = lm_ggml_time_us();
305
+
306
+ float entropy = 0.0f;
307
+ for (size_t i = 0; i < candidates->size; ++i) {
308
+ entropy += -candidates->data[i].p * logf(candidates->data[i].p);
309
+ }
310
+
311
+ // Compute the absolute difference between negative log probability and entropy for each candidate
312
+ std::vector<float> shifted_scores;
313
+ for (size_t i = 0; i < candidates->size; ++i) {
314
+ float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
315
+ shifted_scores.push_back(shifted_score);
316
+ }
317
+
318
+ // Sort tokens based on the shifted_scores and their corresponding indices
319
+ std::vector<size_t> indices(candidates->size);
320
+ std::iota(indices.begin(), indices.end(), 0);
321
+
322
+ std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
323
+ return shifted_scores[a] < shifted_scores[b];
324
+ });
325
+
326
+ // Compute the cumulative probabilities
327
+ float cum_sum = 0.0f;
328
+ size_t last_idx = indices.size();
329
+
330
+ for (size_t i = 0; i < indices.size(); ++i) {
331
+ size_t idx = indices[i];
332
+ cum_sum += candidates->data[idx].p;
333
+
334
+ // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
335
+ if (cum_sum > p && i >= min_keep - 1) {
336
+ last_idx = i + 1;
337
+ break;
338
+ }
339
+ }
340
+
341
+ // Resize the output vector to keep only the locally typical tokens
342
+ std::vector<llama_token_data> new_candidates;
343
+ for (size_t i = 0; i < last_idx; ++i) {
344
+ size_t idx = indices[i];
345
+ new_candidates.push_back(candidates->data[idx]);
346
+ }
347
+
348
+ // Replace the data in candidates with the new_candidates data
349
+ std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
350
+ candidates->size = new_candidates.size();
351
+ candidates->sorted = false;
352
+
353
+ if (smpl) {
354
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
355
+ }
356
+ }
357
+
358
+ void llama_sample_entropy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
359
+ const int64_t t_start_sample_us = lm_ggml_time_us();
360
+
361
+ // no need to do anything if there is only one (or zero) candidates
362
+ if(candidates->size <= 1) {
363
+ return;
364
+ }
365
+
366
+ // Calculate maximum possible entropy
367
+ float max_entropy = -logf(1.0f / candidates->size);
368
+
369
+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
370
+
371
+ // Calculate entropy of the softmax probabilities
372
+ float entropy = 0.0f;
373
+ for (size_t i = 0; i < candidates->size; ++i) {
374
+ float prob = candidates->data[i].p;
375
+ if (prob > 0.0f) { // Ensure no log(0)
376
+ entropy -= prob * logf(prob);
377
+ }
378
+ }
379
+
380
+ // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates->size != 1 above)
381
+ float normalized_entropy = entropy / max_entropy;
382
+
383
+ // Map the normalized entropy to the desired temperature range using the power function
384
+ float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
385
+
386
+ #ifdef DEBUG
387
+ LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
388
+ LLAMA_LOG_INFO("Entropy: %f\n", entropy);
389
+ LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
390
+ LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
391
+ LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
392
+ LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
393
+ #endif
394
+
395
+ // Apply the dynamically calculated temperature scaling
396
+ for (size_t i = 0; i < candidates->size; ++i) {
397
+ candidates->data[i].logit /= dyn_temp;
398
+ }
399
+
400
+ // Re-compute softmax probabilities after scaling logits with dynamic temperature
401
+ double max_l_double = candidates->data[0].logit;
402
+ double cum_sum_double = 0.0;
403
+ for (size_t i = 0; i < candidates->size; ++i) {
404
+ double p = exp(candidates->data[i].logit - max_l_double);
405
+ candidates->data[i].p = p; // Store the scaled probability
406
+ cum_sum_double += p;
407
+ }
408
+ for (size_t i = 0; i < candidates->size; ++i) {
409
+ candidates->data[i].p /= cum_sum_double; // Re-normalize the probabilities
410
+ }
411
+
412
+ #ifdef DEBUG
413
+ // Print the updated top 25 probabilities after temperature scaling
414
+ LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
415
+ for (size_t i = 0; i < 25 && i < candidates->size; ++i) {
416
+ LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates->data[i].p * 100.0f);
417
+ }
418
+ #endif
419
+
420
+ if (smpl) {
421
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
422
+ }
423
+ }
424
+
425
+ void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float temp) {
426
+ const int64_t t_start_sample_us = lm_ggml_time_us();
427
+
428
+ for (size_t i = 0; i < candidates->size; ++i) {
429
+ candidates->data[i].logit /= temp;
430
+ }
431
+
432
+ if (smpl) {
433
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
434
+ }
435
+ }
436
+
437
+ void llama_sample_repetition_penalties_impl(
438
+ struct llama_sampling * smpl,
439
+ llama_token_data_array * candidates,
440
+ const llama_token * last_tokens,
441
+ size_t penalty_last_n,
442
+ float penalty_repeat,
443
+ float penalty_freq,
444
+ float penalty_present) {
445
+ if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
446
+ return;
447
+ }
448
+
449
+ const int64_t t_start_sample_us = lm_ggml_time_us();
450
+
451
+ // Create a frequency map to count occurrences of each token in last_tokens
452
+ std::unordered_map<llama_token, int> token_count;
453
+ for (size_t i = 0; i < penalty_last_n; ++i) {
454
+ token_count[last_tokens[i]]++;
455
+ }
456
+
457
+ // Apply frequency and presence penalties to the candidates
458
+ for (size_t i = 0; i < candidates->size; ++i) {
459
+ const auto token_iter = token_count.find(candidates->data[i].id);
460
+ if (token_iter == token_count.end()) {
461
+ continue;
462
+ }
463
+
464
+ const int count = token_iter->second;
465
+
466
+ // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
467
+ // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
468
+ if (candidates->data[i].logit <= 0) {
469
+ candidates->data[i].logit *= penalty_repeat;
470
+ } else {
471
+ candidates->data[i].logit /= penalty_repeat;
472
+ }
473
+
474
+ candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
475
+ }
476
+
477
+ candidates->sorted = false;
478
+
479
+ if (smpl) {
480
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
481
+ }
482
+ }
483
+
484
+ void llama_sample_apply_guidance_impl(
485
+ struct llama_sampling * smpl,
486
+ float * logits,
487
+ float * logits_guidance,
488
+ float scale) {
489
+ LM_GGML_ASSERT(smpl);
490
+
491
+ const auto t_start_sample_us = lm_ggml_time_us();
492
+ const auto n_vocab = smpl->n_vocab;
493
+
494
+ llama_log_softmax(logits, n_vocab);
495
+ llama_log_softmax(logits_guidance, n_vocab);
496
+
497
+ for (int i = 0; i < n_vocab; ++i) {
498
+ auto & l = logits[i];
499
+ const auto & g = logits_guidance[i];
500
+
501
+ l = scale * (l - g) + g;
502
+ }
503
+
504
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
505
+ }
506
+
507
+ llama_token llama_sample_token_mirostat_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
508
+ LM_GGML_ASSERT(smpl);
509
+
510
+ const int32_t n_vocab = float(smpl->n_vocab);
511
+
512
+ int64_t t_start_sample_us = lm_ggml_time_us();
513
+
514
+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
515
+
516
+ // Estimate s_hat using the most probable m tokens
517
+ float s_hat = 0.0;
518
+ float sum_ti_bi = 0.0;
519
+ float sum_ti_sq = 0.0;
520
+ for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
521
+ float t_i = logf(float(i + 2) / float(i + 1));
522
+ float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
523
+ sum_ti_bi += t_i * b_i;
524
+ sum_ti_sq += t_i * t_i;
525
+ }
526
+ s_hat = sum_ti_bi / sum_ti_sq;
527
+
528
+ // Compute k from the estimated s_hat and target surprise value
529
+ float epsilon_hat = s_hat - 1;
530
+ float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat);
531
+
532
+ // Sample the next word X using top-k sampling
533
+ llama_sample_top_k_impl((struct llama_sampling *) nullptr, candidates, int(k), 1);
534
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
535
+ llama_token X = llama_sample_token_impl(smpl, candidates);
536
+ t_start_sample_us = lm_ggml_time_us();
537
+
538
+ // Compute error as the difference between observed surprise and target surprise value
539
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
540
+ return candidate.id == X;
541
+ }));
542
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
543
+ float e = observed_surprise - tau;
544
+
545
+ // Update mu using the learning rate and error
546
+ *mu = *mu - eta * e;
547
+
548
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
549
+ return X;
550
+ }
551
+
552
+ llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {
553
+ int64_t t_start_sample_us;
554
+ t_start_sample_us = lm_ggml_time_us();
555
+
556
+ llama_sample_softmax_impl(smpl, candidates);
557
+
558
+ // Truncate the words with surprise values greater than mu
559
+ candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
560
+ return -log2f(candidate.p) > *mu;
561
+ }));
562
+
563
+ if (candidates->size == 0) {
564
+ candidates->size = 1;
565
+ }
566
+
567
+ if (smpl) {
568
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
569
+ }
570
+
571
+ // Normalize the probabilities of the remaining words
572
+ llama_sample_softmax_impl(smpl, candidates);
573
+
574
+ // Sample the next word X from the remaining words
575
+ llama_token X = llama_sample_token_impl(smpl, candidates);
576
+ t_start_sample_us = lm_ggml_time_us();
577
+
578
+ // Compute error as the difference between observed surprise and target surprise value
579
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
580
+ return candidate.id == X;
581
+ }));
582
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
583
+ float e = observed_surprise - tau;
584
+
585
+ // Update mu using the learning rate and error
586
+ *mu = *mu - eta * e;
587
+
588
+ if (smpl) {
589
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
590
+ }
591
+ return X;
592
+ }
593
+
594
+ llama_token llama_sample_token_greedy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
595
+ const int64_t t_start_sample_us = lm_ggml_time_us();
596
+
597
+ // Find max element
598
+ auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
599
+ return a.logit < b.logit;
600
+ });
601
+
602
+ llama_token result = max_iter->id;
603
+ if (smpl) {
604
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
605
+ smpl->n_sample++;
606
+ }
607
+ return result;
608
+ }
609
+
610
+ llama_token llama_sample_token_with_rng_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng) {
611
+ LM_GGML_ASSERT(smpl);
612
+
613
+ const int64_t t_start_sample_us = lm_ggml_time_us();
614
+ llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
615
+
616
+ std::vector<float> probs;
617
+ probs.reserve(candidates->size);
618
+ for (size_t i = 0; i < candidates->size; ++i) {
619
+ probs.push_back(candidates->data[i].p);
620
+ }
621
+
622
+ std::discrete_distribution<> dist(probs.begin(), probs.end());
623
+ int idx = dist(rng);
624
+
625
+ llama_token result = candidates->data[idx].id;
626
+
627
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
628
+ smpl->n_sample++;
629
+
630
+ return result;
631
+ }
632
+
633
+ llama_token llama_sample_token_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
634
+ return llama_sample_token_with_rng_impl(smpl, candidates, smpl->rng);
635
+ }
@@ -0,0 +1,56 @@
1
+ #pragma once
2
+
3
+ #include "llama-impl.h"
4
+
5
+ struct llama_sampling {
6
+ llama_sampling(int32_t n_vocab) : n_vocab(n_vocab) {}
7
+
8
+ std::mt19937 rng;
9
+
10
+ int32_t n_vocab = 0;
11
+
12
+ mutable int64_t t_sample_us = 0;
13
+ mutable int32_t n_sample = 0;
14
+
15
+ void reset_timings() const {
16
+ t_sample_us = 0;
17
+ n_sample = 0;
18
+ }
19
+ };
20
+
21
+ //
22
+ // internal API
23
+ //
24
+
25
+ void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed);
26
+
27
+ void llama_sample_softmax_impl (struct llama_sampling * smpl, llama_token_data_array * candidates);
28
+ void llama_sample_top_k_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep);
29
+ void llama_sample_top_p_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
30
+ void llama_sample_min_p_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
31
+ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep);
32
+ void llama_sample_typical_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
33
+ void llama_sample_entropy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
34
+ void llama_sample_temp_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
35
+
36
+ void llama_sample_repetition_penalties_impl(
37
+ struct llama_sampling * smpl,
38
+ llama_token_data_array * candidates,
39
+ const llama_token * last_tokens,
40
+ size_t penalty_last_n,
41
+ float penalty_repeat,
42
+ float penalty_freq,
43
+ float penalty_present);
44
+
45
+ void llama_sample_apply_guidance_impl(
46
+ struct llama_sampling * smpl,
47
+ float * logits,
48
+ float * logits_guidance,
49
+ float scale);
50
+
51
+ llama_token llama_sample_token_mirostat_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu);
52
+ llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu);
53
+ llama_token llama_sample_token_greedy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates);
54
+ llama_token llama_sample_token_with_rng_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng);
55
+ llama_token llama_sample_token_impl (struct llama_sampling * smpl, llama_token_data_array * candidates);
56
+