npm - cui-llama.rn - Versions diffs - 1.3.6 → 1.4.1 - Mend

cui-llama.rn 1.3.6 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

package/README.md +22 -1
package/android/src/main/CMakeLists.txt +25 -26
package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
package/android/src/main/jni-utils.h +94 -0
package/android/src/main/jni.cpp +133 -63
package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
package/cpp/common.cpp +2085 -1982
package/cpp/common.h +696 -664
package/cpp/ggml-alloc.c +1042 -1037
package/cpp/ggml-backend-impl.h +255 -256
package/cpp/ggml-backend-reg.cpp +582 -582
package/cpp/ggml-backend.cpp +2002 -2002
package/cpp/ggml-backend.h +354 -352
package/cpp/ggml-common.h +1853 -1853
package/cpp/ggml-cpp.h +39 -39
package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
package/cpp/ggml-cpu-aarch64.h +8 -8
package/cpp/ggml-cpu-impl.h +386 -386
package/cpp/ggml-cpu-quants.c +10920 -10839
package/cpp/ggml-cpu-traits.cpp +36 -36
package/cpp/ggml-cpu-traits.h +38 -38
package/cpp/ggml-cpu.c +14391 -14122
package/cpp/ggml-cpu.cpp +635 -627
package/cpp/ggml-cpu.h +135 -135
package/cpp/ggml-impl.h +567 -567
package/cpp/ggml-metal-impl.h +288 -0
package/cpp/ggml-metal.m +4884 -4884
package/cpp/ggml-opt.cpp +854 -0
package/cpp/ggml-opt.h +216 -0
package/cpp/ggml-quants.c +5238 -5238
package/cpp/ggml-threading.h +14 -14
package/cpp/ggml.c +6514 -6448
package/cpp/ggml.h +2194 -2163
package/cpp/gguf.cpp +1329 -1325
package/cpp/gguf.h +202 -202
package/cpp/json-schema-to-grammar.cpp +1045 -1045
package/cpp/json-schema-to-grammar.h +8 -8
package/cpp/json.hpp +24766 -24766
package/cpp/llama-adapter.cpp +347 -346
package/cpp/llama-adapter.h +74 -73
package/cpp/llama-arch.cpp +1487 -1434
package/cpp/llama-arch.h +400 -395
package/cpp/llama-batch.cpp +368 -368
package/cpp/llama-batch.h +88 -88
package/cpp/llama-chat.cpp +578 -567
package/cpp/llama-chat.h +52 -51
package/cpp/llama-context.cpp +1775 -1771
package/cpp/llama-context.h +128 -128
package/cpp/llama-cparams.cpp +1 -1
package/cpp/llama-cparams.h +37 -37
package/cpp/llama-cpp.h +30 -30
package/cpp/llama-grammar.cpp +1139 -1139
package/cpp/llama-grammar.h +143 -143
package/cpp/llama-hparams.cpp +71 -71
package/cpp/llama-hparams.h +139 -140
package/cpp/llama-impl.cpp +167 -167
package/cpp/llama-impl.h +61 -61
package/cpp/llama-kv-cache.cpp +718 -718
package/cpp/llama-kv-cache.h +218 -218
package/cpp/llama-mmap.cpp +590 -589
package/cpp/llama-mmap.h +67 -67
package/cpp/llama-model-loader.cpp +1124 -1011
package/cpp/llama-model-loader.h +167 -158
package/cpp/llama-model.cpp +3997 -2202
package/cpp/llama-model.h +370 -391
package/cpp/llama-sampling.cpp +2408 -2406
package/cpp/llama-sampling.h +32 -48
package/cpp/llama-vocab.cpp +3247 -1982
package/cpp/llama-vocab.h +125 -182
package/cpp/llama.cpp +10077 -12544
package/cpp/llama.h +1323 -1285
package/cpp/log.cpp +401 -401
package/cpp/log.h +121 -121
package/cpp/rn-llama.hpp +123 -116
package/cpp/sampling.cpp +505 -500
package/cpp/sgemm.cpp +2597 -2597
package/cpp/sgemm.h +14 -14
package/cpp/speculative.cpp +277 -274
package/cpp/speculative.h +28 -28
package/cpp/unicode.cpp +2 -3
package/ios/RNLlama.mm +47 -0
package/ios/RNLlamaContext.h +3 -1
package/ios/RNLlamaContext.mm +71 -14
package/jest/mock.js +15 -3
package/lib/commonjs/NativeRNLlama.js.map +1 -1
package/lib/commonjs/index.js +33 -37
package/lib/commonjs/index.js.map +1 -1
package/lib/module/NativeRNLlama.js.map +1 -1
package/lib/module/index.js +31 -35
package/lib/module/index.js.map +1 -1
package/lib/typescript/NativeRNLlama.d.ts +26 -6
package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
package/lib/typescript/index.d.ts +21 -36
package/lib/typescript/index.d.ts.map +1 -1
package/llama-rn.podspec +4 -18
package/package.json +2 -3
package/src/NativeRNLlama.ts +32 -13
package/src/index.ts +52 -47
package/cpp/llama.cpp.rej +0 -23

package/cpp/sgemm.h CHANGED Viewed

@@ -1,14 +1,14 @@
-#pragma once
-#include <stdint.h>
-#include <stdbool.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-bool llamafile_sgemm(const struct lm_lm_ggml_compute_params * params, int64_t, int64_t, int64_t,
-                     const void *, int64_t, const void *, int64_t, void *, int64_t,
-                     int, int, int);
-#ifdef __cplusplus
-}
-#endif
+#pragma once
+#include <stdint.h>
+#include <stdbool.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Declared with C linkage when this header is included from C++ (see the extern "C" guard).
+// Only the first parameter is named in this declaration; the rest are positional.
+// NOTE(review): the meaning of the unnamed parameters is not documented here - confirm
+// against the definition in sgemm.cpp before relying on their order.
+bool llamafile_sgemm(const struct lm_ggml_compute_params * params, int64_t, int64_t, int64_t,
+                     const void *, int64_t, const void *, int64_t, void *, int64_t,
+                     int, int, int);
+#ifdef __cplusplus
+}
+#endif

package/cpp/speculative.cpp CHANGED Viewed

@@ -1,274 +1,277 @@
-#include "speculative.h"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include <cstring>
-#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
-#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
-struct common_speculative {
-    struct llama_context * ctx;
-    struct common_sampler * smpl;
-    llama_batch batch;
-    llama_tokens prompt;
-};
-struct common_speculative * common_speculative_init(
-        struct llama_context * ctx_dft) {
-    auto * result = new common_speculative {
-        /* .ctx    = */ ctx_dft,
-        /* .smpl   = */ nullptr,
-        /* .batch  = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
-        /* .prompt = */ {},
-    };
-    // TODO: optimize or pass from outside?
-#if 0
-    {
-        common_params_sampling params;
-        params.no_perf = false;
-        params.top_k = 40;
-        params.top_p = 0.9;
-        params.samplers = {
-            COMMON_SAMPLER_TYPE_TOP_K,
-            COMMON_SAMPLER_TYPE_TOP_P,
-            COMMON_SAMPLER_TYPE_INFILL,
-        };
-        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
-    }
-#else
-    {
-        common_params_sampling params;
-        params.no_perf = false;
-        params.top_k = 10;
-        params.samplers = {
-            COMMON_SAMPLER_TYPE_TOP_K,
-        };
-        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
-    }
-#endif
-    return result;
-}
-void common_speculative_free(struct common_speculative * spec) {
-    if (spec == nullptr) {
-        return;
-    }
-    common_sampler_free(spec->smpl);
-    llama_batch_free(spec->batch);
-    delete spec;
-}
-bool common_speculative_are_compatible(
-        const struct llama_context * ctx_tgt,
-        const struct llama_context * ctx_dft) {
-    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
-    const struct llama_model * model_dft = llama_get_model(ctx_dft);
-    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
-    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
-    const bool vocab_type_dft = llama_vocab_type(model_dft);
-    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
-    if (vocab_type_tgt != vocab_type_dft) {
-        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
-                     "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
-        return false;
-    }
-    if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
-        llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
-        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
-        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
-        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
-        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
-        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
-        return false;
-    }
-    {
-        const int n_vocab_tgt = llama_n_vocab(model_tgt);
-        const int n_vocab_dft = llama_n_vocab(model_dft);
-        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
-        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
-                         "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
-                    __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
-            return false;
-        }
-        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
-            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
-            const char * token_text_dft = llama_token_get_text(model_dft, i);
-            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-                LOG_ERR("%s: draft model vocab must match target model to use speculation but "
-                             "token %d content differs - target '%s', draft '%s'\n", __func__, i,
-                        common_token_to_piece(ctx_tgt, i).c_str(),
-                        common_token_to_piece(ctx_dft, i).c_str());
-                return false;
-            }
-        }
-    }
-    return true;
-}
-llama_tokens common_speculative_gen_draft(
-        struct common_speculative * spec,
-        struct common_speculative_params params,
-        const llama_tokens & prompt_tgt,
-        llama_token id_last) {
-    auto & batch  = spec->batch;
-    auto & ctx    = spec->ctx;
-    auto & smpl   = spec->smpl;
-    auto & prompt = spec->prompt;
-    int reuse_i = 0;
-    int reuse_n = 0;
-    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
-    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
-    // reuse as much as possible from the old draft context
-    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
-    for (int i = 0; i < (int) prompt.size(); ++i) {
-        int cur = 0;
-        while (i_start + cur < (int) prompt_tgt.size() &&
-               i       + cur < (int) prompt.size() &&
-               prompt_tgt[i_start + cur] == prompt[i + cur]) {
-            cur++;
-        }
-        if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
-            reuse_i = i;
-            reuse_n = cur;
-        }
-    }
-    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
-    llama_tokens result;
-    result.reserve(params.n_draft);
-    if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
-        prompt.clear();
-    } else {
-        // this happens when a previous draft has been discarded (for example, due to being too small), but the
-        // target model agreed with it. in this case, we simply pass back the previous results to save compute
-        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
-            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
-                result.push_back(prompt[i]);
-                if (params.n_draft <= (int) result.size()) {
-                    break;
-                }
-            }
-            return result;
-        }
-        if (reuse_i > 0) {
-            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
-            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
-            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
-        }
-        if (reuse_n < (int) prompt.size()) {
-            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
-            prompt.erase(prompt.begin() + reuse_n, prompt.end());
-        }
-    }
-    // prepare a batch to evaluate any new tokens in the prompt
-    common_batch_clear(batch);
-    for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
-        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
-        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
-        prompt.push_back(prompt_tgt[i]);
-    }
-    // we should rarely end-up here during normal decoding
-    if (batch.n_tokens > 0) {
-        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
-        llama_decode(ctx, batch);
-    }
-    const llama_pos n_past = prompt.size();
-    LOG_DBG("%s: n_past = %d\n", __func__, n_past);
-    common_batch_clear(batch);
-    common_batch_add  (batch, id_last, n_past, { 0 }, true);
-    prompt.push_back(id_last);
-    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
-    llama_decode(ctx, batch);
-    common_sampler_reset(smpl);
-    // sample n_draft tokens from the draft model
-    for (int i = 0; i < params.n_draft; ++i) {
-        common_batch_clear(batch);
-        common_sampler_sample(smpl, ctx, 0, true);
-        const auto * cur_p = common_sampler_get_candidates(smpl);
-        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
-            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
-                    k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
-        }
-        // add drafted token for each sequence
-        const llama_token id = cur_p->data[0].id;
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-        common_sampler_accept(smpl, id, true);
-        result.push_back(id);
-        if (params.n_draft <= (int) result.size()) {
-            break;
-        }
-        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
-        // evaluate the drafted tokens on the draft model
-        llama_decode(ctx, batch);
-        prompt.push_back(id);
-    }
-    return result;
-}
+#include "speculative.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include <cstring>
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
+#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
+// State kept between calls to common_speculative_gen_draft for a single draft context.
+struct common_speculative {
+    // draft context, stored exactly as passed to common_speculative_init
+    struct llama_context * ctx;
+    // sampler used to pick draft tokens; released in common_speculative_free
+    struct common_sampler * smpl;
+    // scratch batch reused for every llama_decode call; released in common_speculative_free
+    llama_batch batch;
+    // tokens that common_speculative_gen_draft has recorded as fed to ctx so far
+    llama_tokens prompt;
+};
+// Creates the speculative-decoding state for the draft context ctx_dft.
+// The returned object owns a batch sized by llama_n_batch(ctx_dft) and a sampler;
+// release it with common_speculative_free.
+struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_dft) {
+    // sampler settings for drafting
+    // TODO: optimize or pass from outside?
+    common_params_sampling sparams;
+    sparams.no_perf = false;
+#if 0
+    sparams.top_k = 40;
+    sparams.top_p = 0.9;
+    sparams.samplers = {
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_INFILL,
+    };
+#else
+    sparams.top_k = 10;
+    sparams.samplers = {
+        COMMON_SAMPLER_TYPE_TOP_K,
+    };
+#endif
+    // the sampler slot starts empty and is filled in right after the batch has been allocated
+    auto * spec = new common_speculative {
+        /* .ctx    = */ ctx_dft,
+        /* .smpl   = */ nullptr,
+        /* .batch  = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
+        /* .prompt = */ {},
+    };
+    spec->smpl = common_sampler_init(llama_get_model(ctx_dft), sparams);
+    return spec;
+}
+// Releases the sampler, the batch and the state object itself.
+// Passing a null pointer is allowed and does nothing.
+void common_speculative_free(struct common_speculative * spec) {
+    if (spec != nullptr) {
+        common_sampler_free(spec->smpl);
+        llama_batch_free(spec->batch);
+        delete spec;
+    }
+}
+// Checks whether the draft context can be used for speculative decoding with the target context.
+//
+// Returns true only if all of the following hold for the vocabularies of the two models:
+//   - the vocab types are equal
+//   - the add-BOS / add-EOS flags and the BOS / EOS token ids are equal
+//   - the vocab sizes differ by at most SPEC_VOCAB_MAX_SIZE_DIFFERENCE
+//   - the token text is identical for every id in
+//     [SPEC_VOCAB_CHECK_START_TOKEN_ID, min(target size, draft size))
+// The first failing check logs an error and returns false.
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft) {
+    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
+    const struct llama_model * model_dft = llama_get_model(ctx_dft);
+    const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
+    const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
+    // keep the full enum value: narrowing it to bool would make every non-zero vocab type
+    // compare equal, so a mismatch between two different real vocab types would go unnoticed
+    const auto vocab_type_tgt = llama_vocab_type(vocab_tgt);
+    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, (int) vocab_type_tgt);
+    const auto vocab_type_dft = llama_vocab_type(vocab_dft);
+    LOG_DBG("%s: vocab_type dft: %d\n", __func__, (int) vocab_type_dft);
+    if (vocab_type_tgt != vocab_type_dft) {
+        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
+                     "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, (int) vocab_type_dft, (int) vocab_type_tgt);
+        return false;
+    }
+    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
+        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
+        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
+        LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
+        return false;
+    }
+    {
+        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
+        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
+        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
+        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
+            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
+                         "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    __func__, n_vocab_tgt, n_vocab_dft, vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+            return false;
+        }
+        // only ids present in both vocabularies can be compared
+        const int n_vocab_common = std::min(n_vocab_tgt, n_vocab_dft);
+        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < n_vocab_common; ++i) {
+            const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
+            const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
+            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
+                LOG_ERR("%s: draft vocab must match target vocab to use speculation but "
+                             "token %d content differs - target '%s', draft '%s'\n", __func__, i,
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
+                return false;
+            }
+        }
+    }
+    return true;
+}
+// Generates up to params.n_draft draft tokens that continue prompt_tgt followed by id_last.
+//
+// spec       - speculative state; its batch, ctx, smpl and prompt members are read and updated
+// params     - n_draft (max tokens returned) plus the n_reuse and p_min thresholds used below
+// prompt_tgt - tokens of the target prompt (presumably excluding id_last, which is appended
+//              separately here - confirm against callers)
+// id_last    - token decoded after the prompt; sampling starts from its output
+//
+// Returns the drafted tokens. The result can be shorter than n_draft, or empty, when the top
+// candidate's probability drops below params.p_min, or when previously drafted tokens are
+// handed back from spec->prompt without running the draft model again.
+// NOTE(review): the return value of llama_decode is not checked anywhere in this function.
+llama_tokens common_speculative_gen_draft(
+        struct common_speculative * spec,
+        struct common_speculative_params params,
+        const llama_tokens & prompt_tgt,
+        llama_token id_last) {
+    auto & batch  = spec->batch;
+    auto & ctx    = spec->ctx;
+    auto & smpl   = spec->smpl;
+    auto & prompt = spec->prompt;
+    // start index and length of the best reusable span found inside the stored draft prompt
+    int reuse_i = 0;
+    int reuse_n = 0;
+    // draft context size minus the n_draft positions set aside for the drafted tokens
+    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
+    // only the last n_ctx tokens of prompt_tgt are used; i_start is the index of the first of them
+    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
+    // reuse as much as possible from the old draft context
+    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
+    for (int i = 0; i < (int) prompt.size(); ++i) {
+        // cur = length of the run where prompt[i...] matches prompt_tgt[i_start...]
+        int cur = 0;
+        while (i_start + cur < (int) prompt_tgt.size() &&
+               i       + cur < (int) prompt.size() &&
+               prompt_tgt[i_start + cur] == prompt[i + cur]) {
+            cur++;
+        }
+        // a run qualifies if it reaches n_reuse tokens or the whole target prompt fits in n_ctx;
+        // among qualifying runs the strictly longest one wins (the earliest on ties)
+        if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
+            reuse_i = i;
+            reuse_n = cur;
+        }
+    }
+    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
+    llama_tokens result;
+    result.reserve(params.n_draft);
+    if (reuse_n == 0) {
+        // nothing can be reused: start over with an empty cache and an empty stored prompt
+        llama_kv_cache_clear(ctx);
+        prompt.clear();
+    } else {
+        // this happens when a previous draft has been discarded (for example, due to being too small), but the
+        // target model agreed with it. in this case, we simply pass back the previous results to save compute
+        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
+            // copy the stored tokens that follow id_last, at most n_draft of them
+            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
+                result.push_back(prompt[i]);
+                if (params.n_draft <= (int) result.size()) {
+                    break;
+                }
+            }
+            return result;
+        }
+        if (reuse_i > 0) {
+            // discard everything before the reused span: the cache calls address sequence 0,
+            // range [0, reuse_i), then apply a -reuse_i offset from reuse_i onward
+            // (exact cache semantics are defined in llama.h - confirm there)
+            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
+            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
+        }
+        if (reuse_n < (int) prompt.size()) {
+            // discard everything after the reused span, both in the cache and in the stored prompt
+            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+            prompt.erase(prompt.begin() + reuse_n, prompt.end());
+        }
+    }
+    // prepare a batch to evaluate any new tokens in the prompt
+    common_batch_clear(batch);
+    for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
+        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
+        // positions are relative to i_start; the last argument is false for these prompt tokens
+        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
+        prompt.push_back(prompt_tgt[i]);
+    }
+    // we should rarely end-up here during normal decoding
+    if (batch.n_tokens > 0) {
+        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
+        llama_decode(ctx, batch);
+    }
+    // position of id_last = number of tokens stored in prompt so far
+    const llama_pos n_past = prompt.size();
+    LOG_DBG("%s: n_past = %d\n", __func__, n_past);
+    common_batch_clear(batch);
+    common_batch_add  (batch, id_last, n_past, { 0 }, true);
+    prompt.push_back(id_last);
+    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
+    llama_decode(ctx, batch);
+    common_sampler_reset(smpl);
+    // sample n_draft tokens from the draft model
+    for (int i = 0; i < params.n_draft; ++i) {
+        common_batch_clear(batch);
+        common_sampler_sample(smpl, ctx, 0, true);
+        const auto * cur_p = common_sampler_get_candidates(smpl);
+        // debug output for at most the first three candidates
+        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
+        }
+        // add drafted token for each sequence
+        // NOTE(review): data[0] is read without checking cur_p->size - confirm it cannot be 0 here
+        const llama_token id = cur_p->data[0].id;
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+        common_sampler_accept(smpl, id, true);
+        result.push_back(id);
+        // once the limit is reached the last token is returned without being decoded
+        // or appended to the stored prompt
+        if (params.n_draft <= (int) result.size()) {
+            break;
+        }
+        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
+        // evaluate the drafted tokens on the draft model
+        llama_decode(ctx, batch);
+        prompt.push_back(id);
+    }
+    return result;
+}

package/cpp/speculative.h CHANGED Viewed

@@ -1,28 +1,28 @@
-#pragma once
-#include "llama.h"
-#include "common.h"
-struct common_speculative;
-struct common_speculative_params {
-    int n_draft = 16;  // max drafted tokens
-    int n_reuse = 256;
-    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
-};
-struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
-void common_speculative_free(struct common_speculative * spec);
-bool common_speculative_are_compatible(
-        const struct llama_context * ctx_tgt,
-        const struct llama_context * ctx_dft);
-// sample up to n_draft tokens and add them to the batch using the draft model
-llama_tokens common_speculative_gen_draft(
-               struct common_speculative * spec,
-        struct common_speculative_params   params,
-                      const llama_tokens & prompt,
-                             llama_token   id_last);
+#pragma once
+#include "llama.h"
+#include "common.h"
+struct common_speculative;
+// Tuning knobs for common_speculative_gen_draft.
+struct common_speculative_params {
+    int n_draft = 16;  // max drafted tokens
+    int n_reuse = 256; // min length of a matching token run needed to reuse the stored draft prompt
+    float p_min = 0.9f; // min probability required to accept a token in the draft
+};
+// create the speculative state for the draft context; release it with common_speculative_free
+struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+// release state created by common_speculative_init
+void common_speculative_free(struct common_speculative * spec);
+// check that the draft context's model can be used to draft tokens for the target context's model
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft);
+// sample up to n_draft tokens using the draft model and return them
+llama_tokens common_speculative_gen_draft(
+               struct common_speculative * spec,
+        struct common_speculative_params   params,
+                      const llama_tokens & prompt,
+                             llama_token   id_last);