npm - cui-llama.rn - Versions diffs - 1.1.4 → 1.1.6 - Mend

cui-llama.rn 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/android/src/main/CMakeLists.txt +1 -0
package/android/src/main/jni.cpp +3 -4
package/cpp/common.cpp +183 -1990
package/cpp/common.h +101 -130
package/cpp/ggml-impl.h +32 -0
package/cpp/ggml-metal.m +38 -28
package/cpp/ggml-quants.c +275 -84
package/cpp/ggml.c +89 -35
package/cpp/ggml.h +30 -67
package/cpp/llama-impl.h +1 -0
package/cpp/llama-sampling.cpp +218 -102
package/cpp/llama.cpp +599 -120
package/cpp/llama.h +33 -25
package/cpp/log.cpp +401 -0
package/cpp/log.h +85 -703
package/cpp/rn-llama.hpp +9 -11
package/cpp/sampling.cpp +12 -9
package/cpp/sampling.h +4 -56
package/cpp/sgemm.cpp +38 -0
package/package.json +1 -1

package/cpp/llama.h CHANGED Viewed

@@ -344,7 +344,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-      //bool no_perf;     // whether to measure performance timings, TODO: implement
+        bool no_perf;     // whether to measure performance timings
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -1057,6 +1057,9 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
     LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
+    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
+    LLAMA_API struct llama_sampler * llama_sampler_chain_remove(   struct llama_sampler * chain, int32_t i);
     // available samplers:
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy     (void);
@@ -1131,15 +1134,20 @@ extern "C" {
                              int32_t   n_logit_bias,
               const llama_logit_bias * logit_bias);
-    // Shorthand for:
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+    /// @details Sample and accept a token from the idx-th output of the last evaluation
     //
+    // Shorthand for:
     //    const auto * logits = llama_get_logits_ith(ctx, idx);
     //    llama_token_data_array cur_p = { ... init from logits ... };
     //    llama_sampler_apply(smpl, &cur_p);
-    //    return cur_p.data[cur_p.selected].id;
-    //
-    // At this point, this is mostly a convenience function.
-    //
+    //    auto token = cur_p.data[cur_p.selected].id;
+    //    llama_sampler_accept(smpl, token);
+    //    return token;
+    // Returns the sampled token
     LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
     // TODO: extend in the future
@@ -1172,21 +1180,8 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
-    enum llama_perf_type {
-        LLAMA_PERF_TYPE_CONTEXT       = 0,
-        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
-    };
-    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
-    // Keeps timings of samplers
-    LLAMA_API struct llama_sampler_timings {int64_t t_sample_us; int32_t n_sample;};
-    LLAMA_API struct llama_token_timings {
+    struct llama_perf_context_data {
         double t_start_ms;
-        double t_end_ms;
         double t_load_ms;
         double t_p_eval_ms;
         double t_eval_ms;
@@ -1194,11 +1189,24 @@ extern "C" {
         int32_t n_p_eval;
         int32_t n_eval;
     };
-    // helper function for getting timings
-    LLAMA_API struct llama_token_timings llama_get_token_timings(const void * v_ctx) ;
-    LLAMA_API struct llama_sampler_timings llama_sampler_chain_timings(struct llama_sampler * chain);
-    LLAMA_API struct llama_sampler_timings gpt_sampler_get_timigs(const struct gpt_sampler * gsmpl);
+    struct llama_perf_sampler_data {
+        double t_sample_ms;
+        int32_t n_sample;
+    };
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
+    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 #ifdef __cplusplus
 }
 #endif

package/cpp/log.cpp ADDED Viewed

@@ -0,0 +1,401 @@
+#include "log.h"
+#include <chrono>
+#include <condition_variable>
+#include <cstdarg>
+#include <cstdio>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA; // global threshold; gpt_log_entry::print compares it against LOG_DEFAULT_DEBUG to decide whether DEBUG messages reach stderr
+void gpt_log_set_verbosity_thold(int verbosity) { // overwrites the global threshold (plain assignment, no locking)
+    gpt_log_verbosity_thold = verbosity;
+}
+#define LOG_COL_DEFAULT "\033[0m" // terminal escape sequences; copied into g_col by gpt_log::set_colors(true)
+#define LOG_COL_BOLD    "\033[1m"
+#define LOG_COL_RED     "\033[31m"
+#define LOG_COL_GREEN   "\033[32m"
+#define LOG_COL_YELLOW  "\033[33m"
+#define LOG_COL_BLUE    "\033[34m"
+#define LOG_COL_MAGENTA "\033[35m"
+#define LOG_COL_CYAN    "\033[36m"
+#define LOG_COL_WHITE   "\033[37m"
+// Returns a monotonic time stamp in microseconds.
+// The value is only meaningful as a difference between two calls (the logger subtracts
+// its start time from it), so a steady clock is used: unlike the system clock it never
+// jumps backwards or forwards when the wall-clock time is adjusted.
+static int64_t t_us() {
+    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now().time_since_epoch()).count();
+}
+// colors: each value is an index into the g_col table of escape sequences
+enum gpt_log_col : int {
+    GPT_LOG_COL_DEFAULT = 0, // also used by gpt_log_entry::print to reset the color after a message
+    GPT_LOG_COL_BOLD,
+    GPT_LOG_COL_RED,
+    GPT_LOG_COL_GREEN,
+    GPT_LOG_COL_YELLOW,
+    GPT_LOG_COL_BLUE,
+    GPT_LOG_COL_MAGENTA,
+    GPT_LOG_COL_CYAN,
+    GPT_LOG_COL_WHITE,
+};
+// disable colors by default: one slot per gpt_log_col value, all empty until gpt_log::set_colors(true) stores the LOG_COL_* sequences
+static std::vector<const char *> g_col = {
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+};
+// One queued log message together with the formatting options captured when it was added.
+struct gpt_log_entry {
+    enum lm_ggml_log_level level;
+    bool prefix;
+    int64_t timestamp;
+    std::vector<char> msg;
+    // signals the worker thread to stop
+    bool is_end;
+    // Writes the entry to `file`, or to the console when `file` is null.
+    // On the console, entries without a level go to stdout and all others to stderr;
+    // DEBUG entries are skipped there while the verbosity threshold is below LOG_DEFAULT_DEBUG
+    // (they are still written when an explicit file is given).
+    void print(FILE * file = nullptr) const {
+        const bool has_level = level != LM_GGML_LOG_LEVEL_NONE;
+        FILE * out = file;
+        if (out == nullptr) {
+            const bool debug_hidden = gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG;
+            if (level == LM_GGML_LOG_LEVEL_DEBUG && debug_hidden) {
+                return;
+            }
+            out = has_level ? stderr : stdout;
+        }
+        if (has_level && prefix) {
+            if (timestamp != 0) {
+                // [M.s.ms.us]
+                const int64_t total_sec = timestamp / 1000000;
+                const int minutes = (int) (total_sec / 60);
+                const int seconds = (int) (total_sec % 60);
+                const int millis  = (int) (timestamp / 1000 % 1000);
+                const int micros  = (int) (timestamp % 1000);
+                fprintf(out, "%s%d.%02d.%03d.%03d%s ",
+                        g_col[GPT_LOG_COL_BLUE], minutes, seconds, millis, micros, g_col[GPT_LOG_COL_DEFAULT]);
+            }
+            // INFO resets the color right after its tag; the other levels keep it for the message text
+            switch (level) {
+                case LM_GGML_LOG_LEVEL_INFO:
+                    fprintf(out, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]);
+                    break;
+                case LM_GGML_LOG_LEVEL_WARN:
+                    fprintf(out, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "");
+                    break;
+                case LM_GGML_LOG_LEVEL_ERROR:
+                    fprintf(out, "%sE %s", g_col[GPT_LOG_COL_RED], "");
+                    break;
+                case LM_GGML_LOG_LEVEL_DEBUG:
+                    fprintf(out, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "");
+                    break;
+                default:
+                    break;
+            }
+        }
+        fprintf(out, "%s", msg.data());
+        // close the color that may have been left open by the level tag above
+        switch (level) {
+            case LM_GGML_LOG_LEVEL_WARN:
+            case LM_GGML_LOG_LEVEL_ERROR:
+            case LM_GGML_LOG_LEVEL_DEBUG:
+                fprintf(out, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+                break;
+            default:
+                break;
+        }
+        fflush(out);
+    }
+};
+// Asynchronous logger.
+// add() formats a message into the next slot of a ring buffer while holding the mutex;
+// a worker thread pops the slots one by one and prints them to stdout/stderr and, when
+// one has been set, to a log file. pause() stops the worker thread and resume() starts
+// it again; messages added while the logger is paused are discarded.
+struct gpt_log {
+    // default capacity - will be expanded if needed
+    gpt_log() : gpt_log(256) {}
+    // capacity: initial number of ring buffer slots (at least one slot is always allocated)
+    gpt_log(size_t capacity) {
+        file = nullptr;
+        prefix = false;
+        timestamps = false;
+        running = false;
+        t_start = t_us();
+        // an empty ring would turn every "% entries.size()" below into a modulo by zero
+        entries.resize(capacity > 0 ? capacity : 1);
+        // initial message size - will be expanded if longer messages arrive
+        for (auto & entry : entries) {
+            entry.msg.resize(256);
+        }
+        head = 0;
+        tail = 0;
+        resume();
+    }
+    ~gpt_log() {
+        pause();
+        if (file) {
+            fclose(file);
+        }
+    }
+private:
+    std::mutex mtx;
+    std::thread thrd;
+    std::condition_variable cv;
+    FILE * file;
+    bool prefix;
+    bool timestamps;
+    bool running;
+    int64_t t_start;
+    // ring buffer of entries
+    std::vector<gpt_log_entry> entries;
+    size_t head;
+    size_t tail;
+    // worker thread copies into this
+    gpt_log_entry cur;
+    // Marks entries[tail] as queued and doubles the ring buffer if that filled it up, so
+    // that head == tail always means "empty" and never "full".
+    // The caller must hold mtx. References into `entries` are invalidated when it grows.
+    void commit_tail() {
+        tail = (tail + 1) % entries.size();
+        if (tail != head) {
+            return;
+        }
+        // expand the buffer
+        std::vector<gpt_log_entry> new_entries(2*entries.size());
+        size_t new_tail = 0;
+        do {
+            new_entries[new_tail] = std::move(entries[head]);
+            head     = (head     + 1) % entries.size();
+            new_tail = (new_tail + 1);
+        } while (head != tail);
+        head = 0;
+        tail = new_tail;
+        for (size_t i = tail; i < new_entries.size(); i++) {
+            new_entries[i].msg.resize(256);
+        }
+        entries = std::move(new_entries);
+    }
+public:
+    // Formats a printf-style message and queues it for the worker thread.
+    // The message is dropped when the logger is paused or when formatting fails.
+    void add(enum lm_ggml_log_level level, const char * fmt, va_list args) {
+        std::lock_guard<std::mutex> lock(mtx);
+        if (!running) {
+            // discard messages while the worker thread is paused
+            return;
+        }
+        auto & entry = entries[tail];
+        {
+            // cannot use args twice, so make a copy in case we need to expand the buffer
+            va_list args_copy;
+            va_copy(args_copy, args);
+#if 1
+            const char * fmt_cur = fmt;
+#else
+            // hack for bolding arguments
+            std::stringstream ss;
+            for (int i = 0; fmt[i] != 0; i++) {
+                if (fmt[i] == '%') {
+                    ss << LOG_COL_BOLD;
+                    while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
+                    ss << LOG_COL_DEFAULT;
+                    if (fmt[i] == 0) break;
+                }
+                ss << fmt[i];
+            }
+            const std::string fmt_bold = ss.str();
+            const char * fmt_cur = fmt_bold.c_str();
+#endif
+            // vsnprintf returns a negative value on error - keep it signed so that the
+            // error is not mistaken for an enormous message length
+            const int n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt_cur, args);
+            if (n < 0) {
+                // formatting failed - drop the message, the slot is simply reused next time
+                va_end(args_copy);
+                return;
+            }
+            if ((size_t) n >= entry.msg.size()) {
+                entry.msg.resize((size_t) n + 1);
+                vsnprintf(entry.msg.data(), entry.msg.size(), fmt_cur, args_copy);
+            }
+            // every va_copy has to be matched by a va_end
+            va_end(args_copy);
+        }
+        entry.level = level;
+        entry.prefix = prefix;
+        entry.timestamp = 0;
+        if (timestamps) {
+            entry.timestamp = t_us() - t_start;
+        }
+        entry.is_end = false;
+        // note: `entry` must not be used past this point, the buffer may be reallocated
+        commit_tail();
+        cv.notify_one();
+    }
+    // Starts the worker thread; does nothing if it is already running.
+    void resume() {
+        std::lock_guard<std::mutex> lock(mtx);
+        if (running) {
+            return;
+        }
+        running = true;
+        thrd = std::thread([this]() {
+            while (true) {
+                {
+                    std::unique_lock<std::mutex> lock(mtx);
+                    cv.wait(lock, [this]() { return head != tail; });
+                    // copy the entry so that printing happens without holding the mutex
+                    cur = entries[head];
+                    head = (head + 1) % entries.size();
+                }
+                if (cur.is_end) {
+                    break;
+                }
+                cur.print(); // stdout and stderr
+                if (file) {
+                    cur.print(file);
+                }
+            }
+        });
+    }
+    // Stops the worker thread after it has printed everything queued so far;
+    // does nothing if the logger is already paused.
+    void pause() {
+        {
+            std::lock_guard<std::mutex> lock(mtx);
+            if (!running) {
+                return;
+            }
+            running = false;
+            // push an entry to signal the worker thread to stop
+            // this goes through commit_tail() like any other entry: if the stop entry took
+            // the last free slot without growing the buffer, the ring would look empty and
+            // the worker would never wake up, leaving join() below blocked forever
+            entries[tail].is_end = true;
+            commit_tail();
+            cv.notify_one();
+        }
+        thrd.join();
+    }
+    // Closes the current log file, if any, and opens `path` for writing ("w" mode).
+    // A null path disables file logging. If the file cannot be opened, file logging
+    // stays disabled (fopen returns null).
+    void set_file(const char * path) {
+        pause();
+        if (file) {
+            fclose(file);
+        }
+        if (path) {
+            file = fopen(path, "w");
+        } else {
+            file = nullptr;
+        }
+        resume();
+    }
+    // Fills the g_col table with the escape sequences (colors == true) or with empty
+    // strings (colors == false). g_col is a file-level table, not a member of this logger.
+    void set_colors(bool colors) {
+        pause();
+        if (colors) {
+            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
+            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
+            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
+            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
+            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
+            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
+        } else {
+            for (size_t i = 0; i < g_col.size(); i++) {
+                g_col[i] = "";
+            }
+        }
+        resume();
+    }
+    // Enables or disables the level tag (and timestamp) prefix for subsequently added messages.
+    void set_prefix(bool prefix) {
+        std::lock_guard<std::mutex> lock(mtx);
+        this->prefix = prefix;
+    }
+    // Enables or disables recording the elapsed time for subsequently added messages.
+    void set_timestamps(bool timestamps) {
+        std::lock_guard<std::mutex> lock(mtx);
+        this->timestamps = timestamps;
+    }
+};
+//
+// public API: free functions that forward to the gpt_log methods of the same name
+//
+struct gpt_log * gpt_log_init() { // allocates a logger with new; release it with gpt_log_free
+    return new gpt_log;
+}
+struct gpt_log * gpt_log_main() { // returns the address of a function-local static logger, constructed on the first call
+    static struct gpt_log log;
+    return &log;
+}
+void gpt_log_pause(struct gpt_log * log) { // stops the worker thread; messages added while paused are discarded
+    log->pause();
+}
+void gpt_log_resume(struct gpt_log * log) { // restarts the worker thread; no-op if it is already running
+    log->resume();
+}
+void gpt_log_free(struct gpt_log * log) { // deletes the logger; only valid for pointers from gpt_log_init (gpt_log_main returns a static object)
+    delete log;
+}
+void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...) { // printf-style: packs the variadic arguments into a va_list for gpt_log::add
+    va_list args;
+    va_start(args, fmt);
+    log->add(level, fmt, args);
+    va_end(args);
+}
+void gpt_log_set_file(struct gpt_log * log, const char * file) { // file is a path opened with mode "w"; a null path turns file logging off
+    log->set_file(file);
+}
+void gpt_log_set_colors(struct gpt_log * log, bool colors) { // switches the file-level g_col color table between escape sequences and empty strings
+    log->set_colors(colors);
+}
+void gpt_log_set_prefix(struct gpt_log * log, bool prefix) { // toggles the per-message prefix for messages added afterwards
+    log->set_prefix(prefix);
+}
+void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) { // toggles timestamp capture for messages added afterwards
+    log->set_timestamps(timestamps);
+}