cui-llama.rn 1.2.3 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/common.h CHANGED
@@ -24,12 +24,12 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct llama_lora_adapter_info {
+struct common_lora_adapter_info {
     std::string path;
     float scale;
 };
 
-struct llama_lora_adapter_container : llama_lora_adapter_info {
+struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;
 
-struct llama_control_vector_load_info;
+struct common_control_vector_load_info;
 
 #define print_build_info() do { \
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
@@ -93,15 +93,17 @@ enum llama_example {
     LLAMA_EXAMPLE_COUNT,
 };
 
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-    GPT_SAMPLER_TYPE_XTC         = 7,
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE        = 0,
+    COMMON_SAMPLER_TYPE_DRY         = 1,
+    COMMON_SAMPLER_TYPE_TOP_K       = 2,
+    COMMON_SAMPLER_TYPE_TOP_P       = 3,
+    COMMON_SAMPLER_TYPE_MIN_P       = 4,
+    //COMMON_SAMPLER_TYPE_TFS_Z     = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC         = 8,
+    COMMON_SAMPLER_TYPE_INFILL      = 9,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -111,41 +113,47 @@ enum dimre_method {
 };
 
 // sampler parameters
-struct gpt_sampler_params {
+struct common_sampler_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev = 64; // number of previous tokens to remember
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float min_p = 0.05f; // 0.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float xtc_t = 0.0f; // 0.0 = disabled
-    float xtc_p = 0.0f;
-    float typ_p = 1.00f; // typical_p, 1.0 = disabled
-    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float dynatemp_range = 0.00f; // 0.0 = disabled
-    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float penalty_repeat = 1.00f; // 1.0 = disabled
-    float penalty_freq = 0.00f; // 0.0 = disabled
-    float penalty_present = 0.00f; // 0.0 = disabled
-    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float mirostat_tau = 5.00f; // target entropy
-    float mirostat_eta = 0.10f; // learning rate
-    bool penalize_nl = false; // consider newlines as a repeatable token
-    bool ignore_eos = false;
-    bool no_perf = false; // disable performance metrics
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE,
-        GPT_SAMPLER_TYPE_XTC
+
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float xtc_probability = 0.00f; // 0.0 = disabled
+    float xtc_threshold = 0.10f; // > 0.5 disables XTC
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
+    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
+
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
     };
 
     std::string grammar; // optional BNF-like grammar to constrain sampling
@@ -156,13 +164,13 @@ struct gpt_sampler_params {
     std::string print() const;
 };
 
-struct gpt_params {
+struct common_params {
 
     void * progress_callback_user_data = nullptr;
     llama_progress_callback progress_callback = nullptr;
     bool vocab_only = false;
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 0; // context size
+    int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -202,7 +210,7 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct gpt_sampler_params sparams;
+    struct common_sampler_params sparams;
 
     std::string model = ""; // model path // NOLINT
     std::string model_draft = ""; // draft model for speculative decoding // NOLINT
@@ -227,9 +235,9 @@ struct gpt_params {
     std::vector<llama_model_kv_override> kv_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
-    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -287,21 +295,21 @@ struct gpt_params {
 
     // embedding
     bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep = "\n"; // separator of embendings
+    std::string embd_sep = "\n"; // separator of embeddings
     bool reranking = false; // enable reranking support on server
 
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-    int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
@@ -367,20 +375,31 @@ struct gpt_params {
 
 // call once at the start of a program if it uses libcommon
 // initializes the logging system and prints info about the build
-void gpt_init();
+void common_init();
 
-std::string gpt_params_get_system_info(const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);
 
-bool parse_cpu_range(const std::string& range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum lm_ggml_sched_priority prio);
 
 //
 // String utils
 //
 
-std::vector<std::string> string_split(std::string input, char separator);
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
 
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
@@ -389,6 +408,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
     std::vector<T> values;
     std::istringstream str_stream(str);
     std::string token;
@@ -401,6 +421,22 @@ static std::vector<T> string_split(const std::string & str, char delim) {
     return values;
 }
 
+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
@@ -423,29 +459,29 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-struct llama_init_result {
+struct common_init_result {
     struct llama_model * model = nullptr;
     struct llama_context * context = nullptr;
-    std::vector<llama_lora_adapter_container> lora_adapters;
+    std::vector<common_lora_adapter_container> lora_adapters;
 };
 
-struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+struct common_init_result common_init_from_params(common_params & params);
 
-struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
 // Batch utils
 
-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);
 
-void llama_batch_add(
+void common_batch_add(
     struct llama_batch & batch,
     llama_token id,
     llama_pos pos,
@@ -458,13 +494,13 @@ void llama_batch_add(
 
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
     bool add_special,
     bool parse_special = false);
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
     const std::string & text,
     bool add_special,
@@ -472,7 +508,7 @@ std::vector<llama_token> llama_tokenize(
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
     const struct llama_context * ctx,
     llama_token token,
     bool special = true);
@@ -480,7 +516,7 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_detokenize(
     llama_context * ctx,
     const std::vector<llama_token> & tokens,
     bool special = true);
@@ -490,31 +526,31 @@ std::string llama_detokenize(
 //
 
 // same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
+struct common_chat_msg {
     std::string role;
     std::string content;
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & chat,
+        const std::vector<common_chat_msg> & chat,
         bool add_ass);
 
 // Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & past_msg,
-        const llama_chat_msg & new_msg,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
        bool add_ass);
 
 // Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
         const std::string & tmpl);
 
 //
@@ -522,31 +558,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
 //
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 
-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
 //
 // Control vector utils
 //
 
-struct llama_control_vector_data {
+struct common_control_vector_data {
     int n_embd;
 
     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };
 
-struct llama_control_vector_load_info {
+struct common_control_vector_load_info {
     float strength;
 
     std::string fname;
@@ -554,7 +590,7 @@ struct llama_control_vector_load_info {
 
 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
 
 //
 // Split utils
@@ -573,5 +609,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
 void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
 
 void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    FILE * stream, const common_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
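
For code that consumes cpp/common.h, the substance of this diff is a rename of the gpt_*/llama_* common-layer helpers to a common_* prefix, plus new DRY and XTC sampler parameters and a default n_ctx of 4096. The sketch below is a minimal, hypothetical migration example based only on the declarations shown above; it is not part of the package, the model path and sampler values are placeholders, and cleanup of the returned model/context is omitted.

// Hypothetical migration sketch (not shipped with the package): caller code updated from
// the old gpt_* / llama_* helper names to the common_* names declared in this common.h.
#include "common.h"

#include <cstdio>
#include <vector>

int main() {
    common_params params;                              // was: gpt_params
    params.model = "models/7B/ggml-model-f16.gguf";    // placeholder path (matches DEFAULT_MODEL_PATH)
    params.n_ctx = 4096;                               // context size now defaults to 4096 instead of 0

    // the new DRY / XTC knobs live in common_sampler_params (values here are arbitrary examples)
    params.sparams.dry_multiplier  = 0.8f;             // 0.0f keeps DRY disabled
    params.sparams.xtc_probability = 0.5f;
    params.sparams.xtc_threshold   = 0.1f;

    common_init_result init = common_init_from_params(params); // was: llama_init_from_gpt_params
    if (init.model == nullptr || init.context == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // the tokenization helpers were renamed as well
    std::vector<llama_token> tokens = common_tokenize(init.context, "Hello", /*add_special=*/true);
    for (llama_token t : tokens) {
        printf("%d -> '%s'\n", t, common_token_to_piece(init.context, t).c_str());
    }

    // freeing the context/model is omitted here for brevity
    return 0;
}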