npm - cui-llama.rn - Versions diffs - 1.4.6 → 1.6.0 - Mend

cui-llama.rn 1.4.6 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (366) hide show

package/cpp/common.h CHANGED Viewed

@@ -132,10 +132,6 @@ struct common_grammar_trigger {
     common_grammar_trigger_type type;
     std::string value;
     llama_token token = LLAMA_TOKEN_NULL;
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };
 // sampling parameters
@@ -195,6 +191,13 @@ struct common_params_sampling {
     std::string print() const;
 };
+struct common_params_model { // where a model comes from (all fields default to ""); embedded as `model` in common_params, common_params_speculative and common_params_vocoder, and as `mmproj` in common_params
+    std::string path    = ""; // model local path                                           // NOLINT
+    std::string url     = ""; // model url to download                                      // NOLINT
+    std::string hf_repo = ""; // HF repo                                                    // NOLINT
+    std::string hf_file = ""; // HF file                                                    // NOLINT
+};
 struct common_params_speculative {
     std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
@@ -208,19 +211,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    std::string hf_repo = ""; // HF repo                                                     // NOLINT
-    std::string hf_file = ""; // HF file                                                     // NOLINT
-    std::string model = "";     // draft model for speculative decoding                      // NOLINT
-    std::string model_url = ""; // model url to download                                     // NOLINT
+    struct common_params_model model;
 };
 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo                                                     // NOLINT
-    std::string hf_file = ""; // HF file                                                     // NOLINT
-    std::string model     = ""; // model path                                                // NOLINT
-    std::string model_url = ""; // model url to download                                     // NOLINT
+    struct common_params_model model;
     std::string speaker_file = ""; // speaker file path                                      // NOLINT
@@ -282,12 +277,10 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder     vocoder;
-    std::string model                = ""; // model path                                                    // NOLINT
+    struct common_params_model model;
     std::string model_alias          = ""; // model alias                                                   // NOLINT
-    std::string model_url            = ""; // model url to download                                         // NOLINT
     std::string hf_token             = ""; // HF token                                                      // NOLINT
-    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
-    std::string hf_file              = ""; // HF file                                                       // NOLINT
     std::string prompt               = "";                                                                  // NOLINT
     std::string system_prompt        = "";                                                                  // NOLINT
     std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
@@ -301,6 +294,7 @@ struct common_params {
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -362,7 +356,7 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
     // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
+    struct common_params_model mmproj;
     std::vector<std::string> image; // path to image file(s)
     // embedding
@@ -561,23 +555,6 @@ struct llama_model_params     common_model_params_to_llama  (      common_params
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

package/cpp/cpu-common.h ADDED Viewed

@@ -0,0 +1,72 @@
+#pragma once
+#include "ggml.h"
+#include "ggml-cpu-traits.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-impl.h"
+#ifdef __cplusplus
+#include <utility>
+// Thin function wrappers around the LM_GGML_* fp16/bf16 <-> fp32 conversions, so they can be used as function pointers and in template calls (see type_conversion_table).
+// note: these won't be required after the 'traits' lookup table is used.
+static inline lm_ggml_fp16_t f32_to_f16(float x) { // fp32 -> fp16
+    return LM_GGML_FP32_TO_FP16(x);
+}
+static inline float f16_to_f32(lm_ggml_fp16_t x) { // fp16 -> fp32
+    return LM_GGML_FP16_TO_FP32(x);
+}
+static inline lm_ggml_bf16_t f32_to_bf16(float x) { // fp32 -> bf16
+    return LM_GGML_FP32_TO_BF16(x);
+}
+static inline float bf16_to_f32(lm_ggml_bf16_t x) { // bf16 -> fp32
+    return LM_GGML_BF16_TO_FP32(x);
+}
+static inline float f32_to_f32(float x) { // identity; gives float the same to/from-f32 interface as the 16-bit types
+    return x;
+}
+// Maps an element type T to its to_f32 / from_f32 conversion function pointers. TODO - merge this into the traits table, after using row-based conversions
+template <class T>
+struct type_conversion_table; // primary template is only declared here; the element types handled in this header are the specializations below
+template <>
+struct type_conversion_table<lm_ggml_fp16_t> { // fp16 element type
+    static constexpr float (*to_f32)(lm_ggml_fp16_t) = f16_to_f32;
+    static constexpr lm_ggml_fp16_t (*from_f32)(float) = f32_to_f16;
+};
+template <>
+struct type_conversion_table<float> { // fp32 element type: both directions are the identity f32_to_f32
+    static constexpr float (*to_f32)(float) = f32_to_f32;
+    static constexpr float (*from_f32)(float) = f32_to_f32;
+};
+template <>
+struct type_conversion_table<lm_ggml_bf16_t> { // bf16 element type
+    static constexpr float (*to_f32)(lm_ggml_bf16_t) = bf16_to_f32;
+    static constexpr lm_ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
+};
+static std::pair<int64_t, int64_t> get_thread_range(const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0) { // returns {ir0, ir1}: the slice of src0's rows for params->ith (presumably this thread's index) out of params->nth threads
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+    const int64_t nr  = lm_ggml_nrows(src0);
+    // rows per thread: ceil(nr / nth), so all rows are covered even when nr is not a multiple of nth
+    const int64_t dr = (nr + nth - 1)/nth;
+    // row range for this thread: starts at ir0, with ir1 as the exclusive end clamped to nr
+    const int64_t ir0 = dr*ith; // not clamped: can exceed nr for the last threads (e.g. nr=5, nth=4, ith=3 -> ir0=6, ir1=5), so callers should treat ir0 >= ir1 as an empty range
+    const int64_t ir1 = MIN(ir0 + dr, nr); // MIN is not defined in this header; presumably provided by one of the included ggml headers - TODO confirm
+    return {ir0, ir1};
+}
+#endif

package/cpp/ggml-common.h CHANGED Viewed

@@ -158,6 +158,12 @@ typedef sycl::half2 lm_ggml_half2;
 #endif // LM_GGML_COMMON_DECL_CUDA || LM_GGML_COMMON_DECL_HIP
+#ifdef _MSC_VER // LM_GGML_EXTENSION is placed in front of the anonymous unions in the block typedefs below
+#define LM_GGML_EXTENSION // expands to nothing on MSVC (presumably because MSVC does not accept __extension__ - confirm)
+#else // _MSC_VER
+#define LM_GGML_EXTENSION __extension__ // GNU keyword; per GCC docs it suppresses pedantic warnings for the extension it precedes
+#endif // _MSC_VER
 #define QK4_0 32
 typedef struct {
     lm_ggml_half d;           // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(lm_ggml_half) + QK4_0 / 2, "wrong q4_
 #define QK4_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(lm_ggml_half) + sizeof(uint32_t) + QK
 #define QK5_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(lm_ggml_half) + QK8_0, "wrong q8_0 bl
 #define QK8_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(lm_ggml_half) + QK_K / 4, "wrong tq2
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d;    // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 +
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d;    // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d;    // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins