cui-llama.rn 1.4.6 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +52 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1779
  14. package/cpp/chat.h +9 -1
  15. package/cpp/common.cpp +20 -522
  16. package/cpp/common.h +13 -36
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-common.h +12 -6
  19. package/cpp/ggml-cpu-aarch64.cpp +1557 -80
  20. package/cpp/ggml-cpu-impl.h +2 -21
  21. package/cpp/ggml-cpu-quants.c +904 -405
  22. package/cpp/ggml-cpu.c +909 -13237
  23. package/cpp/ggml-impl.h +50 -23
  24. package/cpp/ggml-metal-impl.h +77 -3
  25. package/cpp/ggml-metal.m +794 -580
  26. package/cpp/ggml.c +92 -3
  27. package/cpp/ggml.h +29 -5
  28. package/cpp/gguf.cpp +1 -0
  29. package/cpp/llama-adapter.cpp +55 -20
  30. package/cpp/llama-adapter.h +11 -9
  31. package/cpp/llama-arch.cpp +217 -16
  32. package/cpp/llama-arch.h +25 -0
  33. package/cpp/llama-batch.h +2 -2
  34. package/cpp/llama-chat.cpp +54 -2
  35. package/cpp/llama-chat.h +3 -0
  36. package/cpp/llama-context.cpp +2294 -1238
  37. package/cpp/llama-context.h +214 -77
  38. package/cpp/llama-cparams.h +1 -0
  39. package/cpp/llama-graph.cpp +1695 -0
  40. package/cpp/llama-graph.h +592 -0
  41. package/cpp/llama-hparams.cpp +8 -0
  42. package/cpp/llama-hparams.h +17 -0
  43. package/cpp/llama-io.cpp +15 -0
  44. package/cpp/llama-io.h +35 -0
  45. package/cpp/llama-kv-cache.cpp +965 -303
  46. package/cpp/llama-kv-cache.h +145 -151
  47. package/cpp/llama-memory.cpp +1 -0
  48. package/cpp/llama-memory.h +21 -0
  49. package/cpp/llama-mmap.cpp +1 -1
  50. package/cpp/llama-model-loader.cpp +10 -5
  51. package/cpp/llama-model-loader.h +5 -3
  52. package/cpp/llama-model.cpp +9194 -201
  53. package/cpp/llama-model.h +40 -1
  54. package/cpp/llama-sampling.cpp +5 -0
  55. package/cpp/llama-vocab.cpp +36 -5
  56. package/cpp/llama.cpp +51 -9984
  57. package/cpp/llama.h +102 -22
  58. package/cpp/log.cpp +34 -0
  59. package/cpp/minja/chat-template.hpp +15 -7
  60. package/cpp/minja/minja.hpp +120 -94
  61. package/cpp/ops.cpp +8723 -0
  62. package/cpp/ops.h +128 -0
  63. package/cpp/rn-llama.cpp +44 -53
  64. package/cpp/rn-llama.h +2 -12
  65. package/cpp/sampling.cpp +3 -0
  66. package/cpp/sgemm.cpp +533 -88
  67. package/cpp/simd-mappings.h +888 -0
  68. package/cpp/speculative.cpp +4 -4
  69. package/cpp/unary-ops.cpp +186 -0
  70. package/cpp/unary-ops.h +28 -0
  71. package/cpp/vec.cpp +258 -0
  72. package/cpp/vec.h +802 -0
  73. package/ios/CMakeLists.txt +5 -2
  74. package/ios/RNLlama.mm +2 -2
  75. package/ios/RNLlamaContext.mm +40 -24
  76. package/package.json +1 -1
  77. package/src/NativeRNLlama.ts +6 -4
  78. package/src/index.ts +3 -1
  79. package/cpp/chat-template.hpp +0 -529
  80. package/cpp/minja.hpp +0 -2915
package/cpp/ggml.c CHANGED
@@ -942,6 +942,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
  "RMS_NORM",
  "RMS_NORM_BACK",
  "GROUP_NORM",
+ "L2_NORM",

  "MUL_MAT",
  "MUL_MAT_ID",
@@ -990,6 +991,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
  "ADD_REL_POS",
  "RWKV_WKV6",
  "GATED_LINEAR_ATTN",
+ "RWKV_WKV7",

  "UNARY",

@@ -1009,7 +1011,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
  "OPT_STEP_ADAMW",
  };

- static_assert(LM_GGML_OP_COUNT == 83, "LM_GGML_OP_COUNT != 83");
+ static_assert(LM_GGML_OP_COUNT == 85, "LM_GGML_OP_COUNT != 85");

  static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
  "none",
@@ -1039,6 +1041,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
  "rms_norm(x)",
  "rms_norm_back(x)",
  "group_norm(x)",
+ "l2_norm(x)",

  "X*Y",
  "X[i]*Y",
@@ -1087,6 +1090,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
  "add_rel_pos(x)",
  "rwkv_wkv6(k, v, r, tf, td, s)",
  "gated_linear_attn(k, v, q, gate, s)",
+ "rwkv_wkv7(r, w, k, v, a, b, s)",

  "unary(x)",

@@ -1106,7 +1110,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
  "adamw(x)",
  };

- static_assert(LM_GGML_OP_COUNT == 83, "LM_GGML_OP_COUNT != 83");
+ static_assert(LM_GGML_OP_COUNT == 85, "LM_GGML_OP_COUNT != 85");

  static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2");

@@ -1168,6 +1172,12 @@ int64_t lm_ggml_nrows(const struct lm_ggml_tensor * tensor) {
  }

  size_t lm_ggml_nbytes(const struct lm_ggml_tensor * tensor) {
+ for (int i = 0; i < LM_GGML_MAX_DIMS; ++i) {
+ if (tensor->ne[i] <= 0) {
+ return 0;
+ }
+ }
+
  size_t nbytes;
  const size_t blck_size = lm_ggml_blck_size(tensor->type);
  if (blck_size == 1) {
@@ -2699,6 +2709,37 @@ struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
  return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
  }

+ // lm_ggml_l2_norm
+
+ static struct lm_ggml_tensor * lm_ggml_l2_norm_impl(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps,
+ bool inplace) {
+ struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
+
+ lm_ggml_set_op_params_f32(result, 0, eps);
+
+ result->op = LM_GGML_OP_L2_NORM;
+ result->src[0] = a;
+
+ return result;
+ }
+
+ struct lm_ggml_tensor * lm_ggml_l2_norm(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps) {
+ return lm_ggml_l2_norm_impl(ctx, a, eps, false);
+ }
+
+ struct lm_ggml_tensor * lm_ggml_l2_norm_inplace(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps) {
+ return lm_ggml_l2_norm_impl(ctx, a, eps, true);
+ }
+
  // lm_ggml_mul_mat

  static inline bool lm_ggml_can_mul_mat(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1) {
@@ -4347,7 +4388,7 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext(
  }

  // permute(0, 2, 1, 3)
- int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
+ int64_t ne[4] = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] };
  struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);

  float params[] = { scale, max_bias, logit_softcap };
@@ -4733,6 +4774,54 @@ struct lm_ggml_tensor * lm_ggml_gated_linear_attn(
  return result;
  }

+ // lm_ggml_rwkv_wkv7
+
+ struct lm_ggml_tensor * lm_ggml_rwkv_wkv7(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * r,
+ struct lm_ggml_tensor * w,
+ struct lm_ggml_tensor * k,
+ struct lm_ggml_tensor * v,
+ struct lm_ggml_tensor * a,
+ struct lm_ggml_tensor * b,
+ struct lm_ggml_tensor * state) {
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(r));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(w));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(k));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(v));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(a));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(b));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(state));
+
+ const int64_t S = k->ne[0];
+ const int64_t H = k->ne[1];
+ const int64_t n_tokens = k->ne[2];
+ const int64_t n_seqs = state->ne[1];
+ {
+ LM_GGML_ASSERT(w->ne[0] == S && w->ne[1] == H && w->ne[2] == n_tokens);
+ LM_GGML_ASSERT(k->ne[0] == S && k->ne[1] == H && k->ne[2] == n_tokens);
+ LM_GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+ LM_GGML_ASSERT(a->ne[0] == S && a->ne[1] == H && a->ne[2] == n_tokens);
+ LM_GGML_ASSERT(b->ne[0] == S && b->ne[1] == H && b->ne[2] == n_tokens);
+ LM_GGML_ASSERT(lm_ggml_nelements(state) == S * S * H * n_seqs);
+ }
+
+ // concat output and new_state
+ const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
+
+ result->op = LM_GGML_OP_RWKV_WKV7;
+ result->src[0] = r;
+ result->src[1] = w;
+ result->src[2] = k;
+ result->src[3] = v;
+ result->src[4] = a;
+ result->src[5] = b;
+ result->src[6] = state;
+
+ return result;
+ }
+
  // lm_ggml_unary

  static struct lm_ggml_tensor * lm_ggml_unary_impl(
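The assertions above pin down the shapes the new op expects: r/w/k/v/a/b are [S, H, n_tokens] and the recurrent state holds S*S*H elements per sequence, with the per-token output and the updated state packed into one [S*H, n_tokens + S*n_seqs] result. A minimal usage sketch follows; it is illustrative only and not part of this package — the context setup, the example sizes, and the tensor names are assumptions, while the call itself follows the signature added above.

// Illustrative sketch (this fork's prefixed C API): drive the new wkv7 op with the asserted shapes.
struct lm_ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
struct lm_ggml_context * ctx = lm_ggml_init(ip);

const int64_t S = 64, H = 8, n_tokens = 4, n_seqs = 1;   // example sizes, not from the diff
struct lm_ggml_tensor * r = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * w = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * k = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * v = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * a = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * b = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * state = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, S * S * H, n_seqs);

// rows 0..n_tokens-1 of the result hold the per-token output; the remaining S*n_seqs rows hold the new state
struct lm_ggml_tensor * out = lm_ggml_rwkv_wkv7(ctx, r, w, k, v, a, b, state);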
package/cpp/ggml.h CHANGED
@@ -455,6 +455,7 @@ extern "C" {
  LM_GGML_OP_RMS_NORM,
  LM_GGML_OP_RMS_NORM_BACK,
  LM_GGML_OP_GROUP_NORM,
+ LM_GGML_OP_L2_NORM,

  LM_GGML_OP_MUL_MAT,
  LM_GGML_OP_MUL_MAT_ID,
@@ -503,6 +504,7 @@ extern "C" {
  LM_GGML_OP_ADD_REL_POS,
  LM_GGML_OP_RWKV_WKV6,
  LM_GGML_OP_GATED_LINEAR_ATTN,
+ LM_GGML_OP_RWKV_WKV7,

  LM_GGML_OP_UNARY,

@@ -1096,6 +1098,18 @@ extern "C" {
  int n_groups,
  float eps);

+ // l2 normalize along rows
+ // used in rwkv v7
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_l2_norm(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps);
+
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_l2_norm_inplace(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps);
+
  // a - x
  // b - dy
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_rms_norm_back(
@@ -1778,11 +1792,11 @@ extern "C" {

  #define LM_GGML_KQ_MASK_PAD 64

- // q: [n_embd, n_batch, n_head, 1]
- // k: [n_embd, n_kv, n_head_kv, 1]
- // v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
- // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = LM_GGML_PAD(n_batch, LM_GGML_KQ_MASK_PAD) !!
- // res: [n_embd, n_head, n_batch, 1] !! permuted !!
+ // q: [n_embd_k, n_batch, n_head, 1]
+ // k: [n_embd_k, n_kv, n_head_kv, 1]
+ // v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
+ // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = LM_GGML_PAD(n_batch, LM_GGML_KQ_MASK_PAD) !!
+ // res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_flash_attn_ext(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * q,
@@ -1891,6 +1905,16 @@ extern "C" {
  struct lm_ggml_tensor * state,
  float scale);

+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_rwkv_wkv7(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * r,
+ struct lm_ggml_tensor * w,
+ struct lm_ggml_tensor * k,
+ struct lm_ggml_tensor * v,
+ struct lm_ggml_tensor * a,
+ struct lm_ggml_tensor * b,
+ struct lm_ggml_tensor * state);
+
  // custom operators

  typedef void (*lm_ggml_unary_op_f32_t) (const int, float *, const float *);
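The l2_norm declarations above normalize along rows, as the header comment notes (used by RWKV v7). A minimal, illustrative call — not part of the package; ctx, the tensor sizes, and the eps value are assumptions:

// Illustrative sketch: L2-normalize each 64-element row of an 8-row activation tensor.
struct lm_ggml_tensor * x  = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, 64, 8);
struct lm_ggml_tensor * xn = lm_ggml_l2_norm(ctx, x, 1e-12f);          // result in a new tensor
struct lm_ggml_tensor * xi = lm_ggml_l2_norm_inplace(ctx, x, 1e-12f);  // writes back into x's data

As with the other norm ops, eps is presumably only there to guard against division by zero on (near-)zero rows.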
package/cpp/gguf.cpp CHANGED
@@ -932,6 +932,7 @@ static void lm_gguf_check_reserved_keys(const std::string & key, const T val) {
  if constexpr (std::is_same<T, uint32_t>::value) {
  LM_GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && LM_GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
  } else {
+ LM_GGML_UNUSED(val);
  LM_GGML_ABORT(LM_GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
  }
  }
package/cpp/llama-adapter.cpp CHANGED
@@ -4,14 +4,13 @@
  #include "llama-mmap.h"
  #include "llama-model.h"

- #include <algorithm>
  #include <map>
  #include <cassert>
  #include <stdexcept>

  // vec

- struct lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
  if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
  return nullptr;
  }
@@ -19,7 +18,7 @@ struct lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
  return tensors[il];
  }

- struct lm_ggml_tensor * llama_adapter_cvec::apply_to(struct lm_ggml_context * ctx, struct lm_ggml_tensor * cur, int il) const {
+ lm_ggml_tensor * llama_adapter_cvec::apply_to(lm_ggml_context * ctx, lm_ggml_tensor * cur, int il) const {
  lm_ggml_tensor * layer_dir = tensor_for(il);
  if (layer_dir != nullptr) {
  cur = lm_ggml_add(ctx, cur, layer_dir);
@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
  auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
- struct lm_ggml_init_params params = {
+ lm_ggml_init_params params = {
  /*.mem_size =*/ hparams.n_layer*lm_ggml_tensor_overhead(),
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ true,
@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
  return true;
  }

- int32_t llama_adapter_cvec::apply(
+ bool llama_adapter_cvec::apply(
  const llama_model & model,
  const float * data,
  size_t len,
@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
  // disable the current control vector (but leave allocated for later)
  layer_start = -1;
  layer_end = -1;
- return 0;
+ return true;
  }

  if (n_embd != (int) hparams.n_embd) {
  LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
- return 1;
+ return false;
  }

  if (tensors.empty()) {
  if (!init(model)) {
- return 1;
+ return false;
  }
  }

@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
  }
  }

- return 0;
+ return true;
  }

  // lora

- llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct lm_ggml_tensor * w) {
+ llama_adapter_lora_weight * llama_adapter_lora::get_weight(lm_ggml_tensor * w) {
  const std::string name(w->name);

  const auto pos = ab_map.find(name);
@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct lm_ggml_tensor
  return nullptr;
  }

- static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
  LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

  lm_ggml_context * ctx_init;
- struct lm_gguf_init_params meta_lm_gguf_params = {
+ lm_gguf_init_params meta_lm_gguf_params = {
  /* .no_alloc = */ true,
  /* .ctx = */ &ctx_init,
  };
@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
  // add a new context
- struct lm_ggml_init_params params = {
+ lm_ggml_init_params params = {
  /*.mem_size =*/ n_tensors*lm_ggml_tensor_overhead(),
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ true,
@@ -248,6 +247,26 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  }
  }

+ // get extra buffer types of the CPU
+ // TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
+ // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+ std::vector<lm_ggml_backend_buffer_type_t> buft_extra;
+ {
+ auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
+
+ auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
+ lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
+
+ if (lm_ggml_backend_dev_get_extra_bufts_fn) {
+ lm_ggml_backend_buffer_type_t * extra_bufts = lm_ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_extra.emplace_back(*extra_bufts);
+ ++extra_bufts;
+ }
+ }
+ }
+
  // add tensors
  for (auto & it : ab_map) {
  const std::string & name = it.first;
@@ -264,7 +283,23 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
  }

- struct lm_ggml_context * dev_ctx = ctx_for_buft(lm_ggml_backend_buffer_get_type(model_tensor->buffer));
+ auto * buft = lm_ggml_backend_buffer_get_type(model_tensor->buffer);
+
+ // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+ for (auto & ex : buft_extra) {
+ if (ex == buft) {
+ LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, lm_ggml_backend_buft_name(buft));
+
+ auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
+
+ break;
+ }
+ }
+
+ LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, lm_ggml_backend_buft_name(buft));
+
+ lm_ggml_context * dev_ctx = ctx_for_buft(buft);
  // validate tensor shape
  if (is_token_embd) {
  // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@@ -281,8 +316,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  }

  // save tensor to adapter
- struct lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
- struct lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
+ lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
+ lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
  lm_ggml_set_name(tensor_a, w.a->name);
  lm_ggml_set_name(tensor_b, w.b->name);
  adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@@ -308,7 +343,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  {
  llama_file lm_gguf_file(path_lora, "rb");
  std::vector<uint8_t> read_buf;
- auto set_tensor = [&](struct lm_ggml_tensor * orig, struct lm_ggml_tensor * dev) {
+ auto set_tensor = [&](lm_ggml_tensor * orig, lm_ggml_tensor * dev) {
  size_t offs = lm_gguf_get_data_offset(ctx_gguf.get()) + lm_gguf_get_tensor_offset(ctx_gguf.get(), lm_gguf_find_tensor(ctx_gguf.get(), orig->name));
  size_t size = lm_ggml_nbytes(orig);
  read_buf.resize(size);
@@ -327,8 +362,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
  }

- struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
- struct llama_adapter_lora * adapter = new llama_adapter_lora();
+ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+ llama_adapter_lora * adapter = new llama_adapter_lora();

  try {
  llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -342,6 +377,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
  return nullptr;
  }

- void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+ void llama_adapter_lora_free(llama_adapter_lora * adapter) {
  delete adapter;
  }
package/cpp/llama-adapter.h CHANGED
@@ -15,11 +15,11 @@
  //

  struct llama_adapter_cvec {
- struct lm_ggml_tensor * tensor_for(int il) const;
+ lm_ggml_tensor * tensor_for(int il) const;

- struct lm_ggml_tensor * apply_to(struct lm_ggml_context * ctx, struct lm_ggml_tensor * cur, int il) const;
+ lm_ggml_tensor * apply_to(lm_ggml_context * ctx, lm_ggml_tensor * cur, int il) const;

- int32_t apply(
+ bool apply(
  const llama_model & model,
  const float * data,
  size_t len,
@@ -36,7 +36,7 @@ private:
  std::vector<lm_ggml_context_ptr> ctxs;
  std::vector<lm_ggml_backend_buffer_ptr> bufs;

- std::vector<struct lm_ggml_tensor *> tensors; // per layer
+ std::vector<lm_ggml_tensor *> tensors; // per layer
  };

  //
@@ -44,8 +44,8 @@ private:
  //

  struct llama_adapter_lora_weight {
- struct lm_ggml_tensor * a = nullptr;
- struct lm_ggml_tensor * b = nullptr;
+ lm_ggml_tensor * a = nullptr;
+ lm_ggml_tensor * b = nullptr;

  // get actual scale based on rank and alpha
  float get_scale(float alpha, float adapter_scale) const {
@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
  }

  llama_adapter_lora_weight() = default;
- llama_adapter_lora_weight(struct lm_ggml_tensor * a, struct lm_ggml_tensor * b) : a(a), b(b) {}
+ llama_adapter_lora_weight(lm_ggml_tensor * a, lm_ggml_tensor * b) : a(a), b(b) {}
  };

  struct llama_adapter_lora {
  // map tensor name to lora_a_b
- std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+ std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

  std::vector<lm_ggml_context_ptr> ctxs;
  std::vector<lm_ggml_backend_buffer_ptr> bufs;
@@ -70,5 +70,7 @@ struct llama_adapter_lora {
  llama_adapter_lora() = default;
  ~llama_adapter_lora() = default;

- llama_adapter_lora_weight * get_weight(struct lm_ggml_tensor * w);
+ llama_adapter_lora_weight * get_weight(lm_ggml_tensor * w);
  };
+
+ using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
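Besides switching llama_adapter_cvec::apply to a bool result, the header adds the llama_adapter_loras alias. A short, hypothetical caller sketch — not from the package; `model` is assumed to be an already-loaded llama_model *, the adapter path is a placeholder, and the mapped float is assumed to be the per-adapter scale:

// Hypothetical sketch: load one LoRA with the public API above and register it with a scale.
llama_adapter_lora * adapter = llama_adapter_lora_init(model, "/path/to/lora.gguf");
if (adapter != nullptr) {
    llama_adapter_loras loras;      // std::unordered_map<llama_adapter_lora *, float>
    loras[adapter] = 0.5f;          // assumed: the value is the blending scale
    // ... hand the map to the context / graph-building code ...
    llama_adapter_lora_free(adapter);
}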