cui-llama.rn 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/android/src/main/CMakeLists.txt +5 -7
  2. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
  3. package/android/src/main/jni.cpp +9 -9
  4. package/cpp/common.cpp +28 -44
  5. package/cpp/common.h +35 -14
  6. package/cpp/ggml-alloc.c +0 -1
  7. package/cpp/ggml-backend-impl.h +38 -20
  8. package/cpp/ggml-backend-reg.cpp +246 -92
  9. package/cpp/ggml-backend.h +1 -0
  10. package/cpp/ggml-common.h +42 -48
  11. package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +642 -223
  12. package/cpp/ggml-cpu-aarch64.h +2 -26
  13. package/cpp/ggml-cpu-traits.cpp +36 -0
  14. package/cpp/ggml-cpu-traits.h +38 -0
  15. package/cpp/ggml-cpu.c +14122 -13971
  16. package/cpp/ggml-cpu.cpp +627 -715
  17. package/cpp/ggml-cpu.h +0 -17
  18. package/cpp/ggml-impl.h +22 -6
  19. package/cpp/ggml-metal.m +482 -24
  20. package/cpp/ggml-quants.c +0 -9
  21. package/cpp/ggml-threading.h +4 -2
  22. package/cpp/ggml.c +284 -178
  23. package/cpp/ggml.h +73 -25
  24. package/cpp/llama-grammar.cpp +15 -15
  25. package/cpp/llama-grammar.h +2 -5
  26. package/cpp/llama-sampling.cpp +35 -90
  27. package/cpp/llama-vocab.cpp +7 -2
  28. package/cpp/llama-vocab.h +1 -1
  29. package/cpp/llama.cpp +1782 -586
  30. package/cpp/llama.h +20 -19
  31. package/cpp/sampling.cpp +11 -16
  32. package/cpp/sgemm.cpp +265 -258
  33. package/cpp/sgemm.h +2 -2
  34. package/cpp/speculative.cpp +4 -0
  35. package/cpp/unicode.cpp +51 -51
  36. package/cpp/unicode.h +9 -10
  37. package/lib/commonjs/index.js +38 -1
  38. package/lib/commonjs/index.js.map +1 -1
  39. package/lib/module/index.js +36 -0
  40. package/lib/module/index.js.map +1 -1
  41. package/lib/typescript/NativeRNLlama.d.ts +2 -3
  42. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  43. package/lib/typescript/index.d.ts +36 -2
  44. package/lib/typescript/index.d.ts.map +1 -1
  45. package/package.json +1 -1
  46. package/src/NativeRNLlama.ts +3 -3
  47. package/src/index.ts +46 -2
  48. package/cpp/amx/amx.cpp +0 -196
  49. package/cpp/amx/amx.h +0 -20
  50. package/cpp/amx/common.h +0 -101
  51. package/cpp/amx/mmq.cpp +0 -2524
  52. package/cpp/amx/mmq.h +0 -16
  53. package/cpp/ggml-aarch64.c +0 -129
  54. package/cpp/ggml-aarch64.h +0 -19
package/cpp/ggml.c CHANGED
@@ -8,7 +8,10 @@
8
8
 
9
9
  // FIXME: required here for quantization functions
10
10
  #include "ggml-quants.h"
11
- #include "ggml-aarch64.h"
11
+
12
+ #ifdef LM_GGML_USE_CPU_HBM
13
+ #include <hbwmalloc.h>
14
+ #endif
12
15
 
13
16
  #if defined(_MSC_VER) || defined(__MINGW32__)
14
17
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -801,32 +804,23 @@ static const struct lm_ggml_type_traits type_traits[LM_GGML_TYPE_COUNT] = {
801
804
  .to_float = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
802
805
  .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
803
806
  },
804
- [LM_GGML_TYPE_Q4_0_4_4] = {
805
- .type_name = "q4_0_4x4",
806
- .blck_size = QK4_0,
807
- .blck_size_interleave = 4,
808
- .type_size = sizeof(block_q4_0),
809
- .is_quantized = true,
810
- .to_float = NULL,
811
- .from_float_ref = NULL,
807
+ [31] = { // LM_GGML_TYPE_Q4_0_4_4
808
+ .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
809
+ .blck_size = 0,
810
+ .type_size = 0,
811
+ .is_quantized = false,
812
812
  },
813
- [LM_GGML_TYPE_Q4_0_4_8] = {
814
- .type_name = "q4_0_4x8",
815
- .blck_size = QK4_0,
816
- .blck_size_interleave = 8,
817
- .type_size = sizeof(block_q4_0),
818
- .is_quantized = true,
819
- .to_float = NULL,
820
- .from_float_ref = NULL,
813
+ [32] = { // LM_GGML_TYPE_Q4_0_4_8
814
+ .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
815
+ .blck_size = 0,
816
+ .type_size = 0,
817
+ .is_quantized = false,
821
818
  },
822
- [LM_GGML_TYPE_Q4_0_8_8] = {
823
- .type_name = "q4_0_8x8",
824
- .blck_size = QK4_0,
825
- .blck_size_interleave = 8,
826
- .type_size = sizeof(block_q4_0),
827
- .is_quantized = true,
828
- .to_float = NULL,
829
- .from_float_ref = NULL,
819
+ [33] = { // LM_GGML_TYPE_Q4_0_8_8
820
+ .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
821
+ .blck_size = 0,
822
+ .type_size = 0,
823
+ .is_quantized = false,
830
824
  },
831
825
  [LM_GGML_TYPE_TQ1_0] = {
832
826
  .type_name = "tq1_0",
@@ -844,14 +838,23 @@ static const struct lm_ggml_type_traits type_traits[LM_GGML_TYPE_COUNT] = {
844
838
  .to_float = (lm_ggml_to_float_t) dequantize_row_tq2_0,
845
839
  .from_float_ref = (lm_ggml_from_float_t) quantize_row_tq2_0_ref,
846
840
  },
847
- [LM_GGML_TYPE_IQ4_NL_4_4] = {
848
- .type_name = "iq4_nl_4x4",
849
- .blck_size = QK4_NL,
850
- .blck_size_interleave = 4,
851
- .type_size = sizeof(block_iq4_nl),
852
- .is_quantized = true,
853
- .to_float = NULL,
854
- .from_float_ref = NULL,
841
+ [36] = { // LM_GGML_TYPE_IQ4_NL_4_4
842
+ .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
843
+ .blck_size = 0,
844
+ .type_size = 0,
845
+ .is_quantized = false,
846
+ },
847
+ [37] = { // LM_GGML_TYPE_IQ4_NL_4_8
848
+ .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
849
+ .blck_size = 0,
850
+ .type_size = 0,
851
+ .is_quantized = false,
852
+ },
853
+ [38] = { // LM_GGML_TYPE_IQ4_NL_8_8
854
+ .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
855
+ .blck_size = 0,
856
+ .type_size = 0,
857
+ .is_quantized = false,
855
858
  },
856
859
  };
857
860
 
@@ -963,6 +966,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
963
966
  "POOL_2D_BACK",
964
967
  "UPSCALE",
965
968
  "PAD",
969
+ "PAD_REFLECT_1D",
966
970
  "ARANGE",
967
971
  "TIMESTEP_EMBEDDING",
968
972
  "ARGSORT",
@@ -996,7 +1000,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
996
1000
  "OPT_STEP_ADAMW",
997
1001
  };
998
1002
 
999
- static_assert(LM_GGML_OP_COUNT == 81, "LM_GGML_OP_COUNT != 81");
1003
+ static_assert(LM_GGML_OP_COUNT == 82, "LM_GGML_OP_COUNT != 82");
1000
1004
 
1001
1005
  static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
1002
1006
  "none",
@@ -1058,6 +1062,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
1058
1062
  "pool_2d_back(x)",
1059
1063
  "upscale(x)",
1060
1064
  "pad(x)",
1065
+ "pad_reflect_1d(x)",
1061
1066
  "arange(start, stop, step)",
1062
1067
  "timestep_embedding(timesteps, dim, max_period)",
1063
1068
  "argsort(x)",
@@ -1091,7 +1096,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
1091
1096
  "adamw(x)",
1092
1097
  };
1093
1098
 
1094
- static_assert(LM_GGML_OP_COUNT == 81, "LM_GGML_OP_COUNT != 81");
1099
+ static_assert(LM_GGML_OP_COUNT == 82, "LM_GGML_OP_COUNT != 82");
1095
1100
 
1096
1101
  static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2");
1097
1102
 
@@ -1281,9 +1286,6 @@ enum lm_ggml_type lm_ggml_ftype_to_lm_ggml_type(enum lm_ggml_ftype ftype) {
1281
1286
  case LM_GGML_FTYPE_MOSTLY_IQ4_XS: wtype = LM_GGML_TYPE_IQ4_XS; break;
1282
1287
  case LM_GGML_FTYPE_MOSTLY_IQ3_S: wtype = LM_GGML_TYPE_IQ3_S; break;
1283
1288
  case LM_GGML_FTYPE_MOSTLY_IQ2_S: wtype = LM_GGML_TYPE_IQ2_S; break;
1284
- case LM_GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = LM_GGML_TYPE_Q4_0_4_4; break;
1285
- case LM_GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = LM_GGML_TYPE_Q4_0_4_8; break;
1286
- case LM_GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = LM_GGML_TYPE_Q4_0_8_8; break;
1287
1289
  case LM_GGML_FTYPE_UNKNOWN: wtype = LM_GGML_TYPE_COUNT; break;
1288
1290
  case LM_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = LM_GGML_TYPE_COUNT; break;
1289
1291
  }
@@ -3528,15 +3530,18 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl(
3528
3530
  LM_GGML_ASSERT(c->ne[0] >= n_dims / 2);
3529
3531
  }
3530
3532
 
3533
+ int sections[4] = {0, 0, 0, 0};
3534
+
3531
3535
  struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
3532
3536
 
3533
- int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3537
+ int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3534
3538
  memcpy(params + 5, &freq_base, sizeof(float));
3535
3539
  memcpy(params + 6, &freq_scale, sizeof(float));
3536
3540
  memcpy(params + 7, &ext_factor, sizeof(float));
3537
3541
  memcpy(params + 8, &attn_factor, sizeof(float));
3538
3542
  memcpy(params + 9, &beta_fast, sizeof(float));
3539
3543
  memcpy(params + 10, &beta_slow, sizeof(float));
3544
+ memcpy(params + 11, &sections, sizeof(int)*4);
3540
3545
  lm_ggml_set_op_params(result, params, sizeof(params));
3541
3546
 
3542
3547
  result->op = LM_GGML_OP_ROPE;
@@ -3558,6 +3563,53 @@ struct lm_ggml_tensor * lm_ggml_rope(
3558
3563
  );
3559
3564
  }
3560
3565
 
3566
+ struct lm_ggml_tensor * lm_ggml_rope_multi(
3567
+ struct lm_ggml_context * ctx,
3568
+ struct lm_ggml_tensor * a,
3569
+ struct lm_ggml_tensor * b,
3570
+ struct lm_ggml_tensor * c,
3571
+ int n_dims,
3572
+ int sections[4],
3573
+ int mode,
3574
+ int n_ctx_orig,
3575
+ float freq_base,
3576
+ float freq_scale,
3577
+ float ext_factor,
3578
+ float attn_factor,
3579
+ float beta_fast,
3580
+ float beta_slow) {
3581
+ // Multimodal Rotary Position Embedding
3582
+ LM_GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
3583
+
3584
+ LM_GGML_ASSERT(lm_ggml_is_vector(b));
3585
+ LM_GGML_ASSERT(b->type == LM_GGML_TYPE_I32);
3586
+ LM_GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
3587
+
3588
+ if (c) {
3589
+ LM_GGML_ASSERT(c->type == LM_GGML_TYPE_F32);
3590
+ LM_GGML_ASSERT(c->ne[0] >= n_dims / 2);
3591
+ }
3592
+
3593
+ struct lm_ggml_tensor * result = lm_ggml_dup_tensor(ctx, a);
3594
+
3595
+ int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3596
+ memcpy(params + 5, &freq_base, sizeof(float));
3597
+ memcpy(params + 6, &freq_scale, sizeof(float));
3598
+ memcpy(params + 7, &ext_factor, sizeof(float));
3599
+ memcpy(params + 8, &attn_factor, sizeof(float));
3600
+ memcpy(params + 9, &beta_fast, sizeof(float));
3601
+ memcpy(params + 10, &beta_slow, sizeof(float));
3602
+ memcpy(&params[11], sections, sizeof(int)*4);
3603
+ lm_ggml_set_op_params(result, params, sizeof(params));
3604
+
3605
+ result->op = LM_GGML_OP_ROPE;
3606
+ result->src[0] = a;
3607
+ result->src[1] = b;
3608
+ result->src[2] = c;
3609
+
3610
+ return result;
3611
+ }
3612
+
3561
3613
  struct lm_ggml_tensor * lm_ggml_rope_inplace(
3562
3614
  struct lm_ggml_context * ctx,
3563
3615
  struct lm_ggml_tensor * a,
@@ -3721,13 +3773,84 @@ struct lm_ggml_tensor * lm_ggml_clamp(
3721
3773
  return result;
3722
3774
  }
3723
3775
 
3724
- // lm_ggml_conv_1d
3725
-
3726
3776
  static int64_t lm_ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3727
3777
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
3728
3778
  }
3729
3779
 
3730
- LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d(
3780
+ // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
3781
+ // a: [OC,IC, KH, KW]
3782
+ // b: [N, IC, IH, IW]
3783
+ // result: [N, OH, OW, IC*KH*KW]
3784
+ struct lm_ggml_tensor * lm_ggml_im2col(
3785
+ struct lm_ggml_context * ctx,
3786
+ struct lm_ggml_tensor * a,
3787
+ struct lm_ggml_tensor * b,
3788
+ int s0,
3789
+ int s1,
3790
+ int p0,
3791
+ int p1,
3792
+ int d0,
3793
+ int d1,
3794
+ bool is_2D,
3795
+ enum lm_ggml_type dst_type) {
3796
+ if (is_2D) {
3797
+ LM_GGML_ASSERT(a->ne[2] == b->ne[2]);
3798
+ } else {
3799
+ //LM_GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
3800
+ LM_GGML_ASSERT(b->ne[1] == a->ne[1]);
3801
+ LM_GGML_ASSERT(b->ne[3] == 1);
3802
+ }
3803
+
3804
+ const int64_t OH = is_2D ? lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
3805
+ const int64_t OW = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
3806
+
3807
+ LM_GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
3808
+ LM_GGML_ASSERT((OW > 0) && "b too small compared to a");
3809
+
3810
+ const int64_t ne[4] = {
3811
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
3812
+ OW,
3813
+ is_2D ? OH : b->ne[2],
3814
+ is_2D ? b->ne[3] : 1,
3815
+ };
3816
+
3817
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, dst_type, 4, ne);
3818
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3819
+ lm_ggml_set_op_params(result, params, sizeof(params));
3820
+
3821
+ result->op = LM_GGML_OP_IM2COL;
3822
+ result->src[0] = a;
3823
+ result->src[1] = b;
3824
+
3825
+ return result;
3826
+ }
3827
+
3828
+ struct lm_ggml_tensor * lm_ggml_im2col_back(
3829
+ struct lm_ggml_context * ctx,
3830
+ struct lm_ggml_tensor * a,
3831
+ struct lm_ggml_tensor * b,
3832
+ int64_t * ne,
3833
+ int s0,
3834
+ int s1,
3835
+ int p0,
3836
+ int p1,
3837
+ int d0,
3838
+ int d1,
3839
+ bool is_2D) {
3840
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
3841
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3842
+ lm_ggml_set_op_params(result, params, sizeof(params));
3843
+
3844
+ result->op = LM_GGML_OP_IM2COL_BACK;
3845
+ result->src[0] = a;
3846
+ result->src[1] = b;
3847
+
3848
+ return result;
3849
+ }
3850
+
3851
+ // lm_ggml_conv_1d
3852
+
3853
+ struct lm_ggml_tensor * lm_ggml_conv_1d(
3731
3854
  struct lm_ggml_context * ctx,
3732
3855
  struct lm_ggml_tensor * a,
3733
3856
  struct lm_ggml_tensor * b,
@@ -3757,137 +3880,75 @@ struct lm_ggml_tensor* lm_ggml_conv_1d_ph(
3757
3880
  return lm_ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
3758
3881
  }
3759
3882
 
3760
- // lm_ggml_conv_transpose_1d
3761
-
3762
- static int64_t lm_ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3763
- return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
3764
- }
3883
+ // lm_ggml_conv_1d_dw
3765
3884
 
3766
- LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d(
3885
+ struct lm_ggml_tensor * lm_ggml_conv_1d_dw(
3767
3886
  struct lm_ggml_context * ctx,
3768
3887
  struct lm_ggml_tensor * a,
3769
3888
  struct lm_ggml_tensor * b,
3770
3889
  int s0,
3771
3890
  int p0,
3772
3891
  int d0) {
3773
- LM_GGML_ASSERT(lm_ggml_is_matrix(b));
3774
- LM_GGML_ASSERT(a->ne[2] == b->ne[1]);
3775
- LM_GGML_ASSERT(a->ne[3] == 1);
3892
+ struct lm_ggml_tensor * new_a = lm_ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
3893
+ struct lm_ggml_tensor * new_b = lm_ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
3776
3894
 
3777
- LM_GGML_ASSERT(p0 == 0);
3778
- LM_GGML_ASSERT(d0 == 1);
3779
-
3780
- const int64_t ne[4] = {
3781
- lm_ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
3782
- a->ne[1], b->ne[2], 1,
3783
- };
3784
- struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
3895
+ struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, LM_GGML_TYPE_F16);
3785
3896
 
3786
- int32_t params[] = { s0, p0, d0 };
3787
- lm_ggml_set_op_params(result, params, sizeof(params));
3897
+ struct lm_ggml_tensor * result = lm_ggml_mul_mat(ctx, im2col, a);
3788
3898
 
3789
- result->op = LM_GGML_OP_CONV_TRANSPOSE_1D;
3790
- result->src[0] = a;
3791
- result->src[1] = b;
3899
+ result = lm_ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
3792
3900
 
3793
3901
  return result;
3794
3902
  }
3795
3903
 
3796
- // lm_ggml_conv_depthwise
3904
+ // lm_ggml_conv_1d_dw_ph
3797
3905
 
3798
- struct lm_ggml_tensor * lm_ggml_conv_depthwise_2d(
3906
+ struct lm_ggml_tensor * lm_ggml_conv_1d_dw_ph(
3799
3907
  struct lm_ggml_context * ctx,
3800
3908
  struct lm_ggml_tensor * a,
3801
3909
  struct lm_ggml_tensor * b,
3802
3910
  int s0,
3803
- int s1,
3804
- int p0,
3805
- int p1,
3806
- int d0,
3807
- int d1) {
3808
- struct lm_ggml_tensor * new_a = lm_ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
3809
- struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, new_a,
3810
- lm_ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
3811
- s0, s1, p0, p1, d0, d1, true, LM_GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
3812
- struct lm_ggml_tensor * new_b = lm_ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
3911
+ int d0) {
3912
+ return lm_ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
3913
+ }
3813
3914
 
3814
- new_a = lm_ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
3815
- struct lm_ggml_tensor * result = lm_ggml_mul_mat(ctx, new_a, new_b);
3816
- result = lm_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
3915
+ // lm_ggml_conv_transpose_1d
3817
3916
 
3818
- return result;
3917
+ static int64_t lm_ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3918
+ return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
3819
3919
  }
3820
- // lm_ggml_conv_2d
3821
3920
 
3822
- // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
3823
- // a: [OC,IC, KH, KW]
3824
- // b: [N, IC, IH, IW]
3825
- // result: [N, OH, OW, IC*KH*KW]
3826
- struct lm_ggml_tensor * lm_ggml_im2col(
3921
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d(
3827
3922
  struct lm_ggml_context * ctx,
3828
3923
  struct lm_ggml_tensor * a,
3829
3924
  struct lm_ggml_tensor * b,
3830
3925
  int s0,
3831
- int s1,
3832
3926
  int p0,
3833
- int p1,
3834
- int d0,
3835
- int d1,
3836
- bool is_2D,
3837
- enum lm_ggml_type dst_type) {
3838
- if(is_2D) {
3839
- LM_GGML_ASSERT(a->ne[2] == b->ne[2]);
3840
- } else {
3841
- LM_GGML_ASSERT(a->ne[1] == b->ne[1]);
3842
- LM_GGML_ASSERT(b->ne[3] == 1);
3843
- }
3844
-
3845
- const int64_t OH = is_2D ? lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
3846
- const int64_t OW = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
3927
+ int d0) {
3928
+ LM_GGML_ASSERT(lm_ggml_is_matrix(b));
3929
+ LM_GGML_ASSERT(a->ne[2] == b->ne[1]);
3930
+ LM_GGML_ASSERT(a->ne[3] == 1);
3847
3931
 
3848
- LM_GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
3849
- LM_GGML_ASSERT((OW > 0) && "b too small compared to a");
3932
+ LM_GGML_ASSERT(p0 == 0);
3933
+ LM_GGML_ASSERT(d0 == 1);
3850
3934
 
3851
3935
  const int64_t ne[4] = {
3852
- is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
3853
- OW,
3854
- is_2D ? OH : b->ne[2],
3855
- is_2D ? b->ne[3] : 1,
3936
+ lm_ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
3937
+ a->ne[1], b->ne[2], 1,
3856
3938
  };
3939
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
3857
3940
 
3858
- struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, dst_type, 4, ne);
3859
- int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3941
+ int32_t params[] = { s0, p0, d0 };
3860
3942
  lm_ggml_set_op_params(result, params, sizeof(params));
3861
3943
 
3862
- result->op = LM_GGML_OP_IM2COL;
3944
+ result->op = LM_GGML_OP_CONV_TRANSPOSE_1D;
3863
3945
  result->src[0] = a;
3864
3946
  result->src[1] = b;
3865
3947
 
3866
3948
  return result;
3867
3949
  }
3868
3950
 
3869
- struct lm_ggml_tensor * lm_ggml_im2col_back(
3870
- struct lm_ggml_context * ctx,
3871
- struct lm_ggml_tensor * a,
3872
- struct lm_ggml_tensor * b,
3873
- int64_t * ne,
3874
- int s0,
3875
- int s1,
3876
- int p0,
3877
- int p1,
3878
- int d0,
3879
- int d1,
3880
- bool is_2D) {
3881
- struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
3882
- int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3883
- lm_ggml_set_op_params(result, params, sizeof(params));
3884
-
3885
- result->op = LM_GGML_OP_IM2COL_BACK;
3886
- result->src[0] = a;
3887
- result->src[1] = b;
3888
-
3889
- return result;
3890
- }
3951
+ // lm_ggml_conv_2d
3891
3952
 
3892
3953
  // a: [OC,IC, KH, KW]
3893
3954
  // b: [N, IC, IH, IW]
@@ -3934,6 +3995,31 @@ struct lm_ggml_tensor * lm_ggml_conv_2d_s1_ph(
3934
3995
  return lm_ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
3935
3996
  }
3936
3997
 
3998
+ // lm_ggml_conv_2d_dw
3999
+
4000
+ struct lm_ggml_tensor * lm_ggml_conv_2d_dw(
4001
+ struct lm_ggml_context * ctx,
4002
+ struct lm_ggml_tensor * a,
4003
+ struct lm_ggml_tensor * b,
4004
+ int s0,
4005
+ int s1,
4006
+ int p0,
4007
+ int p1,
4008
+ int d0,
4009
+ int d1) {
4010
+ struct lm_ggml_tensor * new_a = lm_ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
4011
+ struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, new_a,
4012
+ lm_ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
4013
+ s0, s1, p0, p1, d0, d1, true, LM_GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
4014
+ struct lm_ggml_tensor * new_b = lm_ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
4015
+
4016
+ new_a = lm_ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
4017
+ struct lm_ggml_tensor * result = lm_ggml_mul_mat(ctx, new_a, new_b);
4018
+ result = lm_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
4019
+
4020
+ return result;
4021
+ }
4022
+
3937
4023
  // lm_ggml_conv_transpose_2d_p0
3938
4024
 
3939
4025
  static int64_t lm_ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
@@ -4110,6 +4196,37 @@ struct lm_ggml_tensor * lm_ggml_pad(
4110
4196
  return result;
4111
4197
  }
4112
4198
 
4199
+ // lm_ggml_pad_reflect_1d
4200
+
4201
+ struct lm_ggml_tensor * lm_ggml_pad_reflect_1d(
4202
+ struct lm_ggml_context * ctx,
4203
+ struct lm_ggml_tensor * a,
4204
+ int p0,
4205
+ int p1) {
4206
+ LM_GGML_ASSERT(p0 >= 0);
4207
+ LM_GGML_ASSERT(p1 >= 0);
4208
+
4209
+ LM_GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the
4210
+ LM_GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
4211
+
4212
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(a));
4213
+ LM_GGML_ASSERT(a->type == LM_GGML_TYPE_F32);
4214
+
4215
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor_4d(ctx, a->type,
4216
+ a->ne[0] + p0 + p1,
4217
+ a->ne[1],
4218
+ a->ne[2],
4219
+ a->ne[3]);
4220
+
4221
+ int32_t params[] = { p0, p1 };
4222
+ lm_ggml_set_op_params(result, params, sizeof(params));
4223
+
4224
+ result->op = LM_GGML_OP_PAD_REFLECT_1D;
4225
+ result->src[0] = a;
4226
+
4227
+ return result;
4228
+ }
4229
+
4113
4230
  // lm_ggml_arange
4114
4231
 
4115
4232
  struct lm_ggml_tensor * lm_ggml_arange(
@@ -5967,12 +6084,12 @@ struct lm_ggml_tensor * lm_ggml_graph_get_tensor(const struct lm_ggml_cgraph * c
5967
6084
 
5968
6085
  struct lm_ggml_tensor * lm_ggml_graph_get_grad(const struct lm_ggml_cgraph * cgraph, const struct lm_ggml_tensor * node) {
5969
6086
  const size_t igrad = lm_ggml_hash_find(&cgraph->visited_hash_set, node);
5970
- return igrad != LM_GGML_HASHSET_FULL && lm_ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
6087
+ return igrad != LM_GGML_HASHSET_FULL && lm_ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL;
5971
6088
  }
5972
6089
 
5973
6090
  struct lm_ggml_tensor * lm_ggml_graph_get_grad_acc(const struct lm_ggml_cgraph * cgraph, const struct lm_ggml_tensor * node) {
5974
6091
  const size_t igrad = lm_ggml_hash_find(&cgraph->visited_hash_set, node);
5975
- return igrad != LM_GGML_HASHSET_FULL && lm_ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
6092
+ return igrad != LM_GGML_HASHSET_FULL && lm_ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL;
5976
6093
  }
5977
6094
 
5978
6095
  void lm_ggml_graph_print(const struct lm_ggml_cgraph * cgraph) {
@@ -6284,9 +6401,6 @@ size_t lm_ggml_quantize_chunk(
6284
6401
  case LM_GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6285
6402
  case LM_GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6286
6403
  case LM_GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6287
- case LM_GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6288
- case LM_GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6289
- case LM_GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6290
6404
  case LM_GGML_TYPE_F16:
6291
6405
  {
6292
6406
  size_t elemsize = sizeof(lm_ggml_fp16_t);
@@ -6422,7 +6536,7 @@ struct lm_gguf_context {
6422
6536
  void * data;
6423
6537
  };
6424
6538
 
6425
- static size_t lm_gguf_type_size(enum lm_gguf_type type) {
6539
+ size_t lm_gguf_type_size(enum lm_gguf_type type) {
6426
6540
  LM_GGML_ASSERT(0 <= type && type < LM_GGUF_TYPE_COUNT);
6427
6541
  return LM_GGUF_TYPE_SIZE[type];
6428
6542
  }
@@ -6550,13 +6664,7 @@ struct lm_gguf_context * lm_gguf_init_empty(void) {
6550
6664
  return ctx;
6551
6665
  }
6552
6666
 
6553
- struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gguf_init_params params) {
6554
- FILE * file = lm_ggml_fopen(fname, "rb");
6555
- if (!file) {
6556
- fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
6557
- return NULL;
6558
- }
6559
-
6667
+ struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params) {
6560
6668
  // offset from start of file
6561
6669
  size_t offset = 0;
6562
6670
 
@@ -6569,7 +6677,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6569
6677
  for (uint32_t i = 0; i < sizeof(magic); i++) {
6570
6678
  if (magic[i] != LM_GGUF_MAGIC[i]) {
6571
6679
  fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
6572
- fclose(file);
6573
6680
  return NULL;
6574
6681
  }
6575
6682
  }
@@ -6580,7 +6687,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6580
6687
  struct lm_gguf_context * ctx = calloc(1, sizeof(struct lm_gguf_context));
6581
6688
  if (!ctx) {
6582
6689
  fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
6583
- fclose(file);
6584
6690
  return NULL;
6585
6691
  }
6586
6692
 
@@ -6598,7 +6704,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6598
6704
 
6599
6705
  if (ctx->header.version == 1) {
6600
6706
  fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
6601
- fclose(file);
6602
6707
  lm_gguf_free(ctx);
6603
6708
  return NULL;
6604
6709
  }
@@ -6611,7 +6716,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6611
6716
 
6612
6717
  if (!ok) {
6613
6718
  fprintf(stderr, "%s: failed to read header\n", __func__);
6614
- fclose(file);
6615
6719
  lm_gguf_free(ctx);
6616
6720
  return NULL;
6617
6721
  }
@@ -6621,12 +6725,13 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6621
6725
  {
6622
6726
  const uint64_t n_kv = ctx->header.n_kv;
6623
6727
 
6624
- ctx->kv = calloc(n_kv, sizeof(struct lm_gguf_kv));
6625
- if (!ctx->kv) {
6626
- fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
6627
- fclose(file);
6628
- lm_gguf_free(ctx);
6629
- return NULL;
6728
+ if (n_kv > 0) {
6729
+ ctx->kv = calloc(n_kv, sizeof(struct lm_gguf_kv));
6730
+ if (!ctx->kv) {
6731
+ fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
6732
+ lm_gguf_free(ctx);
6733
+ return NULL;
6734
+ }
6630
6735
  }
6631
6736
 
6632
6737
  for (uint64_t i = 0; i < n_kv; ++i) {
@@ -6673,7 +6778,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6673
6778
  // prevent from integer overflow in the malloc below
6674
6779
  if (kv->value.arr.n >= SIZE_MAX/lm_gguf_type_size(kv->value.arr.type)) {
6675
6780
  fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
6676
- fclose(file);
6677
6781
  lm_gguf_free(ctx);
6678
6782
  return NULL;
6679
6783
  }
@@ -6681,7 +6785,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6681
6785
  kv->value.arr.data = calloc(kv->value.arr.n, lm_gguf_type_size(kv->value.arr.type));
6682
6786
  if (!kv->value.arr.data) {
6683
6787
  fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
6684
- fclose(file);
6685
6788
  lm_gguf_free(ctx);
6686
6789
  return NULL;
6687
6790
  }
@@ -6693,7 +6796,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6693
6796
  // prevent from integer overflow in the malloc below
6694
6797
  if (kv->value.arr.n >= SIZE_MAX/sizeof(struct lm_gguf_str)) {
6695
6798
  fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
6696
- fclose(file);
6697
6799
  lm_gguf_free(ctx);
6698
6800
  return NULL;
6699
6801
  }
@@ -6701,7 +6803,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6701
6803
  kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct lm_gguf_str));
6702
6804
  if (!kv->value.arr.data) {
6703
6805
  fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
6704
- fclose(file);
6705
6806
  lm_gguf_free(ctx);
6706
6807
  return NULL;
6707
6808
  }
@@ -6732,7 +6833,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6732
6833
 
6733
6834
  if (!ok) {
6734
6835
  fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
6735
- fclose(file);
6736
6836
  lm_gguf_free(ctx);
6737
6837
  return NULL;
6738
6838
  }
@@ -6743,7 +6843,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6743
6843
  ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct lm_gguf_tensor_info));
6744
6844
  if (!ctx->infos) {
6745
6845
  fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
6746
- fclose(file);
6747
6846
  lm_gguf_free(ctx);
6748
6847
  return NULL;
6749
6848
  }
@@ -6779,7 +6878,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6779
6878
 
6780
6879
  if (!ok) {
6781
6880
  fprintf(stderr, "%s: failed to read tensor info\n", __func__);
6782
- fclose(file);
6783
6881
  lm_gguf_free(ctx);
6784
6882
  return NULL;
6785
6883
  }
@@ -6818,10 +6916,17 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6818
6916
  (int64_t) info->ne[2] *
6819
6917
  (int64_t) info->ne[3];
6820
6918
 
6821
- if (lm_ggml_blck_size(info->type) == 0 || ne % lm_ggml_blck_size(info->type) != 0) {
6919
+ if (lm_ggml_blck_size(info->type) == 0 ) {
6920
+ // this tensor type support have been removed:
6921
+ fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
6922
+ __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type));
6923
+ lm_gguf_free(ctx);
6924
+ return NULL;
6925
+ }
6926
+
6927
+ if (ne % lm_ggml_blck_size(info->type) != 0) {
6822
6928
  fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
6823
6929
  __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type));
6824
- fclose(file);
6825
6930
  lm_gguf_free(ctx);
6826
6931
  return NULL;
6827
6932
  }
@@ -6853,7 +6958,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6853
6958
  *params.ctx = lm_ggml_init(pdata);
6854
6959
  if (*params.ctx == NULL) {
6855
6960
  fprintf(stderr, "%s: failed to initialize context\n", __func__);
6856
- fclose(file);
6857
6961
  lm_gguf_free(ctx);
6858
6962
  return NULL;
6859
6963
  }
@@ -6872,7 +6976,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6872
6976
 
6873
6977
  if (!ok) {
6874
6978
  fprintf(stderr, "%s: failed to read tensor data\n", __func__);
6875
- fclose(file);
6876
6979
  lm_ggml_free(ctx_data);
6877
6980
  lm_gguf_free(ctx);
6878
6981
  return NULL;
@@ -6911,7 +7014,6 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6911
7014
 
6912
7015
  if (!ok) {
6913
7016
  fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
6914
- fclose(file);
6915
7017
  lm_ggml_free(ctx_data);
6916
7018
  lm_gguf_free(ctx);
6917
7019
  return NULL;
@@ -6920,11 +7022,21 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6920
7022
  lm_ggml_set_no_alloc(ctx_data, params.no_alloc);
6921
7023
  }
6922
7024
 
6923
- fclose(file);
6924
-
6925
7025
  return ctx;
6926
7026
  }
6927
7027
 
7028
+ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gguf_init_params params) {
7029
+ FILE * file = lm_ggml_fopen(fname, "rb");
7030
+ if (!file) {
7031
+ fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
7032
+ return NULL;
7033
+ }
7034
+
7035
+ struct lm_gguf_context * result = lm_gguf_init_from_file_impl(file, params);
7036
+ fclose(file);
7037
+ return result;
7038
+ }
7039
+
6928
7040
  void lm_gguf_free(struct lm_gguf_context * ctx) {
6929
7041
  if (ctx == NULL) {
6930
7042
  return;
@@ -7384,13 +7496,7 @@ void lm_gguf_set_tensor_data(struct lm_gguf_context * ctx, const char * name, co
7384
7496
  // fwrite(val, sizeof(char), size, file);
7385
7497
  //}
7386
7498
 
7387
- struct lm_gguf_buf {
7388
- void * data;
7389
- size_t size;
7390
- size_t offset;
7391
- };
7392
-
7393
- static struct lm_gguf_buf lm_gguf_buf_init(size_t size) {
7499
+ struct lm_gguf_buf lm_gguf_buf_init(size_t size) {
7394
7500
  struct lm_gguf_buf buf = {
7395
7501
  /*buf.data =*/ size == 0 ? NULL : LM_GGML_CALLOC(1, size),
7396
7502
  /*buf.size =*/ size,
@@ -7400,7 +7506,7 @@ static struct lm_gguf_buf lm_gguf_buf_init(size_t size) {
7400
7506
  return buf;
7401
7507
  }
7402
7508
 
7403
- static void lm_gguf_buf_free(struct lm_gguf_buf buf) {
7509
+ void lm_gguf_buf_free(struct lm_gguf_buf buf) {
7404
7510
  if (buf.data) {
7405
7511
  LM_GGML_FREE(buf.data);
7406
7512
  }
@@ -7438,7 +7544,7 @@ static void lm_gguf_bwrite_el(struct lm_gguf_buf * buf, const void * val, size_t
7438
7544
  buf->offset += el_size;
7439
7545
  }
7440
7546
 
7441
- static void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, struct lm_gguf_buf * buf, bool only_meta) {
7547
+ void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, struct lm_gguf_buf * buf, bool only_meta) {
7442
7548
  // write header
7443
7549
  lm_gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
7444
7550
  lm_gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));