cui-llama.rn 1.6.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +16 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -1
  4. package/android/src/main/jni.cpp +20 -4
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  13. package/cpp/LICENSE +21 -0
  14. package/cpp/chat.cpp +1 -1
  15. package/cpp/common.cpp +17 -2
  16. package/cpp/common.h +7 -3
  17. package/cpp/ggml-alloc.c +4 -1
  18. package/cpp/ggml-cpp.h +1 -1
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  25. package/cpp/ggml-cpu/common.h +72 -0
  26. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -101
  27. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +109 -42
  28. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +3 -0
  29. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +246 -160
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  31. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  32. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  33. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  34. package/cpp/ggml-cpu.h +5 -0
  35. package/cpp/ggml-impl.h +16 -9
  36. package/cpp/ggml-llama-sim.metallib +0 -0
  37. package/cpp/ggml-llama.metallib +0 -0
  38. package/cpp/ggml-metal.m +492 -47
  39. package/cpp/ggml.c +134 -244
  40. package/cpp/ggml.h +61 -94
  41. package/cpp/json-schema-to-grammar.cpp +3 -0
  42. package/cpp/llama-arch.cpp +46 -17
  43. package/cpp/llama-arch.h +9 -0
  44. package/cpp/llama-batch.cpp +5 -1
  45. package/cpp/llama-batch.h +2 -1
  46. package/cpp/llama-chat.cpp +31 -10
  47. package/cpp/llama-chat.h +3 -2
  48. package/cpp/llama-context.cpp +104 -489
  49. package/cpp/llama-context.h +14 -30
  50. package/cpp/llama-graph.cpp +69 -62
  51. package/cpp/llama-graph.h +21 -18
  52. package/cpp/llama-hparams.h +5 -0
  53. package/cpp/llama-kv-cache.cpp +1497 -391
  54. package/cpp/llama-kv-cache.h +272 -80
  55. package/cpp/llama-memory.h +11 -1
  56. package/cpp/llama-model.cpp +502 -176
  57. package/cpp/llama-model.h +13 -3
  58. package/cpp/llama-sampling.cpp +2 -1
  59. package/cpp/llama-vocab.cpp +8 -1
  60. package/cpp/llama.h +14 -11
  61. package/cpp/rn-llama.cpp +20 -172
  62. package/cpp/rn-llama.h +1 -5
  63. package/ios/CMakeLists.txt +13 -10
  64. package/ios/RNLlama.h +6 -0
  65. package/ios/RNLlama.mm +5 -0
  66. package/ios/RNLlamaContext.mm +26 -28
  67. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +7 -3
  68. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  69. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  70. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  71. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +61 -94
  72. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  73. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  74. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  75. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  76. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  77. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  78. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  79. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  80. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  81. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +14 -11
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  85. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  86. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  87. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  88. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  89. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  90. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  91. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  92. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  93. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  94. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  95. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  96. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  97. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  98. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  99. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  100. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  101. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  102. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  103. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +7 -3
  104. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  105. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  106. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  107. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +61 -94
  108. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  109. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  110. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  111. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  112. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  113. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  114. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  115. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  116. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  117. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +14 -11
  118. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  119. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  120. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  121. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  122. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  123. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  124. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  125. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  126. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  127. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  128. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  129. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  130. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  131. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  132. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  133. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  134. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  135. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  136. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  137. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  138. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  139. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  140. package/lib/module/NativeRNLlama.js.map +1 -1
  141. package/lib/typescript/NativeRNLlama.d.ts +4 -0
  142. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  143. package/package.json +1 -1
  144. package/src/NativeRNLlama.ts +5 -0
  145. package/cpp/binary-ops.h +0 -16
  146. package/cpp/ops.h +0 -128
  147. package/cpp/simd-mappings.h +0 -888
  148. package/cpp/unary-ops.h +0 -28
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  157. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +0 -802
  175. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  176. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  177. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  178. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  179. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  180. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  181. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  182. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  183. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  184. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  185. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  186. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  187. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  188. /package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +0 -0
  189. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  190. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  191. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  192. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  193. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
  194. /package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -0
  195. /package/cpp/{vec.h → ggml-cpu/vec.h} +0 -0
@@ -215,7 +215,7 @@ static const struct lm_ggml_type_traits_cpu type_traits_cpu[LM_GGML_TYPE_COUNT]
  .nrows = 1,
  },
  [LM_GGML_TYPE_F16] = {
- .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_fp16_row,
+ .from_float = (lm_ggml_from_float_t) lm_ggml_cpu_fp32_to_fp16,
  .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_f16,
  .vec_dot_type = LM_GGML_TYPE_F16,
  .nrows = 1,
@@ -356,7 +356,7 @@ static const struct lm_ggml_type_traits_cpu type_traits_cpu[LM_GGML_TYPE_COUNT]
  .from_float = quantize_row_q8_K,
  },
  [LM_GGML_TYPE_BF16] = {
- .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
+ .from_float = (lm_ggml_from_float_t) lm_ggml_cpu_fp32_to_bf16,
  .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
  .vec_dot_type = LM_GGML_TYPE_BF16,
  .nrows = 1,
@@ -1932,6 +1932,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
  {
  lm_ggml_compute_forward_im2col_back_f32(params, tensor);
  } break;
+ case LM_GGML_OP_CONV_2D_DW:
+ {
+ lm_ggml_compute_forward_conv_2d_dw(params, tensor);
+ } break;
  case LM_GGML_OP_CONV_TRANSPOSE_2D:
  {
  lm_ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2027,41 +2031,6 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
  {
  lm_ggml_compute_forward_rwkv_wkv7(params, tensor);
  } break;
- case LM_GGML_OP_MAP_UNARY:
- {
- lm_ggml_unary_op_f32_t fun;
- memcpy(&fun, tensor->op_params, sizeof(fun));
- lm_ggml_compute_forward_map_unary(params, tensor, fun);
- }
- break;
- case LM_GGML_OP_MAP_BINARY:
- {
- lm_ggml_binary_op_f32_t fun;
- memcpy(&fun, tensor->op_params, sizeof(fun));
- lm_ggml_compute_forward_map_binary(params, tensor, fun);
- }
- break;
- case LM_GGML_OP_MAP_CUSTOM1_F32:
- {
- lm_ggml_custom1_op_f32_t fun;
- memcpy(&fun, tensor->op_params, sizeof(fun));
- lm_ggml_compute_forward_map_custom1_f32(params, tensor, fun);
- }
- break;
- case LM_GGML_OP_MAP_CUSTOM2_F32:
- {
- lm_ggml_custom2_op_f32_t fun;
- memcpy(&fun, tensor->op_params, sizeof(fun));
- lm_ggml_compute_forward_map_custom2_f32(params, tensor, fun);
- }
- break;
- case LM_GGML_OP_MAP_CUSTOM3_F32:
- {
- lm_ggml_custom3_op_f32_t fun;
- memcpy(&fun, tensor->op_params, sizeof(fun));
- lm_ggml_compute_forward_map_custom3_f32(params, tensor, fun);
- }
- break;
  case LM_GGML_OP_MAP_CUSTOM1:
  {
  lm_ggml_compute_forward_map_custom1(params, tensor);
@@ -2077,6 +2046,11 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
  lm_ggml_compute_forward_map_custom3(params, tensor);
  }
  break;
+ case LM_GGML_OP_CUSTOM:
+ {
+ lm_ggml_compute_forward_custom(params, tensor);
+ }
+ break;
  case LM_GGML_OP_CROSS_ENTROPY_LOSS:
  {
  lm_ggml_compute_forward_cross_entropy_loss(params, tensor);
@@ -2298,6 +2272,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
  } break;
  case LM_GGML_OP_IM2COL:
  case LM_GGML_OP_IM2COL_BACK:
+ case LM_GGML_OP_CONV_2D_DW:
  case LM_GGML_OP_CONV_TRANSPOSE_1D:
  case LM_GGML_OP_CONV_TRANSPOSE_2D:
  {
@@ -2328,11 +2303,6 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
  case LM_GGML_OP_WIN_PART:
  case LM_GGML_OP_WIN_UNPART:
  case LM_GGML_OP_GET_REL_POS:
- case LM_GGML_OP_MAP_UNARY:
- case LM_GGML_OP_MAP_BINARY:
- case LM_GGML_OP_MAP_CUSTOM1_F32:
- case LM_GGML_OP_MAP_CUSTOM2_F32:
- case LM_GGML_OP_MAP_CUSTOM3_F32:
  {
  n_tasks = 1;
  } break;
@@ -2366,6 +2336,16 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
  n_tasks = MIN(p.n_tasks, n_threads);
  }
  } break;
+ case LM_GGML_OP_CUSTOM:
+ {
+ struct lm_ggml_custom_op_params p;
+ memcpy(&p, node->op_params, sizeof(p));
+ if (p.n_tasks == LM_GGML_N_TASKS_MAX) {
+ n_tasks = n_threads;
+ } else {
+ n_tasks = MIN(p.n_tasks, n_threads);
+ }
+ } break;
  case LM_GGML_OP_CROSS_ENTROPY_LOSS:
  case LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK:
  case LM_GGML_OP_OPT_STEP_ADAMW:
@@ -3186,6 +3166,93 @@ enum lm_ggml_status lm_ggml_graph_compute_with_ctx(struct lm_ggml_context * ctx,
  return lm_ggml_graph_compute(cgraph, &cplan);
  }

+ void lm_ggml_cpu_fp32_to_fp16(const float * x, lm_ggml_fp16_t * y, int64_t n) {
+ int64_t i = 0;
+ #if defined(__F16C__)
+ #if defined(__AVX512F__)
+ for (; i + 15 < n; i += 16) {
+ __m512 x_vec = _mm512_loadu_ps(x + i);
+ __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+ _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+ }
+ #endif
+ for (; i + 7 < n; i += 8) {
+ __m256 x_vec = _mm256_loadu_ps(x + i);
+ __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+ _mm_storeu_si128((__m128i *)(y + i), y_vec);
+ }
+ for (; i + 3 < n; i += 4) {
+ __m128 x_vec = _mm_loadu_ps(x + i);
+ __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+ _mm_storel_epi64((__m128i *)(y + i), y_vec);
+ }
+ #endif
+ for (; i < n; ++i) {
+ y[i] = LM_GGML_FP32_TO_FP16(x[i]);
+ }
+ }
+
+ void lm_ggml_cpu_fp16_to_fp32(const lm_ggml_fp16_t * x, float * y, int64_t n) {
+ int64_t i = 0;
+ #if defined(__F16C__)
+ #if defined(__AVX512F__)
+ for (; i + 15 < n; i += 16) {
+ __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+ __m512 y_vec = _mm512_cvtph_ps(x_vec);
+ _mm512_storeu_ps(y + i, y_vec);
+ }
+ #endif
+ for (; i + 7 < n; i += 8) {
+ __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+ __m256 y_vec = _mm256_cvtph_ps(x_vec);
+ _mm256_storeu_ps(y + i, y_vec);
+ }
+ for (; i + 3 < n; i += 4) {
+ __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+ __m128 y_vec = _mm_cvtph_ps(x_vec);
+ _mm_storeu_ps(y + i, y_vec);
+ }
+ #endif
+ for (; i < n; ++i) {
+ y[i] = LM_GGML_FP16_TO_FP32(x[i]);
+ }
+ }
+
+ void lm_ggml_cpu_fp32_to_bf16(const float * x, lm_ggml_bf16_t * y, int64_t n) {
+ int64_t i = 0;
+ for (; i < n; ++i) {
+ y[i] = LM_GGML_FP32_TO_BF16(x[i]);
+ }
+ }
+
+ void lm_ggml_cpu_bf16_to_fp32(const lm_ggml_bf16_t * x, float * y, int64_t n) {
+ int64_t i = 0;
+ #if defined(__AVX2__)
+ #if defined(__AVX512F__)
+ for (; i + 15 < n; i += 16) {
+ _mm512_storeu_ps(y + i,
+ _mm512_castsi512_ps(
+ _mm512_slli_epi32(
+ _mm512_cvtepu16_epi32(
+ _mm256_loadu_si256(
+ (const __m256i *)(x + i))),
+ 16)));
+ }
+ #endif
+ for (; i + 7 < n; i += 8) {
+ _mm256_storeu_ps(y + i,
+ _mm256_castsi256_ps(
+ _mm256_slli_epi32(
+ _mm256_cvtepu16_epi32(
+ _mm_loadu_si128(
+ (const __m128i *)(x + i))),
+ 16)));
+ }
+ #endif
+ for (; i < n; i++) {
+ y[i] = LM_GGML_BF16_TO_FP32(x[i]);
+ }
+ }

  int lm_ggml_cpu_has_avx(void) {
  #if defined(__AVX__)
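For orientation, the four lm_ggml_cpu_* helpers added in this hunk supersede the per-row converters (lm_ggml_fp32_to_fp16_row and friends) in the CPU backend's type traits, and they take an explicit element count. A minimal round-trip sketch, assuming the declarations are visible through ggml-cpu.h (that header only appears above as +5 -0, so the include is an assumption):

    // Sketch only: convert a small FP32 buffer to FP16 and back with the new helpers.
    #include "ggml-cpu.h"   // assumed to declare lm_ggml_cpu_fp32_to_fp16 / lm_ggml_cpu_fp16_to_fp32
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float>          src = {0.5f, -1.25f, 3.0f, 100.0f};
        std::vector<lm_ggml_fp16_t> half(src.size());
        std::vector<float>          back(src.size());

        lm_ggml_cpu_fp32_to_fp16(src.data(), half.data(), (int64_t) src.size());
        lm_ggml_cpu_fp16_to_fp32(half.data(), back.data(), (int64_t) back.size());

        for (size_t i = 0; i < src.size(); ++i) {
            std::printf("%g -> %g\n", src[i], back[i]);   // values round-trip within FP16 precision
        }
        return 0;
    }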
@@ -4,6 +4,7 @@
  #include "ggml-cpu-aarch64.h"
  #include "ggml-cpu-traits.h"
  #include "ggml-impl.h"
+ #include "amx/amx.h"

  #include <cctype>
  #include <string>
@@ -424,6 +425,8 @@ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, co
  }
  case LM_GGML_OP_IM2COL_BACK:
  return src0->type == LM_GGML_TYPE_F32 && src1->type == LM_GGML_TYPE_F32;
+ case LM_GGML_OP_GET_ROWS_BACK:
+ return src0->type == LM_GGML_TYPE_F32 || src0->type == LM_GGML_TYPE_F16;
  case LM_GGML_OP_OUT_PROD:
  return (src0->type == LM_GGML_TYPE_F32 || (lm_ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
  src1->type == LM_GGML_TYPE_F32 && op->type == LM_GGML_TYPE_F32;
@@ -4222,7 +4222,7 @@ static void lm_ggml_compute_forward_get_rows_f16(

  LM_GGML_ASSERT(i01 >= 0 && i01 < ne01);

- lm_ggml_fp16_to_fp32_row(
+ lm_ggml_cpu_fp16_to_fp32(
  (const lm_ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
  }
@@ -4263,7 +4263,7 @@ static void lm_ggml_compute_forward_get_rows_bf16(

  LM_GGML_ASSERT(i01 >= 0 && i01 < ne01);

- lm_ggml_bf16_to_fp32_row(
+ lm_ggml_cpu_bf16_to_fp32(
  (const lm_ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
  }
@@ -6064,6 +6064,178 @@ void lm_ggml_compute_forward_conv_transpose_2d(
  }
  }

+ // lm_ggml_compute_forward_conv_2d_dw
+
+ struct lm_ggml_conv_2d_dw_params {
+ int64_t channels;
+ int64_t batch;
+ int64_t src_w;
+ int64_t src_h;
+ int64_t dst_w;
+ int64_t dst_h;
+ int64_t knl_w;
+ int64_t knl_h;
+ int stride_x;
+ int stride_y;
+ int pad_x;
+ int pad_y;
+ int dilation_x;
+ int dilation_y;
+ };
+
+ static void lm_ggml_compute_forward_conv_2d_dw_cwhn(
+ const lm_ggml_compute_params * params,
+ const lm_ggml_tensor * src,
+ const lm_ggml_tensor * kernel,
+ lm_ggml_tensor * dst,
+ const lm_ggml_conv_2d_dw_params & p) {
+
+ const int64_t c = p.channels;
+ const float * knl_data = (const float *)kernel->data;
+
+ const int64_t rows_total = p.dst_h * p.batch;
+ const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+ const int64_t row_start = params->ith * rows_per_thread;
+ const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+ #ifdef LM_GGML_SIMD
+ const int64_t pkg_size = LM_GGML_F32_EPR;
+ const int64_t pkg_count = c / pkg_size;
+ const int64_t c_pkg_end = pkg_count * pkg_size;
+ #else
+ const int64_t c_pkg_end = 0;
+ #endif
+
+ for (int64_t row = row_start; row < row_end; ++row) {
+ const int64_t dst_y = row % p.dst_h;
+ const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
+ for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+ float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
+ const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
+ const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
+
+ #ifdef LM_GGML_SIMD
+ // Vectorized loop
+ for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
+ LM_GGML_F32_VEC sum = LM_GGML_F32_VEC_ZERO;
+ for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+ const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+ if (src_y < 0 || src_y >= p.src_h) {
+ continue;
+ }
+ for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+ const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+ if (src_x < 0 || src_x >= p.src_w) {
+ continue;
+ }
+ LM_GGML_F32_VEC k = LM_GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
+ LM_GGML_F32_VEC s = LM_GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
+ sum = LM_GGML_F32_VEC_FMA(sum, k, s);
+ }
+ }
+ LM_GGML_F32_VEC_STORE(dst_data + c_i, sum);
+ }
+ #endif
+ // Scalar loop
+ for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
+ float sum = 0.0f;
+ for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+ const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+ if (src_y < 0 || src_y >= p.src_h) {
+ continue;
+ }
+ for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+ const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+ if (src_x < 0 || src_x >= p.src_w) {
+ continue;
+ }
+ sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
+ * src_data[(src_y * p.src_w + src_x) * c + c_i];
+ }
+ }
+ dst_data[c_i] = sum;
+ }
+ }
+ }
+ }
+
+ static void lm_ggml_compute_forward_conv_2d_dw_whcn(
+ const lm_ggml_compute_params * params,
+ const lm_ggml_tensor * src,
+ const lm_ggml_tensor * kernel,
+ lm_ggml_tensor * dst,
+ const lm_ggml_conv_2d_dw_params & p) {
+
+ const int64_t n = p.channels * p.batch;
+ const int64_t per_thread = (n + params->nth - 1) / params->nth;
+ const int64_t start = params->ith * per_thread;
+ const int64_t end = MIN(start + per_thread, n);
+
+ for (int64_t i = start; i < end; ++i) {
+ const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
+ const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
+ float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
+
+ for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
+ for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+
+ float sum = 0.0f;
+ for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+ const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
+ if (src_y < 0 || src_y >= p.src_h) {
+ continue;
+ }
+ for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+ const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
+ if (src_x < 0 || src_x >= p.src_w) {
+ continue;
+ }
+ sum += knl_data[knl_y * p.knl_w + knl_x]
+ * src_data[src_y * p.src_w + src_x];
+ }
+ }
+ dst_data[dst_y * p.dst_w + dst_x] = sum;
+ }
+ }
+ }
+ }
+
+ void lm_ggml_compute_forward_conv_2d_dw(
+ const lm_ggml_compute_params * params,
+ lm_ggml_tensor * dst) {
+
+ const lm_ggml_tensor * kernel = dst->src[0];
+ const lm_ggml_tensor * src = dst->src[1];
+ lm_ggml_conv_2d_dw_params p;
+ p.channels = src->ne[2];
+ p.batch = src->ne[3];
+ p.src_w = src->ne[0];
+ p.src_h = src->ne[1];
+ p.dst_w = dst->ne[0];
+ p.dst_h = dst->ne[1];
+ p.knl_w = kernel->ne[0];
+ p.knl_h = kernel->ne[1];
+ p.stride_x = dst->op_params[0];
+ p.stride_y = dst->op_params[1];
+ p.pad_x = dst->op_params[2];
+ p.pad_y = dst->op_params[3];
+ p.dilation_x = dst->op_params[4];
+ p.dilation_y = dst->op_params[5];
+
+ LM_GGML_ASSERT(kernel->ne[3] == p.channels);
+ LM_GGML_ASSERT(dst->ne[3] == p.batch);
+
+ if (lm_ggml_is_contiguous(src)) {
+ lm_ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
+ } else if (lm_ggml_is_contiguous_channels(src)) {
+ // kernel should also have channels most contiguous in memory
+ LM_GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
+ lm_ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
+ } else {
+ LM_GGML_ABORT("non-contiguous memory layout not supported");
+ }
+ }
+
  // lm_ggml_compute_forward_pool_1d_sk_p0

  static void lm_ggml_compute_forward_pool_1d_sk_p0(
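For reference, lm_ggml_compute_forward_conv_2d_dw above pulls its geometry out of dst->op_params in the order stride_x, stride_y, pad_x, pad_y, dilation_x, dilation_y, and then dispatches to the WHCN or CWHN kernel depending on the source layout. The output size those parameters imply follows the usual convolution arithmetic; a small illustrative helper, not part of the package and with hypothetical names:

    #include <cstdint>

    // Spatial output size for one dimension of the depthwise conv, matching the
    // stride/pad/dilation packed in op_params[0..5].
    static int64_t conv_2d_dw_out_size(int64_t in, int64_t kernel,
                                       int stride, int pad, int dilation) {
        const int64_t eff = (int64_t) dilation * (kernel - 1) + 1;  // dilated kernel extent
        return (in + 2 * (int64_t) pad - eff) / stride + 1;
    }

    // Example: a 64x64 input with a 3x3 kernel, stride 1, pad 1, dilation 1 keeps its size:
    // conv_2d_dw_out_size(64, 3, 1, 1, 1) == 64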
@@ -6351,24 +6523,72 @@ static void lm_ggml_compute_forward_upscale_f32(
  const float sf2 = (float)ne2/src0->ne[2];
  const float sf3 = (float)ne3/src0->ne[3];

- // TODO: optimize
+ const lm_ggml_scale_mode mode = (lm_ggml_scale_mode) lm_ggml_get_op_params_i32(dst, 0);

- for (int64_t i3 = 0; i3 < ne3; i3++) {
- const int64_t i03 = i3 / sf3;
- for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
- const int64_t i02 = i2 / sf2;
- for (int64_t i1 = 0; i1 < ne1; i1++) {
- const int64_t i01 = i1 / sf1;
- for (int64_t i0 = 0; i0 < ne0; i0++) {
- const int64_t i00 = i0 / sf0;
+ if (mode == LM_GGML_SCALE_MODE_NEAREST) {
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ const int64_t i03 = i3 / sf3;
+ for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+ const int64_t i02 = i2 / sf2;
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
+ const int64_t i01 = i1 / sf1;
+ for (int64_t i0 = 0; i0 < ne0; i0++) {
+ const int64_t i00 = i0 / sf0;

- const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
- float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+ const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+ float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);

- *y = *x;
+ *y = *x;
+ }
+ }
+ }
+ }
+ } else if (mode == LM_GGML_SCALE_MODE_BILINEAR) {
+ // setting a pixel offset of 0 would replicate the behavior of pytorch interpolate with align_corners=True
+ const float pixel_offset = 0.5f;
+
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ const int64_t i03 = i3 / sf3;
+ for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+ const int64_t i02 = i2 / sf2;
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
+ const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
+ int64_t y0 = (int64_t)floorf(y);
+ int64_t y1 = y0 + 1;
+
+ y0 = std::max(int64_t(0), std::min(y0, ne01 - 1));
+ y1 = std::max(int64_t(0), std::min(y1, ne01 - 1));
+
+ float dy = y - (float)y0;
+ dy = std::max(0.0f, std::min(dy, 1.0f));
+
+ for (int64_t i0 = 0; i0 < ne0; i0++) {
+ const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
+ int64_t x0 = (int64_t)floorf(x);
+ int64_t x1 = x0 + 1;
+
+ x0 = std::max(int64_t(0), std::min(x0, ne00 - 1));
+ x1 = std::max(int64_t(0), std::min(x1, ne00 - 1));
+
+ float dx = x - (float)x0;
+ dx = std::max(0.0f, std::min(dx, 1.0f));
+
+ // fetch the four surrounding pixel values and interpolate
+ const float a = *(const float *)((const char *)src0->data + x0*nb00 + y0*nb01 + i02*nb02 + i03*nb03);
+ const float b = *(const float *)((const char *)src0->data + x1*nb00 + y0*nb01 + i02*nb02 + i03*nb03);
+ const float c = *(const float *)((const char *)src0->data + x0*nb00 + y1*nb01 + i02*nb02 + i03*nb03);
+ const float d = *(const float *)((const char *)src0->data + x1*nb00 + y1*nb01 + i02*nb02 + i03*nb03);
+
+ const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
+
+ float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+ *y_dst = val;
+ }
  }
  }
  }
+ } else {
+ LM_GGML_ABORT("unsupported upscale mode");
  }
  }

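The bilinear branch added above maps each output index i to a source coordinate (i + 0.5) / sf - 0.5, clamps the two neighbouring indices, and blends the four surrounding samples. The same blend in isolation, as a standalone sketch that is not part of the diff:

    // Bilinear blend of the four neighbours a=(x0,y0), b=(x1,y0), c=(x0,y1), d=(x1,y1),
    // with dx, dy the fractional offsets in [0, 1].
    static float bilerp(float a, float b, float c, float d, float dx, float dy) {
        return a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
    }

    // At dx = dy = 0 this returns a exactly; at dx = dy = 0.5 it returns the average of all four samples.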
@@ -8268,152 +8488,6 @@ void lm_ggml_compute_forward_rwkv_wkv7(
  }
  }

- // lm_ggml_compute_forward_map_unary
-
- static void lm_ggml_compute_forward_map_unary_f32(
- const lm_ggml_compute_params * params,
- lm_ggml_tensor * dst,
- const lm_ggml_unary_op_f32_t fun) {
-
- const lm_ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(lm_ggml_is_contiguous_1(src0));
- assert(lm_ggml_is_contiguous_1(dst));
- assert(lm_ggml_are_same_shape(src0, dst));
-
- const int n = lm_ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- fun(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
- }
-
- void lm_ggml_compute_forward_map_unary(
- const lm_ggml_compute_params * params,
- lm_ggml_tensor * dst,
- const lm_ggml_unary_op_f32_t fun) {
-
- const lm_ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case LM_GGML_TYPE_F32:
- {
- lm_ggml_compute_forward_map_unary_f32(params, dst, fun);
- } break;
- default:
- {
- LM_GGML_ABORT("fatal error");
- }
- }
- }
-
- // lm_ggml_compute_forward_map_binary
-
- static void lm_ggml_compute_forward_map_binary_f32(
- const lm_ggml_compute_params * params,
- lm_ggml_tensor * dst,
- const lm_ggml_binary_op_f32_t fun) {
-
- const lm_ggml_tensor * src0 = dst->src[0];
- const lm_ggml_tensor * src1 = dst->src[1];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(lm_ggml_is_contiguous_1(src0));
- assert(lm_ggml_is_contiguous_1(src1));
- assert(lm_ggml_is_contiguous_1(dst));
- assert(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst));
-
- const int n = lm_ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- fun(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])),
- (float *) ((char *) src1->data + i*(src1->nb[1])));
- }
- }
-
- void lm_ggml_compute_forward_map_binary(
- const lm_ggml_compute_params * params,
- lm_ggml_tensor * dst,
- const lm_ggml_binary_op_f32_t fun) {
-
- const lm_ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case LM_GGML_TYPE_F32:
- {
- lm_ggml_compute_forward_map_binary_f32(params, dst, fun);
- } break;
- default:
- {
- LM_GGML_ABORT("fatal error");
- }
- }
- }
-
- // lm_ggml_compute_forward_map_custom1
-
- void lm_ggml_compute_forward_map_custom1_f32(
- const lm_ggml_compute_params * params,
- lm_ggml_tensor * dst,
- const lm_ggml_custom1_op_f32_t fun) {
-
- const lm_ggml_tensor * a = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- fun(dst, a);
- }
-
- // lm_ggml_compute_forward_map_custom2
-
- void lm_ggml_compute_forward_map_custom2_f32(
- const lm_ggml_compute_params * params,
- lm_ggml_tensor * dst,
- const lm_ggml_custom2_op_f32_t fun) {
-
- const lm_ggml_tensor * a = dst->src[0];
- const lm_ggml_tensor * b = dst->src[1];
-
- if (params->ith != 0) {
- return;
- }
-
- fun(dst, a, b);
- }
-
- // lm_ggml_compute_forward_map_custom3
-
- void lm_ggml_compute_forward_map_custom3_f32(
- const lm_ggml_compute_params * params,
- lm_ggml_tensor * dst,
- const lm_ggml_custom3_op_f32_t fun) {
-
- const lm_ggml_tensor * a = dst->src[0];
- const lm_ggml_tensor * b = dst->src[1];
- const lm_ggml_tensor * c = dst->src[1];
-
- if (params->ith != 0) {
- return;
- }
-
- fun(dst, a, b, c);
- }
-
  // lm_ggml_compute_forward_map_custom1

  void lm_ggml_compute_forward_map_custom1(
@@ -8459,6 +8533,18 @@ void lm_ggml_compute_forward_map_custom3(
  p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
  }

+ // lm_ggml_compute_forward_custom
+
+ void lm_ggml_compute_forward_custom(
+ const struct lm_ggml_compute_params * params,
+ struct lm_ggml_tensor * dst) {
+
+ struct lm_ggml_custom_op_params p;
+ memcpy(&p, dst->op_params, sizeof(p));
+
+ p.fun(dst, params->ith, params->nth, p.userdata);
+ }
+
  // lm_ggml_compute_forward_cross_entropy_loss

  static void lm_ggml_compute_forward_cross_entropy_loss_f32(
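The new LM_GGML_OP_CUSTOM forward above simply unpacks lm_ggml_custom_op_params from op_params and calls p.fun(dst, ith, nth, userdata). A hedged sketch of a callback with that call shape, splitting rows across threads the way the removed map_* helpers did; the exact callback typedef lives in ggml.h and is not shown in this diff, so the signature and names below are inferred, not authoritative:

    #include "ggml.h"   // assumed to provide lm_ggml_tensor and lm_ggml_nrows

    struct my_custom_state {
        float scale;    // hypothetical extra state handed through userdata
    };

    static void my_custom_op(struct lm_ggml_tensor * dst, int ith, int nth, void * userdata) {
        const my_custom_state * st = (const my_custom_state *) userdata;
        const int64_t nrows = lm_ggml_nrows(dst);
        const int64_t nc    = dst->ne[0];
        // interleave rows across the nth worker threads; assumes contiguous F32 rows
        for (int64_t r = ith; r < nrows; r += nth) {
            float * row = (float *) ((char *) dst->data + r * dst->nb[1]);
            for (int64_t i = 0; i < nc; ++i) {
                row[i] *= st->scale;
            }
        }
    }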