npm - @novastera-oss/llamarn - Versions diffs - 0.4.1 → 0.4.4 - Mend

@novastera-oss/llamarn 0.4.1 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (976) hide show

package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h ADDED Viewed

@@ -0,0 +1,149 @@
+#ifndef OPS_UTILS_H
+#define OPS_UTILS_H
+#include "htp-msg.h"
+#ifndef MAX
+#    define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+#ifndef MIN
+#    define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+static inline uint64_t htp_get_cycles() {
+    uint64_t cycles = 0;
+    asm volatile(" %0 = c15:14\n" : "=r"(cycles));
+    return cycles;
+}
+static inline uint64_t htp_get_pktcnt() {
+    uint64_t pktcnt;
+    asm volatile(" %0 = c19:18\n" : "=r"(pktcnt));
+    return pktcnt;
+}
+static inline int32_t htp_is_aligned(void * addr, uint32_t align) {
+    return ((size_t) addr & (align - 1)) == 0;
+}
+static inline uint32_t htp_round_up(uint32_t n, uint32_t m) {
+    return m * ((n + m - 1) / m);
+}
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
+// Precompute mp (m' in the paper) and L such that division
+// can be computed using a multiply (high 32b of 64b result)
+// and a shift:
+//
+// n/d = (mulhi(n, mp) + n) >> L;
+struct fastdiv_values {
+    uint32_t mp;
+    uint32_t l;
+};
+static inline struct fastdiv_values init_fastdiv_values(uint32_t d) {
+    struct fastdiv_values result = { 0, 0 };
+    // compute L = ceil(log2(d));
+    while (result.l < 32 && ((uint32_t) 1 << result.l) < d) {
+        ++(result.l);
+    }
+    result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1);
+    return result;
+}
+static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) {
+    // Compute high 32 bits of n * mp
+    const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32);  // mulhi(n, mp)
+    // add n, apply bit shift
+    return (hi + n) >> vals->l;
+}
+static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) {
+    return n - fastdiv(n, vals) * d;
+}
+static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) {
+    const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
+    asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control));
+}
+static inline int32_t htp_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
+    uint32_t left_off  = (size_t) addr & (chunk_size - 1);
+    uint32_t right_off = left_off + n;
+    return right_off <= chunk_size;
+}
+static inline void htp_dump_int8_line(char * pref, const int8_t * x, int n) {
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
+    for (int i = 0; i < n && p < p_end; i++) {
+        p += snprintf(p, p_end - p, "%d, ", x[i]);
+    }
+    FARF(HIGH, "%s\n", str);
+}
+static inline void htp_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) {
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
+    for (int i = 0; i < n && p < p_end; i++) {
+        p += snprintf(p, p_end - p, "%d, ", x[i]);
+    }
+    FARF(HIGH, "%s\n", str);
+}
+static inline void htp_dump_int32_line(char * pref, const int32_t * x, uint32_t n) {
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
+    for (int i = 0; i < n; i++) {
+        p += snprintf(p, p_end - p, "%d, ", (int) x[i]);
+    }
+    FARF(HIGH, "%s\n", str);
+}
+static inline void htp_dump_fp16_line(char * pref, const __fp16 * x, uint32_t n) {
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
+    for (int i = 0; i < n; i++) {
+        p += snprintf(p, p_end - p, "%.6f, ", (float) x[i]);
+    }
+    FARF(HIGH, "%s\n", str);
+}
+static inline void htp_dump_fp32_line(char * pref, const float * x, uint32_t n) {
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
+    for (int i = 0; i < n; i++) {
+        p += snprintf(p, p_end - p, "%.6f, ", x[i]);
+    }
+    FARF(HIGH, "%s\n", str);
+}
+static inline void htp_dump_f32(char * pref, const float * x, uint32_t n) {
+    uint32_t n0 = n / 16;
+    uint32_t n1 = n % 16;
+    uint32_t i = 0;
+    for (; i < n0; i++) {
+        htp_dump_fp32_line(pref, x + (16 * i), 16);
+    }
+    if (n1) {
+        htp_dump_fp32_line(pref, x + (16 * i), n1);
+    }
+}
+static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) {
+    uint32_t n0 = n / 16;
+    uint32_t n1 = n % 16;
+    uint32_t i = 0;
+    for (; i < n0; i++) {
+        htp_dump_fp16_line(pref, x + (16 * i), 16);
+    }
+    if (n1) {
+        htp_dump_fp16_line(pref, x + (16 * i), n1);
+    }
+}
+#endif /* OPS_UTILS_H */

package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c ADDED Viewed

@@ -0,0 +1,418 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+#ifdef HTP_DEBUG
+#    define FARF_HIGH 1
+#endif
+#include <HAP_farf.h>
+#include <HAP_mem.h>
+#include <HAP_perf.h>
+#include <HAP_ps.h>
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
+#include <math.h>
+#include <qurt_thread.h>
+#include <string.h>
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-dma.h"
+#include "htp-msg.h"
+#include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"
+#define htp_rope_preamble              \
+    const uint32_t ne00 = src0->ne[0]; \
+    const uint32_t ne01 = src0->ne[1]; \
+    const uint32_t ne02 = src0->ne[2]; \
+    const uint32_t ne03 = src0->ne[3]; \
+                                       \
+    const uint32_t ne0 = dst->ne[0];   \
+    const uint32_t ne1 = dst->ne[1];   \
+    const uint32_t ne2 = dst->ne[2];   \
+    const uint32_t ne3 = dst->ne[3];   \
+                                       \
+    const uint32_t nb00 = src0->nb[0]; \
+    const uint32_t nb01 = src0->nb[1]; \
+    const uint32_t nb02 = src0->nb[2]; \
+    const uint32_t nb03 = src0->nb[3]; \
+                                       \
+    const uint32_t nb0 = dst->nb[0];   \
+    const uint32_t nb1 = dst->nb[1];   \
+    const uint32_t nb2 = dst->nb[2];   \
+    const uint32_t nb3 = dst->nb[3];
+struct rope_th_ctx {
+    int32_t n_dims;
+    int32_t mode;
+    int32_t n_ctx_orig;
+    int32_t sections[4];
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+    float theta_scale;
+    float corr_dims[2];
+    struct htp_ops_context * octx;
+};
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
+    return (1 - MIN(1, MAX(0, y)));
+}
+static void rope_cache_init(const float   theta_base,
+                            float         freq_scale,
+                            const float * freq_factors,
+                            float *       corr_dims,
+                            uint32_t      ne0,
+                            float         ext_factor,
+                            float         mscale,
+                            float *       cache,
+                            float         theta_scale) {
+    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
+    float theta = theta_base;
+    for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
+        const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
+        float theta_extrap = theta / ff;
+        // Get n-d rotational scaling corrected for extrapolation
+        float theta_interp = freq_scale * theta_extrap;
+        float theta2       = theta_interp;
+        if (ext_factor != 0.0f) {
+            float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+            theta2         = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            // Get n-d magnitude scaling corrected for interpolation
+            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+        }
+        cache[i0 + 0] = cosf(theta2) * mscale;
+        cache[i0 + 1] = sinf(theta2) * mscale;
+        theta *= theta_scale;
+    }
+}
+#define M_PI 3.1415926535897932384626433
+static void rope_corr_dims(int     n_dims,
+                           int     n_ctx_orig,
+                           float   freq_base,
+                           float   beta_fast,
+                           float   beta_slow,
+                           float * dims) {
+    float start = floorf(n_dims * logf(n_ctx_orig / (beta_fast * 2 * (float) M_PI)) / (2 * logf(freq_base)));
+    float end   = ceilf(n_dims * logf(n_ctx_orig / (beta_slow * 2 * (float) M_PI)) / (2 * logf(freq_base)));
+    dims[0]     = MAX(0, start);
+    dims[1]     = MIN(n_dims - 1, end);
+}
+static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context * octx) {
+    memset(rope_ctx, 0, sizeof(struct rope_th_ctx));
+    const int32_t * op_params = &octx->op_params[0];
+    rope_ctx->n_dims     = ((const int32_t *) op_params)[1];
+    rope_ctx->mode       = ((const int32_t *) op_params)[2];
+    rope_ctx->n_ctx_orig = ((const int32_t *) op_params)[4];
+    memcpy(&rope_ctx->freq_base, (int32_t *) op_params + 5, sizeof(float));
+    memcpy(&rope_ctx->freq_scale, (int32_t *) op_params + 6, sizeof(float));
+    memcpy(&rope_ctx->ext_factor, (int32_t *) op_params + 7, sizeof(float));
+    memcpy(&rope_ctx->attn_factor, (int32_t *) op_params + 8, sizeof(float));
+    memcpy(&rope_ctx->beta_fast, (int32_t *) op_params + 9, sizeof(float));
+    memcpy(&rope_ctx->beta_slow, (int32_t *) op_params + 10, sizeof(float));
+    memcpy(&rope_ctx->sections, (int32_t *) op_params + 11, sizeof(int) * 4);
+    rope_ctx->theta_scale = powf(rope_ctx->freq_base, -2.0f / rope_ctx->n_dims);
+    rope_corr_dims(rope_ctx->n_dims, rope_ctx->n_ctx_orig, rope_ctx->freq_base, rope_ctx->beta_fast,
+                   rope_ctx->beta_slow, rope_ctx->corr_dims);
+    rope_ctx->octx = octx;
+    FARF(HIGH, "rope-f32 n_dims:%d, ext_factor:%.6f, theta_scale:%.6f, attn_factor:%.6f\n", rope_ctx->n_dims,
+         rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor);
+}
+static void hvx_calc_rope_f32(const float * restrict src0,
+                              float * restrict dst,
+                              const int num_elems,
+                              const float * restrict theta_cache) {
+    // for (int i = 0; i < num_elems; i += 2) {
+    //const float cos_theta = theta_cache[i + 0];
+    //const float sin_theta = theta_cache[i + 1];
+    //const float x0 = src[0];
+    //const float x1 = src[1];
+    //dst[0] = x0*cos_theta - x1*sin_theta;
+    //dst[1] = x0*sin_theta + x1*cos_theta;
+    //src += 2;
+    //dst += 2;
+    // }
+    const uint8_t * restrict src0_curr  = (const uint8_t *) src0;
+    const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache;
+    uint8_t * restrict dst_curr         = (uint8_t *) dst;
+    int step_of_1 = num_elems >> 6;  // 6 because we process two vectors at once
+    for (int i = 0; i < step_of_1; i++) {
+        HVX_Vector v0 = *(HVX_Vector *) src0_curr;
+        HVX_Vector v1 = *(HVX_Vector *) (src0_curr + VLEN);
+        HVX_Vector v2 = *(HVX_Vector *) theta_curr;
+        HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN);
+        HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(v1, v0, -4);  // vx0_x1[0] = x0, vx0_x1[1] = x1
+        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);  // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
+        HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+        HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+        HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
+        HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
+        HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
+        HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
+        HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
+        *(HVX_Vector *) dst_curr          = Q6_V_lo_W(vstore);
+        *(HVX_Vector *) (dst_curr + VLEN) = Q6_V_hi_W(vstore);
+        src0_curr += 2 * VLEN;
+        theta_curr += 2 * VLEN;
+        dst_curr += 2 * VLEN;
+    }
+}
+static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
+                         const uint32_t       ir0,
+                         const uint32_t       ir1,
+                         int                  nth,
+                         int                  ith,
+                         int                  opt_path) {
+    struct htp_ops_context * octx = rope_ctx->octx;
+    const struct htp_tensor * src0 = &octx->src0;
+    const struct htp_tensor * src1 = &octx->src1;
+    const struct htp_tensor * src2 = &octx->src2;
+    struct htp_tensor *       dst  = &octx->dst;
+    htp_rope_preamble;
+    const int32_t * pos = (const int32_t *) src1->data;
+    float * wp0 = (float *) (octx->src0_spad.data + (ith * nb01));
+    const float * freq_factors = NULL;
+    if (src2 != NULL) {
+        freq_factors = (const float *) src2->data;
+    }
+    int ir = 0;
+    for (uint32_t i3 = 0; i3 < ne3; i3++) {      // batch
+        for (uint32_t i2 = 0; i2 < ne2; i2++) {  // seq-len
+            const int32_t p = pos[i2];
+            rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
+                            rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);
+            for (uint32_t i1 = 0; i1 < ne1; i1++) {  // attn-heads
+                if (ir++ < ir0) {
+                    continue;
+                }
+                if (ir > ir1) {
+                    break;
+                }
+                const float * src      = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
+                float *       dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);
+                const float * src_loc      = src;
+                float *       dst_data_loc = dst_data;
+                if (1 == opt_path) {
+                    hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
+                } else {
+                    for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
+                        const float cos_theta = wp0[i0 + 0];
+                        const float sin_theta = wp0[i0 + 1];
+                        const float x0 = src_loc[0];
+                        const float x1 = src_loc[1];
+                        dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
+                        dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta;
+                        src_loc += 2;
+                        dst_data_loc += 2;
+                    }
+                }
+                for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
+                    dst_data_loc[0] = src_loc[0];
+                    dst_data_loc[1] = src_loc[1];
+                    src_loc += 2;
+                    dst_data_loc += 2;
+                }
+            }
+        }
+    }
+}
+static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int ith) {
+    struct htp_ops_context * octx = rope_ctx->octx;
+    const struct htp_tensor * src0 = &octx->src0;
+    const struct htp_tensor * src1 = &octx->src1;
+    struct htp_tensor *       dst  = &octx->dst;
+    htp_rope_preamble;
+    const uint32_t src0_nrows            = ne01 * ne02 * ne03;  // src0 rows
+    const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;
+    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
+    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
+    // no work for this thread
+    if (src0_start_row >= src0_end_row) {
+        return;
+    }
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
+    int is_aligned = 1;
+    int opt_path   = 0;
+    if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
+        (0 == htp_is_aligned((void *) dst->data, VLEN))) {
+        FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n");
+        is_aligned = 0;
+    }
+    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
+        opt_path = 1;
+    }
+    rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path);
+    t2 = HAP_perf_get_qtimer_count();
+    FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row,
+         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
+    struct rope_th_ctx * rope_ctx = (struct rope_th_ctx *) data;
+    rope_job_f32_per_thread(rope_ctx, n, i);
+}
+static int execute_op_rope_f32(struct htp_ops_context * octx) {
+    int err = HTP_STATUS_OK;
+    const struct htp_tensor * src0 = &octx->src0;
+    const struct htp_tensor * src1 = &octx->src1;
+    const struct htp_tensor * src2 = &octx->src2;
+    struct htp_tensor *       dst  = &octx->dst;
+    worker_callback_t op_func;
+    const char *      op_type = NULL;
+    struct rope_th_ctx rope_ctx;
+    switch (octx->op) {
+        case HTP_OP_ROPE:
+            op_func = rope_job_dispatcher_f32;
+            op_type = "rope-f32";
+            init_rope_ctx(&rope_ctx, octx);
+            break;
+        default:
+            FARF(ERROR, "Unsupported Op %u\n", octx->op);
+            return HTP_STATUS_NO_SUPPORT;
+    }
+    const uint32_t n_threads = octx->n_threads;
+    const size_t src0_row_size = src0->nb[1];
+    const size_t src1_row_size = src0_row_size;
+    const size_t dst_row_size  = dst->nb[1];
+    // VTCM scratchpads for all tensors
+    // N rows per thread, padded to HVX vector size
+    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * n_threads;
+    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
+    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;
+    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
+    if (src2->ne[0]) {
+        FARF(HIGH,
+             "%s: %ux%ux%ux%u (x %ux%ux%ux%u x %ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u "
+             "dst-spad-size %u\n",
+             op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
+             src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1], dst->ne[2],
+             dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
+    } else {
+        FARF(HIGH,
+             "%s: %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
+             op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
+             src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
+             octx->dst_spad.size);
+    }
+    // Make sure the reserved vtcm size is sufficient
+    if (octx->ctx->vtcm_size < spad_size) {
+        FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
+             spad_size);
+        return HTP_STATUS_VTCM_TOO_SMALL;
+    }
+    octx->src0_spad.data = octx->ctx->vtcm_base;
+    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;
+    uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
+    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
+        uint32_t n_jobs             = MIN(n_threads, src0_nrows);
+        octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
+        worker_pool_run_func(octx->ctx->worker_pool, op_func, &rope_ctx, n_jobs);
+    }
+    return err;
+}
+int op_rope(struct htp_ops_context * octx) {
+    int err = HTP_STATUS_OK;
+    switch (octx->src0.type) {
+        case HTP_TYPE_F32:
+            err = execute_op_rope_f32(octx);
+            break;
+        default:
+            err = HTP_STATUS_NO_SUPPORT;
+            break;
+    }
+    return err;
+}