whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only, and reflects the changes between package versions as they appear in their respective public registries.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
@@ -2,41 +2,52 @@
2
2
  #pragma clang diagnostic ignored "-Wunused-function"
3
3
  #pragma clang diagnostic ignored "-Wunused-but-set-variable"
4
4
 
5
- #ifdef HTP_DEBUG
6
- # define FARF_HIGH 1
7
- #endif
8
-
9
5
  #include <HAP_farf.h>
10
- #include <HAP_mem.h>
11
6
  #include <HAP_perf.h>
12
- #include <HAP_ps.h>
13
- #include <hexagon_protos.h>
14
- #include <hexagon_types.h>
7
+
15
8
  #include <math.h>
16
- #include <qurt_thread.h>
17
9
  #include <string.h>
18
10
 
11
+ #include "hex-dma.h"
12
+ #include "hvx-utils.h"
13
+
19
14
  #define GGML_COMMON_DECL_C
20
15
  #include "ggml-common.h"
21
16
  #include "htp-ctx.h"
22
- #include "htp-dma.h"
23
17
  #include "htp-msg.h"
24
18
  #include "htp-ops.h"
25
- #include "hvx-utils.h"
26
- #include "ops-utils.h"
27
19
 
28
- typedef void (*hvx_elemwise_f32_func)(const uint8_t * src0,
29
- const uint8_t * src1,
30
- uint8_t * data_dst,
31
- const int num_elems);
20
+ #ifndef MIN
21
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
22
+ #endif
32
23
 
33
- static hvx_elemwise_f32_func func_table_HVX[] = { hvx_mul_f32, hvx_add_f32, hvx_sub_f32 };
34
- static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_opt, hvx_add_f32_opt, hvx_sub_f32_opt };
24
+ // Context for binary operations
25
+ struct htp_binary_context {
26
+ struct htp_ops_context * octx;
27
+ struct fastdiv_values dim1_div;
28
+ struct fastdiv_values dim2_div;
29
+ struct fastdiv_values dim12_div;
30
+
31
+ struct fastdiv_values src1_dim1_div; // ne11
32
+ struct fastdiv_values src1_dim2_div; // ne12
33
+ struct fastdiv_values src1_dim3_div; // ne13
34
+
35
+ uint32_t nrows_per_thread;
36
+ bool split_at_ne01;
37
+ bool split_at_ne02;
38
+
39
+ // Precomputed values
40
+ uint32_t block_max;
41
+ size_t src0_row_size_aligned;
42
+ size_t src1_row_size_aligned;
43
+ size_t dst_row_size_aligned;
44
+ uint32_t src1_fetch_rows; // 1 or block_max
45
+ uint32_t src1_dma_stride; // 0 or stride
46
+ };
35
47
 
36
48
  #define htp_binary_preamble \
37
49
  const struct htp_tensor * src0 = &octx->src0; \
38
50
  const struct htp_tensor * src1 = &octx->src1; \
39
- const struct htp_tensor * src2 = &octx->src2; \
40
51
  struct htp_tensor * dst = &octx->dst; \
41
52
  \
42
53
  const uint32_t ne00 = src0->ne[0]; \
@@ -49,272 +60,752 @@ static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_opt, hvx_add_f
49
60
  const uint32_t ne12 = src1->ne[2]; \
50
61
  const uint32_t ne13 = src1->ne[3]; \
51
62
  \
52
- const uint32_t ne0 = dst->ne[0]; \
53
- const uint32_t ne1 = dst->ne[1]; \
54
- const uint32_t ne2 = dst->ne[2]; \
55
- const uint32_t ne3 = dst->ne[3]; \
56
- \
57
- const uint32_t nb00 = src0->nb[0]; \
58
63
  const uint32_t nb01 = src0->nb[1]; \
59
64
  const uint32_t nb02 = src0->nb[2]; \
60
65
  const uint32_t nb03 = src0->nb[3]; \
61
66
  \
62
- const uint32_t nb10 = src1->nb[0]; \
63
67
  const uint32_t nb11 = src1->nb[1]; \
64
68
  const uint32_t nb12 = src1->nb[2]; \
65
69
  const uint32_t nb13 = src1->nb[3]; \
66
70
  \
67
- const uint32_t nb0 = dst->nb[0]; \
68
71
  const uint32_t nb1 = dst->nb[1]; \
69
72
  const uint32_t nb2 = dst->nb[2]; \
70
- const uint32_t nb3 = dst->nb[3]; \
71
- \
72
- const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;
73
+ const uint32_t nb3 = dst->nb[3];
73
74
 
74
- static void binary_job_f32_per_thread(struct htp_ops_context * octx,
75
- uint8_t * spad_data,
76
- uint32_t nth,
77
- uint32_t ith,
78
- enum htp_op op) {
79
- htp_binary_preamble;
75
+ static inline uint32_t calc_block_size(struct htp_binary_context * bctx, uint32_t ir, uint32_t end_row,
76
+ uint32_t ne01, uint32_t ne02) {
77
+ uint32_t i03, i02, i01, rem;
78
+ i03 = fastdiv(ir, &bctx->dim12_div);
79
+ rem = ir - i03 * (ne02 * ne01);
80
+ i02 = fastdiv(rem, &bctx->dim1_div);
81
+ i01 = rem - i02 * ne01;
80
82
 
81
- const size_t src0_row_size = nb01;
82
- const size_t src1_row_size = nb11;
83
- const size_t dst_row_size = nb1;
83
+ uint32_t rows_left = end_row - ir;
84
+ uint32_t block_limit = rows_left;
84
85
 
85
- const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
86
- const uint32_t src1_nrows = ne11 * ne12 * ne13; // src1 rows
86
+ if (bctx->split_at_ne01) {
87
+ block_limit = MIN(block_limit, ne01 - i01);
88
+ }
89
+ if (bctx->split_at_ne02) {
90
+ uint32_t rows_in_plane = (ne02 * ne01) - rem;
91
+ block_limit = MIN(block_limit, rows_in_plane);
92
+ }
87
93
 
88
- const uint32_t src0_start_row = src0_nrows_per_thread * ith;
89
- const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
94
+ return MIN(bctx->block_max, block_limit);
95
+ }
90
96
 
91
- // no work for this thread
92
- if (src0_start_row >= src0_end_row) {
93
- return;
97
+ // Macro for scalar op switch
98
+ #define COMPUTE_SCALAR_OP(DST, SRC, VAL, TYPE, N) \
99
+ if(TYPE == HTP_TYPE_F32) { \
100
+ switch (octx->op) { \
101
+ case HTP_OP_ADD: hvx_add_scalar_f32_aa(DST, SRC, *(float *)VAL, N); break; \
102
+ case HTP_OP_SUB: hvx_sub_scalar_f32_aa(DST, SRC, *(float *)VAL, N); break; \
103
+ case HTP_OP_MUL: hvx_mul_scalar_f32_aa(DST, SRC, *(float *)VAL, N); break; \
104
+ case HTP_OP_DIV: hvx_mul_scalar_f32_aa(DST, SRC, 1.0f / (*(float *)VAL), N); break; \
105
+ default: break; \
106
+ } \
107
+ } \
108
+ else { \
109
+ switch (octx->op) { \
110
+ case HTP_OP_ADD: hvx_add_scalar_f16_aa(DST, SRC, *(_Float16 *)VAL, N); break; \
111
+ case HTP_OP_SUB: hvx_sub_scalar_f16_aa(DST, SRC, *(_Float16 *)VAL, N); break; \
112
+ case HTP_OP_MUL: hvx_mul_scalar_f16_aa(DST, SRC, *(_Float16 *)VAL, N); break; \
113
+ case HTP_OP_DIV: hvx_div_scalar_f16_aa(DST, SRC, *(_Float16 *)VAL, N); break; \
114
+ default: break; \
115
+ } \
94
116
  }
95
117
 
96
- uint64_t t1, t2;
97
- t1 = HAP_perf_get_qtimer_count();
98
-
99
- int is_aligned = 1;
100
- int opt_path = 0;
101
- if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
102
- (0 == htp_is_aligned((void *) dst->data, VLEN))) {
103
- FARF(HIGH, "binary-f32: unaligned addresses in elementwise op, possibly slower execution\n");
104
- is_aligned = 0;
118
+ // Macro for vector op switch (All Aligned)
119
+ #define COMPUTE_VECTOR_OP_AAA(DST, SRC0, SRC1, TYPE, N) \
120
+ if(TYPE == HTP_TYPE_F32) { \
121
+ switch (octx->op) { \
122
+ case HTP_OP_ADD: hvx_add_f32_aaa(DST, SRC0, SRC1, N); break; \
123
+ case HTP_OP_SUB: hvx_sub_f32_aaa(DST, SRC0, SRC1, N); break; \
124
+ case HTP_OP_MUL: hvx_mul_f32_aaa(DST, SRC0, SRC1, N); break; \
125
+ case HTP_OP_DIV: hvx_div_f32_aaa(DST, SRC0, SRC1, N); break; \
126
+ default: break; \
127
+ } \
128
+ } \
129
+ else { \
130
+ switch (octx->op) { \
131
+ case HTP_OP_ADD: hvx_add_f16_aaa(DST, SRC0, SRC1, N); break; \
132
+ case HTP_OP_SUB: hvx_sub_f16_aaa(DST, SRC0, SRC1, N); break; \
133
+ case HTP_OP_MUL: hvx_mul_f16_aaa(DST, SRC0, SRC1, N); break; \
134
+ case HTP_OP_DIV: hvx_div_f16_aaa(DST, SRC0, SRC1, N); break; \
135
+ default: break; \
136
+ } \
105
137
  }
106
- if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
107
- opt_path = 1;
138
+
139
+ // Macro for vector op switch (Dst Aligned, Src0 Aligned, Src1 Unaligned)
140
+ #define COMPUTE_VECTOR_OP_AAU(DST, SRC0, SRC1, TYPE, N) \
141
+ if(TYPE == HTP_TYPE_F32) { \
142
+ switch (octx->op) { \
143
+ case HTP_OP_ADD: hvx_add_f32_aau(DST, SRC0, SRC1, N); break; \
144
+ case HTP_OP_SUB: hvx_sub_f32_aau(DST, SRC0, SRC1, N); break; \
145
+ case HTP_OP_MUL: hvx_mul_f32_aau(DST, SRC0, SRC1, N); break; \
146
+ case HTP_OP_DIV: hvx_div_f32_aau(DST, SRC0, SRC1, N); break; \
147
+ default: break; \
148
+ } \
149
+ } \
150
+ else { \
151
+ switch (octx->op) { \
152
+ case HTP_OP_ADD: hvx_add_f16_aau(DST, SRC0, SRC1, N); break; \
153
+ case HTP_OP_SUB: hvx_sub_f16_aau(DST, SRC0, SRC1, N); break; \
154
+ case HTP_OP_MUL: hvx_mul_f16_aau(DST, SRC0, SRC1, N); break; \
155
+ case HTP_OP_DIV: hvx_div_f16_aau(DST, SRC0, SRC1, N); break; \
156
+ default: break; \
157
+ } \
108
158
  }
109
159
 
110
- hvx_elemwise_f32_func func_HVX = (1 == opt_path) ? func_table_HVX_opt[op] : func_table_HVX[op];
160
+ // Macro for vector op switch (All Unaligned - generic loop used in element repeat)
161
+ #define COMPUTE_VECTOR_OP_UUU(DST, SRC0, SRC1, TYPE, N) \
162
+ if(TYPE == HTP_TYPE_F32) { \
163
+ switch (octx->op) { \
164
+ case HTP_OP_ADD: hvx_add_f32_uuu(DST, SRC0, SRC1, N); break; \
165
+ case HTP_OP_SUB: hvx_sub_f32_uuu(DST, SRC0, SRC1, N); break; \
166
+ case HTP_OP_MUL: hvx_mul_f32_uuu(DST, SRC0, SRC1, N); break; \
167
+ case HTP_OP_DIV: hvx_div_f32_uuu(DST, SRC0, SRC1, N); break; \
168
+ default: break; \
169
+ } \
170
+ } \
171
+ else { \
172
+ switch (octx->op) { \
173
+ case HTP_OP_ADD: hvx_add_f16_uuu(DST, SRC0, SRC1, N); break; \
174
+ case HTP_OP_SUB: hvx_sub_f16_uuu(DST, SRC0, SRC1, N); break; \
175
+ case HTP_OP_MUL: hvx_mul_f16_uuu(DST, SRC0, SRC1, N); break; \
176
+ case HTP_OP_DIV: hvx_div_f16_uuu(DST, SRC0, SRC1, N); break; \
177
+ default: break; \
178
+ } \
179
+ }
111
180
 
112
- uint8_t * restrict spad_data_th = spad_data + (ith * src0_row_size);
181
+ // 1. Scalar src1 (ne10 == 1)
182
+ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
183
+ struct htp_binary_context * bctx = (struct htp_binary_context *) data;
184
+ struct htp_ops_context * octx = bctx->octx;
185
+ htp_binary_preamble;
113
186
 
114
- const uint8_t * restrict src0_ptr = (const uint8_t *) src0->data + (src0_start_row * src0_row_size);
115
- uint8_t * restrict dst_ptr = (uint8_t *) dst->data + (src0_start_row * dst_row_size);
187
+ const uint32_t src0_type = octx->src0.type;
188
+ const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
189
+ const uint32_t total_rows = ne01 * ne02 * ne03;
190
+ const uint32_t start_row = bctx->nrows_per_thread * ith;
191
+ const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
192
+ if (start_row >= end_row) return;
193
+
194
+ uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
195
+ uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
196
+ size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
197
+ size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
198
+
199
+ dma_queue * q = octx->ctx->dma[ith];
200
+ uint32_t ir_prefetch = start_row;
201
+ int spad_idx = 0;
202
+
203
+ // Preamble
204
+ for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
205
+ uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
206
+ uint32_t i03, i02, i01, rem;
207
+ i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
208
+ rem = ir_prefetch - i03 * (ne02 * ne01);
209
+ i02 = fastdiv(rem, &bctx->dim1_div);
210
+ i01 = rem - i02 * ne01;
211
+
212
+ uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
213
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
214
+
215
+ uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
216
+ uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
217
+
218
+ dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
219
+ dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
220
+ ir_prefetch += current_block_size;
221
+ spad_idx ^= 1;
222
+ }
116
223
 
117
- const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
224
+ // Main loop
225
+ for (uint32_t ir = start_row; ir < end_row; ) {
226
+ uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
118
227
 
119
- const uint32_t ne02_ne01 = ne02 * ne01;
228
+ uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
229
+ uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
120
230
 
121
- for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
122
- const uint32_t i03 = fastdiv(ir, &octx->src0_div21);
123
- const uint32_t i02 = fastdiv(ir - i03 * ne02_ne01, &octx->src0_div1);
124
- const uint32_t i01 = (ir - i03 * ne02_ne01 - i02 * ne01);
231
+ uint32_t i03, i02, i01, rem;
232
+ i03 = fastdiv(ir, &bctx->dim12_div);
233
+ rem = ir - i03 * (ne02 * ne01);
234
+ i02 = fastdiv(rem, &bctx->dim1_div);
235
+ i01 = rem - i02 * ne01;
125
236
 
126
- const uint32_t i13 = fastmodulo(i03, ne13, &octx->src1_div3);
127
- const uint32_t i12 = fastmodulo(i02, ne12, &octx->src1_div2);
128
- const uint32_t i11 = fastmodulo(i01, ne11, &octx->src1_div1);
237
+ // src1 indices (broadcast/repeat)
238
+ uint32_t i13 = fastmodulo(i03, ne13, &bctx->src1_dim3_div);
239
+ uint32_t i12 = fastmodulo(i02, ne12, &bctx->src1_dim2_div);
240
+ uint32_t i11 = fastmodulo(i01, ne11, &bctx->src1_dim1_div);
129
241
 
130
- const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;
242
+ uint8_t * src1_ptr = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
243
+ uint32_t s1_stride = (ne11 == 1) ? 0 : nb11;
131
244
 
132
- if (ir + 1 < src0_end_row) {
133
- htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
134
- if (src1_row_size == src0_row_size) {
135
- htp_l2fetch(src1_ptr, 1, src1_row_size, src1_row_size);
136
- }
245
+ for (uint32_t r = 0; r < current_block_size; r++) {
246
+ uint8_t * r_src0 = s0_spad + r * bctx->src0_row_size_aligned;
247
+ uint8_t * r_dst = d_spad + r * bctx->dst_row_size_aligned;
248
+ COMPUTE_SCALAR_OP(r_dst, r_src0, src1_ptr, src0_type, ne00);
249
+ src1_ptr += s1_stride;
137
250
  }
138
251
 
139
- const uint32_t nr0 = ne00 / ne10;
140
- if (nr0 > 1) {
141
- if ((1 == is_aligned) && (nr0 == ne00)) {
142
- hvx_bcast_fp32_a(spad_data_th, *(float *) src1_ptr, nr0);
143
- } else {
144
- for (uint32_t r = 0; r < nr0; r++) {
145
- memcpy(spad_data_th + r * nb11, (const uint8_t *) src1_ptr, nb11);
146
- }
147
- }
148
- func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, (uint8_t *) dst_ptr, ne00);
149
- } else {
150
- func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00);
252
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
253
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
254
+
255
+ if (ir_prefetch < end_row) {
256
+ uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
257
+ uint32_t p03, p02, p01, prem;
258
+ p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
259
+ prem = ir_prefetch - p03 * (ne02 * ne01);
260
+ p02 = fastdiv(prem, &bctx->dim1_div);
261
+ p01 = prem - p02 * ne01;
262
+ uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
263
+
264
+ dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
265
+ ir_prefetch += next_block_size;
151
266
  }
267
+ ir += current_block_size;
268
+ }
269
+ dma_queue_flush(q);
270
+ }
152
271
 
153
- src0_ptr += src0_row_size;
154
- dst_ptr += dst_row_size;
272
+ // 2. Vector Same Shape (ne1x == ne0x) or Simple Broadcast
273
+ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, void * data) {
274
+ struct htp_binary_context * bctx = (struct htp_binary_context *) data;
275
+ struct htp_ops_context * octx = bctx->octx;
276
+ htp_binary_preamble;
277
+
278
+ const uint32_t src0_type = octx->src0.type;
279
+ const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
280
+ const uint32_t total_rows = ne01 * ne02 * ne03;
281
+ const uint32_t start_row = bctx->nrows_per_thread * ith;
282
+ const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
283
+ if (start_row >= end_row) return;
284
+
285
+ uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
286
+ uint8_t * src1_spad_base = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
287
+ uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
288
+
289
+ size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
290
+ size_t src1_spad_half = octx->src1_spad.size_per_thread / 2;
291
+ size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
292
+
293
+ dma_queue * q = octx->ctx->dma[ith];
294
+ uint32_t ir_prefetch = start_row;
295
+ int spad_idx = 0;
296
+
297
+ for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
298
+ uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
299
+ uint32_t i03, i02, i01, rem;
300
+ i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
301
+ rem = ir_prefetch - i03 * (ne02 * ne01);
302
+ i02 = fastdiv(rem, &bctx->dim1_div);
303
+ i01 = rem - i02 * ne01;
304
+
305
+ uint32_t i13 = (ne13 == 1) ? 0 : i03;
306
+ uint32_t i12 = (ne12 == 1) ? 0 : i02;
307
+ uint32_t i11 = (ne11 == 1) ? 0 : i01;
308
+
309
+ uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
310
+ uint8_t * src1_base = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
311
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
312
+
313
+ uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
314
+ uint8_t * s1_spad = src1_spad_base + spad_idx * src1_spad_half;
315
+ uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
316
+
317
+ dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
318
+ dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
319
+ dma_queue_push(q, dma_make_ptr(s1_spad, src1_base), bctx->src1_row_size_aligned, bctx->src1_dma_stride, row_size_bytes, current_block_size);
320
+ ir_prefetch += current_block_size;
321
+ spad_idx ^= 1;
155
322
  }
156
323
 
157
- t2 = HAP_perf_get_qtimer_count();
324
+ for (uint32_t ir = start_row; ir < end_row; ) {
325
+ uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
326
+ uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
327
+ uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
328
+ uint8_t * s1_spad = (uint8_t *) dma_queue_pop(q).dst;
329
+
330
+ for (uint32_t r = 0; r < current_block_size; r++) {
331
+ uint8_t * r_src0 = s0_spad + r * bctx->src0_row_size_aligned;
332
+ uint8_t * r_src1 = s1_spad + r * bctx->src1_row_size_aligned;
333
+ uint8_t * r_dst = d_spad + r * bctx->dst_row_size_aligned;
334
+ COMPUTE_VECTOR_OP_AAA(r_dst, r_src0, r_src1, src0_type, ne00);
335
+ }
158
336
 
159
- FARF(HIGH, "binary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path,
160
- ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
161
- (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
337
+ uint32_t i03, i02, i01, rem;
338
+ i03 = fastdiv(ir, &bctx->dim12_div);
339
+ rem = ir - i03 * (ne02 * ne01);
340
+ i02 = fastdiv(rem, &bctx->dim1_div);
341
+ i01 = rem - i02 * ne01;
342
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
343
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
344
+
345
+ if (ir_prefetch < end_row) {
346
+ uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
347
+ uint32_t p03, p02, p01, prem;
348
+ p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
349
+ prem = ir_prefetch - p03 * (ne02 * ne01);
350
+ p02 = fastdiv(prem, &bctx->dim1_div);
351
+ p01 = prem - p02 * ne01;
352
+
353
+ uint32_t p13 = (ne13 == 1) ? 0 : p03;
354
+ uint32_t p12 = (ne12 == 1) ? 0 : p02;
355
+ uint32_t p11 = (ne11 == 1) ? 0 : p01;
356
+
357
+ uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
358
+ uint8_t * s1_next = (uint8_t *)src1->data + p13 * nb13 + p12 * nb12 + p11 * nb11;
359
+
360
+ dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
361
+ dma_queue_push(q, dma_make_ptr(s1_spad, s1_next), bctx->src1_row_size_aligned, bctx->src1_dma_stride, row_size_bytes, next_block_size);
362
+
363
+ ir_prefetch += next_block_size;
364
+ }
365
+ ir += current_block_size;
366
+ }
367
+ dma_queue_flush(q);
162
368
  }
163
369
 
164
- static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx,
165
- uint8_t * spad_data,
166
- uint32_t nth,
167
- uint32_t ith,
168
- hvx_elemwise_f32_func func_HVX) {
370
+ // 3. Row Broadcast (ne11 == 1, ne12 == 1, single row src1)
371
+ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith, void * data) {
372
+ struct htp_binary_context * bctx = (struct htp_binary_context *) data;
373
+ struct htp_ops_context * octx = bctx->octx;
169
374
  htp_binary_preamble;
170
375
 
171
- const size_t src0_row_size = nb01;
172
- const size_t src1_row_size = nb11;
173
- const size_t dst_row_size = nb1;
376
+ const uint32_t src0_type = octx->src0.type;
377
+ const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
378
+ const uint32_t total_rows = ne01 * ne02 * ne03;
379
+ const uint32_t start_row = bctx->nrows_per_thread * ith;
380
+ const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
381
+ if (start_row >= end_row) return;
174
382
 
175
- const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
383
+ uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
384
+ uint8_t * src1_spad = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
385
+ uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
176
386
 
177
- const uint32_t src0_start_row = src0_nrows_per_thread * ith;
178
- const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
387
+ size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
388
+ size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
179
389
 
180
- // no work for this thread
181
- if (src0_start_row >= src0_end_row) {
182
- return;
183
- }
390
+ dma_queue * q = octx->ctx->dma[ith];
391
+ uint32_t ir_prefetch = start_row;
392
+ int spad_idx = 0;
393
+
394
+ void * s1_ptr = (void *) src1_spad;
395
+
396
+ for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
397
+ uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
398
+ uint32_t i03, i02, i01, rem;
399
+ i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
400
+ rem = ir_prefetch - i03 * (ne02 * ne01);
401
+ i02 = fastdiv(rem, &bctx->dim1_div);
402
+ i01 = rem - i02 * ne01;
184
403
 
185
- uint64_t t1, t2;
186
- t1 = HAP_perf_get_qtimer_count();
404
+ uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
405
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
187
406
 
188
- if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
189
- (0 == htp_is_aligned((void *) dst->data, VLEN))) {
190
- FARF(HIGH, "add-id-f32: unaligned addresses, possibly slower execution\n");
407
+ uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
408
+ uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
409
+
410
+ dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
411
+ dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
412
+ ir_prefetch += current_block_size;
413
+ spad_idx ^= 1;
191
414
  }
192
415
 
193
- const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
194
- const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
195
- uint8_t * restrict data_dst = (uint8_t *) dst->data;
196
-
197
- const uint32_t ne02_ne01 = ne02 * ne01;
198
- for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
199
- // src0 indices
200
- const uint32_t i03 = fastdiv(ir, &octx->src0_div21);
201
- const uint32_t i02 = fastdiv(ir - i03 * ne02_ne01, &octx->src0_div1);
202
- const uint32_t i01 = (ir - i03 * ne02_ne01 - i02 * ne01);
203
-
204
- // src1 indices
205
- const int i11 = *(int32_t *) ((char *) src2->data + i01 * src2->nb[0] + i02 * src2->nb[1]);
206
- assert(i11 >= 0 && i11 < ne11);
207
-
208
- float * restrict dst_ptr = (float *) (data_dst + i03 * nb3 + i02 * nb2 + i01 * nb1);
209
- const float * restrict src0_ptr = (const float *) (data_src0 + i03 * nb03 + i02 * nb02 + i01 * nb01);
210
- const float * restrict src1_ptr = (const float *) (data_src1 + 0 + 0 + i11 * nb11);
211
-
212
- if (ir + 1 < src0_end_row) {
213
- htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
214
- if (src1_row_size == src0_row_size) {
215
- htp_l2fetch(src1_ptr + ne10, 1, src1_row_size, src1_row_size);
216
- }
416
+ for (uint32_t ir = start_row; ir < end_row; ) {
417
+ uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
418
+ uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
419
+ uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
420
+
421
+ for (uint32_t r = 0; r < current_block_size; r++) {
422
+ uint8_t * r_src0 = s0_spad + r * bctx->src0_row_size_aligned;
423
+ uint8_t * r_src1 = (uint8_t *)s1_ptr; // Constant
424
+ uint8_t * r_dst = d_spad + r * bctx->dst_row_size_aligned;
425
+ COMPUTE_VECTOR_OP_AAA(r_dst, r_src0, r_src1, src0_type, ne00);
217
426
  }
218
427
 
219
- const uint32_t nr0 = ne00 / ne10;
220
- if (nr0 > 1) {
221
- for (uint32_t r = 0; r < nr0; r++) {
222
- memcpy(spad_data + r * nb10, (const uint8_t *) src1_ptr, nb10);
223
- }
224
- func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data, (uint8_t *) dst_ptr, ne00);
225
- } else {
226
- func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00);
428
+ uint32_t i03, i02, i01, rem;
429
+ i03 = fastdiv(ir, &bctx->dim12_div);
430
+ rem = ir - i03 * (ne02 * ne01);
431
+ i02 = fastdiv(rem, &bctx->dim1_div);
432
+ i01 = rem - i02 * ne01;
433
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
434
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
435
+
436
+ if (ir_prefetch < end_row) {
437
+ uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
438
+ uint32_t p03, p02, p01, prem;
439
+ p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
440
+ prem = ir_prefetch - p03 * (ne02 * ne01);
441
+ p02 = fastdiv(prem, &bctx->dim1_div);
442
+ p01 = prem - p02 * ne01;
443
+ uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
444
+ dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
445
+ ir_prefetch += next_block_size;
227
446
  }
447
+ ir += current_block_size;
448
+ }
449
+ dma_queue_flush(q);
450
+ }
451
+
452
+ // 4. Vector Complex (ne10 == ne00, complex broadcast)
453
+ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void * data) {
454
+ struct htp_binary_context * bctx = (struct htp_binary_context *) data;
455
+ struct htp_ops_context * octx = bctx->octx;
456
+ htp_binary_preamble;
457
+
458
+ const uint32_t src0_type = octx->src0.type;
459
+ const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
460
+ const uint32_t total_rows = ne01 * ne02 * ne03;
461
+ const uint32_t start_row = bctx->nrows_per_thread * ith;
462
+ const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
463
+ if (start_row >= end_row) return;
464
+
465
+ uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
466
+ uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
467
+ size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
468
+ size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
469
+
470
+ dma_queue * q = octx->ctx->dma[ith];
471
+ uint32_t ir_prefetch = start_row;
472
+ int spad_idx = 0;
473
+
474
+ for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
475
+ uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
476
+ uint32_t i03, i02, i01, rem;
477
+ i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
478
+ rem = ir_prefetch - i03 * (ne02 * ne01);
479
+ i02 = fastdiv(rem, &bctx->dim1_div);
480
+ i01 = rem - i02 * ne01;
481
+
482
+ uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
483
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
484
+
485
+ uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
486
+ uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
487
+
488
+ dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
489
+ dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
490
+ ir_prefetch += current_block_size;
491
+ spad_idx ^= 1;
228
492
  }
229
493
 
230
- t2 = HAP_perf_get_qtimer_count();
494
+ for (uint32_t ir = start_row; ir < end_row; ) {
495
+ uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
496
+ uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
497
+ uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
498
+
499
+ uint32_t i03, i02, i01, rem;
500
+ i03 = fastdiv(ir, &bctx->dim12_div);
501
+ rem = ir - i03 * (ne02 * ne01);
502
+ i02 = fastdiv(rem, &bctx->dim1_div);
503
+ i01 = rem - i02 * ne01;
504
+
505
+ for (uint32_t r = 0; r < current_block_size; r++) {
506
+ uint32_t r_i01 = i01 + r;
507
+ uint32_t i13 = fastmodulo(i03, ne13, &bctx->src1_dim3_div);
508
+ uint32_t i12 = fastmodulo(i02, ne12, &bctx->src1_dim2_div);
509
+ uint32_t i11 = fastmodulo(r_i01, ne11, &bctx->src1_dim1_div);
510
+
511
+ uint8_t * r_src0 = s0_spad + r * bctx->src0_row_size_aligned;
512
+ uint8_t * r_src1 = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
513
+ uint8_t * r_dst = d_spad + r * bctx->dst_row_size_aligned;
514
+
515
+ // Read src1 from DDR (unaligned)
516
+ COMPUTE_VECTOR_OP_AAU(r_dst, r_src0, r_src1, src0_type, ne00);
517
+ }
231
518
 
232
- FARF(HIGH, "add-id-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", ith, nth,
233
- src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
234
- src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1],
235
- dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
519
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
520
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
521
+
522
+ if (ir_prefetch < end_row) {
523
+ uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
524
+ uint32_t p03, p02, p01, prem;
525
+ p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
526
+ prem = ir_prefetch - p03 * (ne02 * ne01);
527
+ p02 = fastdiv(prem, &bctx->dim1_div);
528
+ p01 = prem - p02 * ne01;
529
+ uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
530
+ dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
531
+ ir_prefetch += next_block_size;
532
+ }
533
+ ir += current_block_size;
534
+ }
535
+ dma_queue_flush(q);
236
536
  }
237
537
 
238
- static void binary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
239
- struct htp_ops_context * octx = (struct htp_ops_context *) data;
538
+ // 5. Element Repeat (ne10 != ne00)
539
+ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void * data) {
540
+ struct htp_binary_context * bctx = (struct htp_binary_context *) data;
541
+ struct htp_ops_context * octx = bctx->octx;
542
+ htp_binary_preamble;
240
543
 
241
- switch (octx->op) {
242
- case HTP_OP_MUL:
243
- case HTP_OP_ADD:
244
- case HTP_OP_SUB:
245
- binary_job_f32_per_thread(octx, octx->src1_spad.data, n, i, octx->op);
246
- break;
544
+ const uint32_t src0_type = octx->src0.type;
545
+ const uint32_t elem_size_bytes = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
546
+ const uint32_t row_size_bytes = ne00 * elem_size_bytes;;
547
+ const uint32_t total_rows = ne01 * ne02 * ne03;
548
+ const uint32_t start_row = bctx->nrows_per_thread * ith;
549
+ const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
550
+ if (start_row >= end_row) return;
551
+
552
+ uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
553
+ uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
554
+ size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
555
+ size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
556
+
557
+ dma_queue * q = octx->ctx->dma[ith];
558
+ uint32_t ir_prefetch = start_row;
559
+ int spad_idx = 0;
560
+
561
+ for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
562
+ uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
563
+ uint32_t i03, i02, i01, rem;
564
+ i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
565
+ rem = ir_prefetch - i03 * (ne02 * ne01);
566
+ i02 = fastdiv(rem, &bctx->dim1_div);
567
+ i01 = rem - i02 * ne01;
568
+
569
+ uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
570
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
571
+
572
+ uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
573
+ uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
574
+
575
+ dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
576
+ dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
577
+ ir_prefetch += current_block_size;
578
+ spad_idx ^= 1;
579
+ }
247
580
 
248
- case HTP_OP_ADD_ID:
249
- binary_add_id_job_f32_per_thread(octx, octx->src0_spad.data, n, i, hvx_add_f32);
250
- break;
581
+ for (uint32_t ir = start_row; ir < end_row; ) {
582
+ uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
583
+ uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
584
+ uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
585
+
586
+ uint32_t i03, i02, i01, rem;
587
+ i03 = fastdiv(ir, &bctx->dim12_div);
588
+ rem = ir - i03 * (ne02 * ne01);
589
+ i02 = fastdiv(rem, &bctx->dim1_div);
590
+ i01 = rem - i02 * ne01;
591
+
592
+ for (uint32_t r = 0; r < current_block_size; r++) {
593
+ uint32_t r_i01 = i01 + r;
594
+ uint32_t i13 = fastmodulo(i03, ne13, &bctx->src1_dim3_div);
595
+ uint32_t i12 = fastmodulo(i02, ne12, &bctx->src1_dim2_div);
596
+ uint32_t i11 = fastmodulo(r_i01, ne11, &bctx->src1_dim1_div);
597
+
598
+ uint8_t * r_src0 = s0_spad + r * bctx->src0_row_size_aligned;
599
+ uint8_t * r_src1_row = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
600
+ uint8_t * r_dst = d_spad + r * bctx->dst_row_size_aligned;
601
+
602
+ // Repeat src1 row
603
+ for (uint32_t c = 0; c < ne00; c += ne10) {
604
+ uint32_t len = MIN(ne10, ne00 - c);
605
+ // Use UUU for speed and simplicity
606
+ COMPUTE_VECTOR_OP_UUU(r_dst + c * elem_size_bytes, r_src0 + c * elem_size_bytes, r_src1_row, src0_type, len);
607
+ }
608
+ }
251
609
 
252
- default:
253
- FARF(ERROR, "Unknown Binary Op %u", octx->op);
254
- break;
610
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
611
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
612
+
613
+ if (ir_prefetch < end_row) {
614
+ uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
615
+ uint32_t p03, p02, p01, prem;
616
+ p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
617
+ prem = ir_prefetch - p03 * (ne02 * ne01);
618
+ p02 = fastdiv(prem, &bctx->dim1_div);
619
+ p01 = prem - p02 * ne01;
620
+ uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
621
+ dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
622
+ ir_prefetch += next_block_size;
623
+ }
624
+ ir += current_block_size;
255
625
  }
626
+ dma_queue_flush(q);
256
627
  }
257
628
 
258
- static int execute_op_binary_f32(struct htp_ops_context * octx) {
259
- int err = HTP_STATUS_OK;
629
+ // 6. ADD_ID (src1 gathered via src2 indices)
630
+ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
631
+ struct htp_binary_context * bctx = (struct htp_binary_context *) data;
632
+ struct htp_ops_context * octx = bctx->octx;
260
633
 
261
634
  const struct htp_tensor * src0 = &octx->src0;
262
635
  const struct htp_tensor * src1 = &octx->src1;
636
+ const struct htp_tensor * src2 = &octx->src2;
263
637
  struct htp_tensor * dst = &octx->dst;
264
638
 
265
- worker_callback_t binary_op_func;
266
- const char * op_type = NULL;
267
-
268
- switch (octx->op) {
269
- case HTP_OP_MUL:
270
- binary_op_func = binary_job_dispatcher_f32;
271
- op_type = "mul-f32";
272
- break;
273
-
274
- case HTP_OP_ADD:
275
- binary_op_func = binary_job_dispatcher_f32;
276
- op_type = "add-f32";
277
- break;
278
-
279
- case HTP_OP_SUB:
280
- binary_op_func = binary_job_dispatcher_f32;
281
- op_type = "sub-f32";
282
- break;
283
-
284
- case HTP_OP_ADD_ID:
285
- binary_op_func = binary_job_dispatcher_f32;
286
- op_type = "add-id-f32";
287
- break;
288
-
289
- default:
290
- FARF(ERROR, "Unsupported binary-Op %u\n", octx->op);
291
- return HTP_STATUS_NO_SUPPORT;
639
+ const uint32_t ne00 = src0->ne[0];
640
+ const uint32_t ne01 = src0->ne[1];
641
+ const uint32_t ne02 = src0->ne[2];
642
+ const uint32_t ne03 = src0->ne[3];
643
+ const uint32_t ne11 = src1->ne[1]; // for bounds check
644
+
645
+ const uint32_t nb01 = src0->nb[1];
646
+ const uint32_t nb02 = src0->nb[2];
647
+ const uint32_t nb03 = src0->nb[3];
648
+ const uint32_t nb11 = src1->nb[1]; // src1 row stride
649
+ const uint32_t nb1 = dst->nb[1];
650
+ const uint32_t nb2 = dst->nb[2];
651
+ const uint32_t nb3 = dst->nb[3];
652
+
653
+ const uint32_t total_rows = ne01 * ne02 * ne03;
654
+ const uint32_t start_row = bctx->nrows_per_thread * ith;
655
+ const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
656
+ if (start_row >= end_row) return;
657
+
658
+ uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
659
+ uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
660
+ size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
661
+ size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
662
+
663
+ dma_queue * q = octx->ctx->dma[ith];
664
+ uint32_t ir_prefetch = start_row;
665
+ int spad_idx = 0;
666
+
667
+ for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
668
+ uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
669
+ uint32_t i03, i02, i01, rem;
670
+ i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
671
+ rem = ir_prefetch - i03 * (ne02 * ne01);
672
+ i02 = fastdiv(rem, &bctx->dim1_div);
673
+ i01 = rem - i02 * ne01;
674
+
675
+ uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
676
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
677
+
678
+ uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
679
+ uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
680
+
681
+ dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
682
+ dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), current_block_size);
683
+ ir_prefetch += current_block_size;
684
+ spad_idx ^= 1;
292
685
  }
293
686
 
294
- const int n_threads = octx->n_threads;
687
+ for (uint32_t ir = start_row; ir < end_row; ) {
688
+ uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
689
+ uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
690
+ uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
691
+
692
+ uint32_t i03, i02, i01, rem;
693
+ i03 = fastdiv(ir, &bctx->dim12_div);
694
+ rem = ir - i03 * (ne02 * ne01);
695
+ i02 = fastdiv(rem, &bctx->dim1_div);
696
+ i01 = rem - i02 * ne01;
697
+
698
+ for (uint32_t r = 0; r < current_block_size; r++) {
699
+ uint32_t r_i01 = i01 + r; // linear within block since we split at ne01
700
+
701
+ const int32_t idx = *(int32_t *)((char *)src2->data + r_i01 * src2->nb[0] + i02 * src2->nb[1]);
702
+
703
+ uint8_t * r_src1 = (uint8_t *)src1->data + idx * nb11;
704
+ uint8_t * r_src0 = s0_spad + r * bctx->src0_row_size_aligned;
705
+ uint8_t * r_dst = d_spad + r * bctx->dst_row_size_aligned;
706
+
707
+ hvx_add_f32_aau(r_dst, r_src0, r_src1, ne00);
708
+ }
709
+
710
+ uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
711
+ dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, ne00 * sizeof(float), current_block_size);
712
+
713
+ if (ir_prefetch < end_row) {
714
+ uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
715
+ uint32_t p03, p02, p01, prem;
716
+ p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
717
+ prem = ir_prefetch - p03 * (ne02 * ne01);
718
+ p02 = fastdiv(prem, &bctx->dim1_div);
719
+ p01 = prem - p02 * ne01;
720
+ uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
721
+ dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), next_block_size);
722
+ ir_prefetch += next_block_size;
723
+ }
724
+ ir += current_block_size;
725
+ }
726
+ dma_queue_flush(q);
727
+ }
728
+
729
+ static int execute_op_binary(struct htp_ops_context * octx) {
730
+ const struct htp_tensor * src0 = &octx->src0;
731
+ const struct htp_tensor * src1 = &octx->src1;
732
+ struct htp_tensor * dst = &octx->dst;
733
+
295
734
  const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
735
+ const uint32_t n_threads = MIN(octx->n_threads, src0_nrows);
736
+
737
+ // Use packed row sizes for VTCM allocation
738
+ const uint32_t src0_type = octx->src0.type;
739
+ const size_t elem_size = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
740
+ const size_t src0_row_size = src0->ne[0] * elem_size;
741
+ const size_t src1_row_size = src1->ne[0] * elem_size;
742
+ const size_t dst_row_size = dst->ne[0] * elem_size;
743
+
744
+ // Align to VLEN
745
+ const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
746
+ const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
747
+ size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
748
+
749
+ bool is_add_id = (octx->op == HTP_OP_ADD_ID);
750
+ bool is_scalar = !is_add_id && (src1->ne[0] == 1);
751
+
752
+ // Determine which kernel we will use to alloc memory and dispatch
753
+ bool use_vector_same = !is_add_id && !is_scalar && ((src0->nb[1] % VLEN) == 0) && (src1->ne[0] == src0->ne[0]) &&
754
+ (src1->ne[1] == src0->ne[1] || src1->ne[1] == 1) &&
755
+ (src1->ne[2] == src0->ne[2] || src1->ne[2] == 1) &&
756
+ (src1->ne[3] == src0->ne[3] || src1->ne[3] == 1);
757
+
758
+ bool is_row_bcast = use_vector_same && (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1);
759
+ bool use_complex = !is_add_id && !is_scalar && !use_vector_same && (src1->ne[0] == src0->ne[0]);
760
+ bool use_repeat = !is_add_id && !is_scalar && !use_vector_same && (src1->ne[0] != src0->ne[0]);
761
+
762
+ size_t spad_row_total;
763
+ if (is_scalar) {
764
+ spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
765
+ } else if (is_row_bcast) {
766
+ spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
767
+ } else if (use_vector_same) {
768
+ spad_row_total = 2 * (src0_row_size_aligned + src1_row_size_aligned + dst_row_size_aligned);
769
+ } else if (is_add_id) {
770
+ spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned); // src1 read directly
771
+ } else {
772
+ spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
773
+ }
296
774
 
297
- const size_t src0_row_size = src0->nb[1];
298
- const size_t src1_row_size = src1->nb[1];
299
- const size_t dst_row_size = dst->nb[1];
775
+ size_t rows_per_buffer = octx->ctx->vtcm_size / (n_threads * spad_row_total);
776
+ // Adjust for static src1 in row_bcast case
777
+ if (is_row_bcast) {
778
+ size_t needed_static = src1_row_size_aligned;
779
+ if (octx->ctx->vtcm_size < needed_static) return HTP_STATUS_VTCM_TOO_SMALL;
780
+ size_t avail = octx->ctx->vtcm_size - needed_static;
781
+ rows_per_buffer = avail / (n_threads * spad_row_total);
782
+ }
300
783
 
301
- // VTCM scratchpads for all tensors
302
- octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads;
303
- octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
304
- octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;
784
+ if (rows_per_buffer < 1) {
785
+ FARF(ERROR, "binary: VTCM too small\n");
786
+ return HTP_STATUS_VTCM_TOO_SMALL;
787
+ }
305
788
 
306
- size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
789
+ octx->src0_spad.size_per_thread = rows_per_buffer * 2 * src0_row_size_aligned;
790
+ octx->dst_spad.size_per_thread = rows_per_buffer * 2 * dst_row_size_aligned;
307
791
 
308
- FARF(HIGH,
309
- "%s: (%ux%ux%ux%u) * (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
310
- op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
311
- src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
312
- octx->dst_spad.size);
792
+ if (is_scalar || use_complex || use_repeat || is_add_id) {
793
+ octx->src1_spad.size_per_thread = 0;
794
+ } else if (is_row_bcast) {
795
+ octx->src1_spad.size_per_thread = 0;
796
+ } else {
797
+ octx->src1_spad.size_per_thread = rows_per_buffer * 2 * src1_row_size_aligned;
798
+ }
313
799
 
314
- // Make sure the reserved vtcm size is sufficient
315
- if (octx->ctx->vtcm_size < spad_size) {
316
- FARF(ERROR, "binary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type,
317
- octx->ctx->vtcm_size, spad_size);
800
+ octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
801
+ if (is_row_bcast) {
802
+ octx->src1_spad.size = src1_row_size_aligned;
803
+ } else {
804
+ octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
805
+ }
806
+ octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
807
+
808
+ if (octx->ctx->vtcm_size < (octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size)) {
318
809
  return HTP_STATUS_VTCM_TOO_SMALL;
319
810
  }
320
811
 
@@ -322,39 +813,79 @@ static int execute_op_binary_f32(struct htp_ops_context * octx) {
322
813
  octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
323
814
  octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
324
815
 
325
- if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
326
- uint32_t n_jobs = MIN(n_threads, src0_nrows);
816
+ if ((octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
817
+ return HTP_STATUS_OK;
818
+ }
819
+
820
+ dma_queue * q = octx->ctx->dma[0];
821
+ if (is_row_bcast) {
822
+ dma_queue_push(q, dma_make_ptr(octx->src1_spad.data, (const void *) src1->data), src1_row_size_aligned, 0, src1->ne[0] * elem_size, 1);
823
+ }
824
+
825
+ struct htp_binary_context bctx;
826
+ bctx.octx = octx;
827
+ bctx.nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
828
+ bctx.block_max = rows_per_buffer;
829
+ bctx.src0_row_size_aligned = src0_row_size_aligned;
830
+ bctx.src1_row_size_aligned = src1_row_size_aligned;
831
+ bctx.dst_row_size_aligned = dst_row_size_aligned;
327
832
 
328
- octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
833
+ bctx.dim1_div = init_fastdiv_values(src0->ne[1]);
834
+ bctx.dim2_div = init_fastdiv_values(src0->ne[2]);
835
+ bctx.dim12_div = init_fastdiv_values(src0->ne[1] * src0->ne[2]);
329
836
 
330
- octx->src0_div21 = init_fastdiv_values(src0->ne[2] * src0->ne[1]);
331
- octx->src0_div3 = init_fastdiv_values(src0->ne[3]);
332
- octx->src0_div2 = init_fastdiv_values(src0->ne[2]);
333
- octx->src0_div1 = init_fastdiv_values(src0->ne[1]);
837
+ bctx.src1_dim1_div = init_fastdiv_values(src1->ne[1]);
838
+ bctx.src1_dim2_div = init_fastdiv_values(src1->ne[2]);
839
+ bctx.src1_dim3_div = init_fastdiv_values(src1->ne[3]);
334
840
 
335
- octx->src1_div21 = init_fastdiv_values(src1->ne[2] * src1->ne[1]);
336
- octx->src1_div3 = init_fastdiv_values(src1->ne[3]);
337
- octx->src1_div2 = init_fastdiv_values(src1->ne[2]);
338
- octx->src1_div1 = init_fastdiv_values(src1->ne[1]);
841
+ bool src0_contig_dim1 = (src0->nb[2] == src0->ne[1] * src0->nb[1]);
842
+ bool dst_contig_dim1 = (dst->nb[2] == src0->ne[1] * dst->nb[1]);
339
843
 
340
- worker_pool_run_func(octx->ctx->worker_pool, binary_op_func, octx, n_jobs);
844
+ bool src0_contig_dim2 = (src0->nb[3] == src0->ne[2] * src0->nb[2]);
845
+ bool dst_contig_dim2 = (dst->nb[3] == src0->ne[2] * dst->nb[2]);
846
+
847
+ bctx.split_at_ne01 = (src0->ne[2] > 1) &&
848
+ ((src1->ne[1] > 1) || (src1->ne[2] > 1) || !src0_contig_dim1 || !dst_contig_dim1);
849
+
850
+ bctx.split_at_ne02 = (src0->ne[3] > 1) &&
851
+ ((src1->ne[2] > 1) || (src1->ne[3] > 1) || !src0_contig_dim2 || !dst_contig_dim2);
852
+
853
+ // Precompute specific kernel parameters
854
+ if (use_vector_same) {
855
+ bctx.src1_dma_stride = (src1->ne[1] == 1) ? 0 : src1->nb[1];
856
+ bctx.src1_fetch_rows = (src1->ne[1] == 1) ? 1 : rows_per_buffer;
857
+ }
858
+
859
+ worker_callback_t worker_func;
860
+ if (is_add_id) worker_func = binary_job_add_id;
861
+ else if (is_scalar) worker_func = binary_job_scalar;
862
+ else if (is_row_bcast) worker_func = binary_job_vector_row_broadcast;
863
+ else if (use_vector_same) worker_func = binary_job_vector_same_shape;
864
+ else if (use_complex) worker_func = binary_job_vector_complex;
865
+ else worker_func = binary_job_element_repeat;
866
+
867
+ if (is_row_bcast) {
868
+ dma_queue_pop(q);
341
869
  }
342
870
 
343
- return err;
871
+ worker_pool_run_func(octx->ctx->worker_pool, worker_func, &bctx, n_threads);
872
+
873
+ return HTP_STATUS_OK;
344
874
  }
345
875
 
346
876
  int op_binary(struct htp_ops_context * octx) {
347
- int err = HTP_STATUS_OK;
348
877
 
349
- switch (octx->src0.type) {
350
- case HTP_TYPE_F32:
351
- err = execute_op_binary_f32(octx);
352
- break;
878
+ // Does not support permutations of src1
879
+ const struct htp_tensor * src1 = &octx->src1;
880
+ if (src1->nb[1] < src1->nb[0]) {
881
+ return HTP_STATUS_NO_SUPPORT;
882
+ }
353
883
 
354
- default:
355
- err = HTP_STATUS_NO_SUPPORT;
356
- break;
884
+ const uint32_t src0_type = octx->src0.type;
885
+ if ((src0_type == HTP_TYPE_F32) || (src0_type == HTP_TYPE_F16)) {
886
+ return execute_op_binary(octx);
357
887
  }
358
888
 
359
- return err;
889
+ return HTP_STATUS_NO_SUPPORT;
360
890
  }
891
+