RubyGems - whispercpp - Versions diffs - 1.3.6 → 1.3.7 - Mend

whispercpp 1.3.6 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (828) hide show

data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp CHANGED Viewed

@@ -7,10 +7,17 @@
 #include <atomic>
 #include <chrono>
-#include <cstddef>
 #include <mutex>
+#include <thread>
+#include <cstddef>
 #include <stdexcept>
 #include <string>
+#include <sstream>
+#include <iomanip>
+#include <unordered_set>
+#include <unordered_map>
+#include <regex>
+#include <queue>
 #ifdef _WIN32
 #    include <sal.h>
@@ -32,23 +39,38 @@
 #include "ggml-hexagon.h"
 #include "ggml-impl.h"
 #include "ggml-quants.h"
-#include "op-desc.h"
-#include "htp-msg.h"
+#include "htp-opnode.h"
+#include "htp-ops.h"
 #include "htp_iface.h"
 #include "htp-drv.h"
-static size_t opt_ndev         = 1;
-static size_t opt_nhvx         = 0; // use all
-static int    opt_arch         = 0; // autodetect
-static int    opt_etm          = 0;
-static int    opt_verbose      = 0;
-static int    opt_profile      = 0;
-static int    opt_hostbuf      = 1; // hostbuf ON by default
-static int    opt_experimental = 0;
+using intvec  = std::vector<int>; // shorthand vector aliases
+using uintvec = std::vector<unsigned int>;
+using u32vec  = std::vector<uint32_t>;
+static int    opt_arch    = 0; // autodetect
+static size_t opt_ndev    = 1;
+static size_t opt_nhvx    = 0; // use all
+static int    opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
+static size_t opt_vmem    = HTP_OP_MAX_VMEM_DEFAULT;  // max available va space for buffer mappings
+static size_t opt_mbuf    = 1ul * 1024 * 1024 * 1024; // max buffer size
+static int    opt_etm     = 0;
+static int    opt_verbose = 0; // non-zero enables HEX_VERBOSE output; levels above 2 also dump packed blocks
+static int    opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
+static int    opt_hostbuf = 1; // hostbuf ON by default
+// Default PMU events, if profiling with PMU (mode=2) is enabled
+// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
+//     https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
+static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
 // Enable all stages by default
-static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE;
-static int opt_opsync = 0;  // synchronous ops
+static int opt_opstage  = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE; // bitmask of enabled op stages
+static int opt_opbatch  = 1024; // max number of ops in a batch
+static int opt_opqueue  = 16;   // max number of pending batches
+static int opt_oppoll   = 0;    // polling for batch completions
+static std::regex* opt_opfilter = NULL; // regex of ops to not claim
 // Verbose-only debug logging: forwards its arguments to GGML_LOG_DEBUG when opt_verbose is non-zero.
 // Wrapped in do/while(0) so the macro expands to exactly one statement and cannot capture the
 // `else` of an enclosing unbraced `if`. Call sites must end with a semicolon.
 #define HEX_VERBOSE(...)                 \
     do {                                 \
         if (opt_verbose) {               \
             GGML_LOG_DEBUG(__VA_ARGS__); \
         }                                \
     } while (0)
@@ -80,47 +102,45 @@ static const char * status_to_str(uint32_t status) {
 // ** debug helpers
-static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
+static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) { // verbose-only: logs one "execute-op" line for `node`
     if (!opt_verbose) return;
-    op_desc desc(op);
+    htp_opformat fmt(node); // supplies the names/dims/types/strides/buffs strings printed below
     GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
-                ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
+                node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags);
 }
 static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) { // verbose-only: logs one "supports-op" line ending in yes/no
     if (!opt_verbose) return;
-    op_desc desc(op);
-    GGML_LOG_DEBUG("ggml-hex: %s supports-op %s : %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
-                ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
+    htp_opformat fmt(htp_opformat(htp_opnode{const_cast<ggml_tensor*>(op), {}, HTP_OP_INVALID})); // wraps the bare tensor in a temporary htp_opnode (HTP_OP_INVALID) just for formatting
+    GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
+                ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
 }
-static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
-                                      uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
+static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
+                                      uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) { // profiling-only: logs one "profile-op" line with usec and cycles for `node`
     if (!opt_profile) return;
-    op_desc desc(op);
-    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
-                ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
-                op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
+    char pmu_str[256] = ""; // stays empty unless opt_profile > 1; eight 10-digit values plus punctuation fit well within 256 bytes
+    if (opt_profile > 1) { // pmu[] is only read in this mode and must then hold 8 counters
+        static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
+        sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
+                pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
+    }
+    htp_opformat fmt(node);
+    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
+            node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
 }
 // ** backend sessions
-struct ggml_hexagon_session {
-    ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
-    ~ggml_hexagon_session() noexcept(true);
-    void allocate(int dev_id) noexcept(false);
-    void release() noexcept(true);
-    void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
-    void flush();
-    ggml_backend_buffer_type buffer_type        = {};
-    ggml_backend_buffer_type repack_buffer_type = {};
+struct ggml_hexagon_opbatch;
+struct ggml_hexagon_opqueue;
+struct htp_opnode;
+struct ggml_hexagon_session {
     std::string      name;
     remote_handle64  handle;
     dspqueue_t       queue;
@@ -132,87 +152,28 @@ struct ggml_hexagon_session {
     bool             valid_handle;
     bool             valid_queue;
     bool             valid_iface;
-    std::atomic<int> op_pending;
-    uint32_t         prof_usecs;
-    uint32_t         prof_cycles;
-    uint32_t         prof_pkts;
-};
-void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
-    // Bump pending flag (cleared in the session::flush once we get the response)
-    this->op_pending++;  // atomic inc
-    int err = dspqueue_write(this->queue,
-                             0,                       // flags - the framework will autoset this
-                             n_bufs,                  // number of buffers
-                             bufs,                    // buffer references
-                             sizeof(req),             // Message length
-                             (const uint8_t *) &req,  // Message
-                             DSPQUEUE_TIMEOUT         // Timeout
-    );
-    if (err != 0) {
-        GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
-    }
-    if (sync) {
-        flush();
-    }
-}
-// Flush HTP response queue i.e wait for all outstanding requests to complete
-void ggml_hexagon_session::flush() {
-    dspqueue_t q = this->queue;
-    // Repeatedly read packets from the queue until it's empty. We don't
-    // necessarily get a separate callback for each packet, and new packets
-    // may arrive while we're processing the previous one.
-    while (this->op_pending) {
-        struct htp_general_rsp rsp;
-        uint32_t               rsp_size;
-        uint32_t               flags;
-        struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
-        uint32_t               n_bufs;
-        // Read response packet from queue
-        int err = dspqueue_read(q, &flags,
-                                HTP_MAX_PACKET_BUFFERS,  // Maximum number of buffer references
-                                &n_bufs,                 // Number of buffer references
-                                bufs,                    // Buffer references
-                                sizeof(rsp),             // Max message length
-                                &rsp_size,               // Message length
-                                (uint8_t *) &rsp,        // Message
-                                DSPQUEUE_TIMEOUT);       // Timeout
+    std::atomic<int>      op_pending;
+    ggml_hexagon_opbatch* op_batch;
+    ggml_hexagon_opqueue* op_queue;
-        if (err == AEE_EEXPIRED) {
-            // TODO: might need to bail out if the HTP is stuck on something
-            continue;
-        }
+    ggml_backend_buffer_type buffer_type        = {};
+    ggml_backend_buffer_type repack_buffer_type = {};
-        if (err != 0) {
-            GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
-        }
+    ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
+    ~ggml_hexagon_session() noexcept(true);
-        // Basic sanity checks
-        if (rsp_size != sizeof(rsp)) {
-            GGML_ABORT("ggml-hex: dspcall : bad response (size)\n");
-        }
+    const char* c_name() const { return name.c_str(); }
-        if (rsp.status != HTP_STATUS_OK) {
-            GGML_LOG_ERROR("ggml-hex: dspcall : dsp-rsp: %s\n", status_to_str(rsp.status));
-            // TODO: handle errors
-        }
+    void allocate(int dev_id) noexcept(false);
+    void release() noexcept(true);
-        // TODO: update profiling implementation, currently only works for opt_opsync mode
-        this->prof_usecs  = rsp.prof_usecs;
-        this->prof_cycles = rsp.prof_cycles;
-        this->prof_pkts   = rsp.prof_pkts;
+    void enqueue_op(const htp_opnode & node);
+    void flush(bool all = true);
-        this->op_pending--;  // atomic dec
-    }
-}
+    void flush_pending(bool all = false);
+    void flush_batch();
+};
 // ** backend buffers
@@ -226,82 +187,94 @@ struct ggml_backend_hexagon_buffer_type_context {
     std::string            name;
 };
-struct ggml_backend_hexagon_buffer_context {
-    bool mmap_to(ggml_hexagon_session * s) {
-        HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n",
-                    s->name.c_str(), (void *) this->base, s->domain_id, s->session_id, this->size, this->fd,
-                    (int) this->repack);
+// Shared (rpcmem) buffer mapped into a session's domain.
+// Owns the allocation: alloc() allocates and maps it, free() unmaps and releases it, and the
+// destructor calls free(). The constructor allocates immediately and may throw std::runtime_error;
+// because a throwing constructor never runs the destructor, alloc() releases whatever it had
+// acquired before rethrowing.
+struct ggml_hexagon_shared_buffer {
+    ggml_hexagon_session * sess;    // session whose domain the buffer is mapped into
+    uint8_t *              base;    // rpcmem allocation (NULL when not allocated)
+    size_t                 size;    // allocation size in bytes (0 until alloc() succeeds)
+    int                    fd;      // file descriptor of the allocation (-1 when unset)
+    bool                   mapped;  // fastrpc_mmap() succeeded and has not been undone
+    bool                   pinned;  // selects FASTRPC_MAP_FD instead of FASTRPC_MAP_FD_DELAYED
+    // Map the buffer into the session's domain. Throws std::runtime_error on failure.
+    void mmap() {
+        fastrpc_map_flags flags = this->pinned ? FASTRPC_MAP_FD : FASTRPC_MAP_FD_DELAYED;
-        int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD);
+        int err = fastrpc_mmap(sess->domain_id, this->fd, (void *) this->base, 0, this->size, flags);
         if (err != 0) {
-            GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n",
-                    s->domain_id, this->size, this->fd, (unsigned) err);
-            return false;
+            GGML_LOG_ERROR("ggml-hex: %s buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", sess->c_name(),
+                    sess->domain_id, this->size, this->fd, (unsigned) err);
+            throw std::runtime_error("ggml-hex: fastrpc_mmap failed (see log for details)");
         }
-        return true;
-    }
+        HEX_VERBOSE("ggml-hex: %s mapped buffer: base %p size %zu fd %d pinned %u\n",
+                sess->c_name(), (void *) this->base, this->size, this->fd, pinned);
-    bool mmap() {
-        if (this->mapped) {
-            return true;
-        }
-        if (!mmap_to(this->sess)) {
-            return false;
-        }
         this->mapped = true;
-        return true;
     }
-    void munmap() {
-        if (!this->mapped) {
-            return;
+    // Undo mmap(); no-op when the buffer is not mapped. Resets fd to -1.
+    void unmap() {
+        if (!this->mapped) return;
+        if (!this->pinned) {
+            // HTP might still hold a reference, tell it to drop it
+            htp_iface_munmap(sess->handle, this->fd);
         }
-        fastrpc_munmap(this->sess->domain_id, this->fd, this->base, this->size);
+        fastrpc_munmap(sess->domain_id, this->fd, (void *) this->base, this->size);
+        HEX_VERBOSE("ggml-hex: %s unmapped buffer: base %p size %zu fd %d\n", sess->c_name(),
+                (void *) this->base, size, this->fd);
         this->mapped = false;
+        this->fd     = -1;
     }
-    ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
-        size += 4 * 1024;  // extra page for padding
+    // Allocate `size` bytes of rpcmem and map them; no-op when already allocated.
+    // Throws std::runtime_error on failure, leaving the object unallocated (base == NULL).
+    void alloc(size_t size) {
+        if (this->base) return;
-        this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
+        this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size);
         if (!this->base) {
-            GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
+            GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->c_name(), size);
             throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
         }
         this->fd = rpcmem_to_fd(this->base);
         if (this->fd < 0) {
-            GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base);
+            GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->c_name(), (void *) this->base);
             rpcmem_free(this->base);
             this->base = NULL;
             throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)");
         }
+        this->size = size;
-        HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(),
-                    (void *) this->base, size, this->fd, (int) repack);
+        HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d pinned %d\n", sess->c_name(),
+                    (void *) this->base, this->size, this->fd, (int) pinned);
+        try {
+            mmap();
+        } catch (...) {
+            // Release the allocation here: when alloc() is running inside the constructor,
+            // the destructor will not be invoked for this object.
+            rpcmem_free(this->base);
+            this->base = NULL;
+            this->fd   = -1;
+            this->size = 0;
+            throw;
+        }
+    }
+    // Unmap and release the allocation; no-op when nothing is allocated.
+    void free() {
+        if (!this->base) return;
+        unmap();
+        rpcmem_free(this->base);
+        HEX_VERBOSE("ggml-hex: %s freed buffer: base %p size %zu fd %d\n", sess->c_name(),
+                (void *) this->base, size, this->fd);
+        this->base = NULL;
+    }
+    // Allocates and maps `size` bytes right away; throws std::runtime_error on failure.
+    ggml_hexagon_shared_buffer(ggml_hexagon_session * sess, size_t size, bool pinned = false) {
         this->sess   = sess;
-        this->size   = size;
+        this->size   = 0;
+        this->base   = nullptr;
+        this->fd     = -1;
         this->mapped = false;
-        this->repack = repack;
-    }
+        this->pinned = pinned;
-    ~ggml_backend_hexagon_buffer_context() {
-        munmap();
-        if (this->base) {
-            rpcmem_free(this->base);
-            this->base = NULL;
-        }
+        alloc(size);
     }
-    ggml_hexagon_session * sess;  // primary session
-    uint8_t *              base;
-    size_t                 size;
-    int                    fd;
-    bool                   mapped;  // mmap is done
-    bool                   repack;  // repacked buffer
+    ~ggml_hexagon_shared_buffer() {
+        free();
+    }
 };
 static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
@@ -309,30 +282,26 @@ static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_
 }
 static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) { // deletes the shared buffer object stored in buffer->context
-    auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
-    delete ctx;
+    auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
+    delete sbuf; // the destructor calls free(), which unmaps and releases the memory
 }
 static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) { // returns the base address of the shared buffer stored in buffer->context
-    auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
-    return ctx->base;
+    auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
+    return sbuf->base;
 }
 static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { // init-tensor hook: only logs (when verbose) and always reports success
-    auto ctx  = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
-    auto sess = ctx->sess;
+    auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
+    auto sess = sbuf->sess;
-    HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(),
-                tensor->name, (void *) ctx->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage,
-                (int) ctx->repack);
+    HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d\n", sess->c_name(),
+                tensor->name, (void *) sbuf->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage);
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        ; // nothing to do for the view
-    } else {
-        if (!ctx->mapped) {
-            ctx->mmap();
-        }
+        return GGML_STATUS_SUCCESS; // nothing to do for the view
     }
     return GGML_STATUS_SUCCESS; // non-view tensors need no work here either; the lazy mmap that used to live here is gone
 }
@@ -460,7 +429,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
         d[7]          = x[i * 8 + 7].d;
     }
-    if (opt_verbose > 1) {
+    if (opt_verbose > 2) {
         for (int i = 0; i < nb; i++) {
             dump_packed_block_q4x4x2(y, i, k);
         }
@@ -479,7 +448,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
     const uint8_t * y_q = y + 0;              // quants first
     const uint8_t * y_d = y + qrow_size;      // then scales
-    if (opt_verbose > 1) {
+    if (opt_verbose > 2) {
         for (int i = 0; i < nb; i++) {
             dump_packed_block_q4x4x2(y, i, k);
         }
@@ -583,7 +552,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
     // Ensure we don't try to read more data than is available in the source buffer 'data'
     // or write more than the tensor can hold.
@@ -644,7 +613,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
     // Ensure we don't try to copy more data than the tensor actually contains.
     const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -693,6 +662,239 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
     ggml_aligned_free(buf_rp, row_size_rp);
 }
+// Expand the packed 4-bit quants of one q4_1 block into one byte per quant.
+// Slot `bi` of `qs` (QK4_1 bytes wide) receives the low nibbles in its first half
+// and the high nibbles in its second half.
+static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) {
+    static const int qk = QK4_1;
+    uint8_t * lo_half = qs + bi * qk;        // quants taken from the low nibbles
+    uint8_t * hi_half = lo_half + qk / 2;    // quants taken from the high nibbles
+    for (unsigned int idx = 0; idx < qk / 2; ++idx) {
+        const auto packed = x->qs[idx];
+        lo_half[idx] = (packed & 0x0F);
+        hi_half[idx] = (packed >> 4);
+    }
+}
+// Inverse of unpack_q4_1_quants: fold slot `bi` of `qs` (one byte per quant) back into
+// the packed nibbles of block `x`. First-half bytes become low nibbles, second-half
+// bytes become high nibbles.
+static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi) {
+    static const int qk = QK4_1;
+    const uint8_t * lo_half = qs + bi * qk;
+    const uint8_t * hi_half = lo_half + qk / 2;
+    for (unsigned int idx = 0; idx < qk / 2; ++idx) {
+        const uint8_t lo = lo_half[idx];
+        const uint8_t hi = hi_half[idx];
+        x->qs[idx] = lo | (hi << 4);
+    }
+}
+static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) { /*
+     * Repack one row of k q4_1 elements from x into the x4x2 layout in y:
+     * all packed quants first, then (starting at byte offset k/2) the per-block
+     * (d, m) pairs. x is read in groups of 8 blocks for each of the nb rounded-up
+     * groups, so it must hold nb * 8 blocks.
+     */
+    static const int qk = QK_Q4_0x4x2;
+    const int        nb = (k + qk - 1) / qk;  // number of blocks (padded)
+    const int        nloe = k % qk;           // leftovers
+    const int dblk_size = 8 * 4;              // 8x (d, m) __fp16 = 32 bytes
+    const int qblk_size = qk / 2;             // int4 = 128 bytes
+    const int qrow_size = k / 2;              // int4 (not padded to blocks)
+    uint8_t * y_q = y + 0;                    // quants first
+    uint8_t * y_d = y + qrow_size;            // then scales/offsets
+    // Repack the quants
+    for (int i = 0; i < nb; i++) {
+        uint8_t qs[QK_Q4_0x4x2];  // unpacked quants
+        unpack_q4_1_quants(qs, &x[i * 8 + 0], 0);
+        unpack_q4_1_quants(qs, &x[i * 8 + 1], 1);
+        unpack_q4_1_quants(qs, &x[i * 8 + 2], 2);
+        unpack_q4_1_quants(qs, &x[i * 8 + 3], 3);
+        unpack_q4_1_quants(qs, &x[i * 8 + 4], 4);
+        unpack_q4_1_quants(qs, &x[i * 8 + 5], 5);
+        unpack_q4_1_quants(qs, &x[i * 8 + 6], 6);
+        unpack_q4_1_quants(qs, &x[i * 8 + 7], 7);
+        bool partial = (nloe && i == nb-1); // last group of a row whose length is not a multiple of qk
+        uint8_t * q = y_q + (i * qblk_size);
+        for (int j = 0; j < qk / 2; j++) { // full group: byte j = quant j (low nibble) | quant j+128 (high nibble); partial group: adjacent quants 2j and 2j+1
+            q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
+        }
+    }
+    // Repack the scales and offsets
+    for (int i = 0; i < nb; i++) {
+        ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size);
+        for (int j = 0; j < 8; j++) { // interleave as d0, m0, d1, m1, ...
+            d_m[j * 2 + 0] = x[i * 8 + j].d;
+            d_m[j * 2 + 1] = x[i * 8 + j].m;
+        }
+    }
+}
+static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) { /*
+     * Inverse of repack_row_q4_1x4x2: rebuild q4_1 blocks in x from one x4x2 row in y
+     * (packed quants first, then the (d, m) pairs starting at byte offset k/2).
+     * Writes 8 blocks for each of the nb rounded-up groups, so x must have room
+     * for nb * 8 blocks.
+     */
+    static const int qk = QK_Q4_0x4x2;
+    const int        nb = (k + qk - 1) / qk;  // number of blocks (padded)
+    const int        nloe = k % qk;           // leftovers
+    const int dblk_size = 8 * 4;              // 8x (d, m) __fp16 = 32 bytes
+    const int qblk_size = qk / 2;             // int4 = 128 bytes
+    const int qrow_size = k / 2;              // int4 (not padded to blocks)
+    const uint8_t * y_q = y + 0;              // quants first
+    const uint8_t * y_d = y + qrow_size;      // then scales/offsets
+    // Unpack the quants
+    for (int i = 0; i < nb; i++) {
+        uint8_t qs[QK_Q4_0x4x2];
+        bool partial = (nloe && i == nb-1); // last group of a row whose length is not a multiple of qk
+        const uint8_t * q = y_q + (i * qblk_size);
+        for (int j = 0; j < qk / 2; j++) {
+            if (partial) { // partial group stores adjacent quants 2j and 2j+1 in one byte
+                qs[j*2+0] = q[j] & 0x0F;
+                qs[j*2+1] = q[j] >> 4;
+            } else { // full group stores quant j (low nibble) and quant j+128 (high nibble)
+                qs[j+000] = q[j] & 0x0F;
+                qs[j+128] = q[j] >> 4;
+            }
+        }
+        pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
+        pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
+        pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
+        pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
+        pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
+        pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
+        pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
+        pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
+    }
+    // Unpack the scales and offsets
+    for (int i = 0; i < nb; i++) {
+        const ggml_half * d_m = (const ggml_half *) (y_d + i * dblk_size);
+        for (int j = 0; j < 8; j++) { // stored interleaved as d0, m0, d1, m1, ...
+            x[i * 8 + j].d = d_m[j * 2 + 0];
+            x[i * 8 + j].m = d_m[j * 2 + 1];
+        }
+    }
+}
+// Zero-initialize a padded row of q4_1 blocks: every block in each of the nb
+// rounded-up groups of 8 gets all-zero quants, scale (d) and offset (m).
+static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) {
+    static const int qk = QK_Q4_0x4x2;
+    const int        nb = (k + qk - 1) / qk;  // number of blocks (padded)
+    uint8_t zero_qs[QK_Q4_0x4x2];  // unpacked quants, all zero
+    memset(zero_qs, 0, sizeof(zero_qs));
+    for (int grp = 0; grp < nb; grp++) {
+        for (int sub = 0; sub < 8; sub++) {
+            block_q4_1 * blk = &x[grp * 8 + sub];
+            pack_q4_1_quants(blk, zero_qs, sub);
+            blk->d = 0;
+            blk->m = 0;
+        }
+    }
+}
+// Repack plain q4_1 rows from `data` into the tensor's x4x2 storage (t->data).
+//   t    - destination tensor; row geometry is taken from t->type and t->ne[0]
+//   data - source rows in q4_1 layout
+//   size - number of source bytes available; clamped to the tensor's total size
+// Whole rows are repacked one at a time through two scratch buffers. A trailing partial
+// row is zero-padded before repacking and only its first n_rem_bytes are stored.
+static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
+    int64_t nrows = ggml_nrows(t);
+    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
+    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+    void * buf_pd = ggml_aligned_malloc(row_size_pd);
+    GGML_ASSERT(buf_pd != NULL);
+    void * buf_rp = ggml_aligned_malloc(row_size_rp);
+    GGML_ASSERT(buf_rp != NULL);
+    // int64_t is not `long` on every target (e.g. LLP64 Windows), so print through long long
+    HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %lldx%lld row-size %zu\n", t->name, data, size,
+                (long long) t->ne[0], (long long) nrows, row_size);
+    init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);  // zero the padded tail once; full rows only overwrite the first row_size bytes
+    for (int64_t i = 0; i < n_full_rows; i++) {
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
+        memcpy(buf_pd, src, row_size);
+        repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
+        memcpy(dst, buf_rp, row_size);
+    }
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
+        init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);  // clear leftovers of the previous row before the short copy
+        memcpy(buf_pd, src, n_rem_bytes);
+        repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
+        memcpy(dst, buf_rp, n_rem_bytes);
+    }
+    ggml_aligned_free(buf_pd, row_size_pd);
+    ggml_aligned_free(buf_rp, row_size_rp);
+}
+// Convert the tensor's x4x2 storage (t->data) back into plain q4_1 rows written to `data`.
+//   data - destination buffer for q4_1 rows
+//   t    - source tensor; row geometry is taken from t->type and t->ne[0]
+//   size - number of bytes the caller wants; clamped to the tensor's total size
+// Whole rows are unpacked one at a time through two scratch buffers. For a trailing
+// partial row the full source row is unpacked and only its first n_rem_bytes are copied out.
+static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) {
+    int64_t nrows = ggml_nrows(t);
+    size_t row_size    = ggml_row_size(t->type, t->ne[0]);
+    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
+    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
+    const int64_t n_full_rows = n_bytes_to_copy / row_size;
+    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;
+    void * buf_pd = ggml_aligned_malloc(row_size_pd);
+    GGML_ASSERT(buf_pd != NULL);
+    void * buf_rp = ggml_aligned_malloc(row_size_rp);
+    GGML_ASSERT(buf_rp != NULL);
+    // int64_t is not `long` on every target (e.g. LLP64 Windows), so print through long long
+    HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %lldx%lld row-size %zu\n", t->name, data, size,
+                (long long) t->ne[0], (long long) nrows, row_size);
+    memset(buf_rp, 0, row_size_rp);  // clear-out padded buffer to make sure the tail is all zeros
+    for (int64_t i = 0; i < n_full_rows; i++) {
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) data + (i * row_size);
+        memcpy(buf_rp, src, row_size);
+        unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
+        memcpy(dst, buf_pd, row_size);
+    }
+    if (n_rem_bytes > 0) {
+        const int64_t i = n_full_rows;
+        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
+        uint8_t *       dst = (uint8_t *) data + (i * row_size);
+        // We still need to read and unpack the entire source row because quantization is block-based.
+        memcpy(buf_rp, src, row_size);
+        unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
+        memcpy(dst, buf_pd, n_rem_bytes);
+    }
+    ggml_aligned_free(buf_pd, row_size_pd);
+    ggml_aligned_free(buf_rp, row_size_rp);
+}
 // ======== Q8x4x2 ====================
 static void dump_block_q8_0(const block_q8_0 * b, int i) {
     HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
@@ -795,7 +997,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
         d[7]          = x[i * 8 + 7].d;
     }
-    if (opt_verbose > 1) {
+    if (opt_verbose > 2) {
         for (int i = 0; i < nb; i++) {
             dump_packed_block_q8x4x2(y, i, k);
         }
@@ -813,7 +1015,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
     const uint8_t * y_q = y + 0;              // quants first
     const uint8_t * y_d = y + qrow_size;      // then scales
-    if (opt_verbose > 1) {
+    if (opt_verbose > 2) {
         for (int i = 0; i < nb; i++) {
             dump_packed_block_q8x4x2(y, i, k);
         }
@@ -909,7 +1111,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size quants + scales)
     // Ensure we don't try to read more data than is available in the source buffer 'data'
     // or write more than the tensor can hold.
@@ -970,7 +1172,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size quants + scales)
     // Ensure we don't try to copy more data than the tensor actually contains.
     const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1148,7 +1350,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
         e[7]        = x[i * 8 + 7].e;
     }
-    if (opt_verbose > 1) {
+    if (opt_verbose > 2) {
         for (int i = 0; i < nb; i++) {
             dump_packed_block_mxfp4x4x2(y, i, k);
         }
@@ -1167,7 +1369,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
     const uint8_t * y_q = y + 0;              // quants first
     const uint8_t * y_e = y + qrow_size;      // then scales
-    if (opt_verbose > 1) {
+    if (opt_verbose > 2) {
         for (int i = 0; i < nb; i++) {
             dump_packed_block_mxfp4x4x2(y, i, k);
         }
@@ -1271,7 +1473,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
     // Ensure we don't try to read more data than is available in the source buffer 'data'
     // or write more than the tensor can hold.
@@ -1332,7 +1534,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
     size_t row_size    = ggml_row_size(t->type, t->ne[0]);
     size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2));  // extra elements for the pad
-    size_t row_size_rp = row_size * 2;  // extra space for tmp pad (if any)
+    size_t row_size_rp = row_size_pd;  // scratch must hold one full padded tile (qblk_size/2 quants + scales)
     // Ensure we don't try to copy more data than the tensor actually contains.
     const size_t total_tensor_size = (size_t)nrows * row_size;
@@ -1386,11 +1588,10 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                    const void *          data,
                                                    size_t                offset,
                                                    size_t                size) {
-    auto ctx  = (ggml_backend_hexagon_buffer_context *) buffer->context;
-    auto sess = ctx->sess;
+    auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
+    auto sess = sbuf->sess;
-    HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
-                offset, size);
+    HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
@@ -1399,10 +1600,23 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
             repack_q4_0_q4x4x2(tensor, data, size);
             break;
-        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q4_1:
             GGML_ASSERT(offset == 0);
             GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
-            repack_q8_0_q8x4x2(tensor, data, size);
+            repack_q4_1_q4x4x2(tensor, data, size);
+            break;
+        case GGML_TYPE_Q8_0:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            repack_q8_0_q8x4x2(tensor, data, size);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            // IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
+            repack_q4_0_q4x4x2(tensor, data, size);
             break;
         case GGML_TYPE_MXFP4:
@@ -1422,11 +1636,10 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                                    void *                data,
                                                    size_t                offset,
                                                    size_t                size) {
-    auto ctx  = (ggml_backend_hexagon_buffer_context *) buffer->context;
-    auto sess = ctx->sess;
+    auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
+    auto sess = sbuf->sess;
-    HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
-                offset, size);
+    HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
@@ -1435,12 +1648,24 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
             repack_q4x4x2_q4_0(data, tensor, size);
             break;
+        case GGML_TYPE_Q4_1:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            repack_q4x4x2_q4_1(data, tensor, size);
+            break;
         case GGML_TYPE_Q8_0:
             GGML_ASSERT(offset == 0);
             GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
             repack_q8x4x2_q8_0(data, tensor, size);
             break;
+        case GGML_TYPE_IQ4_NL:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            repack_q4x4x2_q4_0(data, tensor, size);
+            break;
         case GGML_TYPE_MXFP4:
             GGML_ASSERT(offset == 0);
             GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@@ -1464,10 +1689,10 @@ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t      bu
 }
 static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    auto ctx  = (ggml_backend_hexagon_buffer_context *) buffer->context;
-    auto sess = ctx->sess;
-    HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size);
-    memset(ctx->base, value, ctx->size);
+    auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
+    auto sess = sbuf->sess;
+    HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->c_name(), (void *) sbuf->base, sbuf->size);
+    memset(sbuf->base, value, sbuf->size);
 }
 static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
@@ -1477,6 +1702,8 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
     /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_hexagon_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_hexagon_buffer_get_tensor,
+    /* .set_tensor_2d   = */ NULL,
+    /* .get_tensor_2d   = */ NULL,
     /* .cpy_tensor      = */ ggml_backend_hexagon_buffer_cpy_tensor,
     /* .clear           = */ ggml_backend_hexagon_buffer_clear,
     /* .reset           = */ NULL,
@@ -1492,10 +1719,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
             ggml_backend_buffer_type_t buffer_type, size_t size) {
     auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
     try {
-        ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
-        return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
+        size += 4 * 1024;  // guard page
+        ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
+        return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
     } catch (const std::exception & exc) {
-        GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
+        GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (host): %s\n", sess->c_name(), exc.what());
         return nullptr;
     }
 }
@@ -1504,10 +1732,11 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe
             ggml_backend_buffer_type_t buffer_type, size_t size) {
     auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
     try {
-        ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
-        return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
+        size += 4 * 1024;  // guard page
+        ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
+        return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
     } catch (const std::exception & exc) {
-        GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
+        GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (repack): %s\n", sess->c_name(), exc.what());
         return nullptr;
     }
 }
@@ -1522,7 +1751,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffe
 }
 static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
-    return 1 * 1024 * 1024 * 1024;  // 1GB per buffer
+    return opt_mbuf; // typically 1GB per buffer
     GGML_UNUSED(buffer_type);
 }
@@ -1554,6 +1783,448 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
     /* .is_host          = */ ggml_backend_hexagon_repack_buffer_type_is_host,
 };
+// Host-side staging area for one batch of ops that is sent to the HTP in a single request.
+// Buffers and tensors referenced by the staged ops are de-duplicated into flat descriptor tables
+// (h_bufs / h_tens / h_ops); ops refer to tensors, and tensors to buffers, by table index.
+// Expected call sequence (see ggml_hexagon_session::enqueue_op): fit_op() first and, when it
+// returns false, flush the batch before calling add_op().
+struct ggml_hexagon_opbatch {
+    ggml_hexagon_session*            sess;
+    std::vector<htp_opnode>          ops;       // htp_opnode of ops
+    std::vector<htp_buf_desc>        h_bufs;    // htp buffer descriptors
+    std::vector<htp_tensor>          h_tens;    // htp tensor descriptors
+    std::vector<htp_op_desc>         h_ops;     // htp op descriptors
+    std::unordered_map<int, int>                b_map; // buffer fd   to index
+    std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr  to index
+    std::unordered_multimap<void*, int>         d_map; // tensor data to index
+    unsigned int n_bufs;     // num buffers in the batch
+    unsigned int n_tens;     // num tensors ...
+    unsigned int n_ops;      // num ops ...
+    size_t       b_vmem;     // sum of all buffer sizes
+    unsigned int n_bufs_max; // capacity of h_bufs
+    unsigned int n_tens_max; // capacity of h_tens
+    unsigned int n_ops_max;  // capacity of ops / h_ops
+    size_t       b_vmem_max; // upper bound for b_vmem
+    // Forget everything staged so far. The descriptor vectors keep their size; only the counters
+    // and the lookup maps are cleared, so stale entries are simply overwritten by later adds.
+    void reset() {
+        n_bufs = 0;
+        n_tens = 0;
+        n_ops  = 0;
+        b_vmem = 0;
+        b_map.clear();
+        t_map.clear();
+        d_map.clear();
+    }
+    // sess       : owning session (used for log prefixes)
+    // batch_size : max number of ops per batch
+    // max_vmem   : max sum of buffer sizes a single batch may reference
+    ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size, size_t max_vmem) {
+        this->sess = sess;
+        n_bufs_max = HTP_OP_MAX_BUFS;
+        n_ops_max  = batch_size;
+        // worst case: every op contributes one dst plus HTP_OP_MAX_INPUTS distinct sources
+        n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
+        b_vmem_max = max_vmem;
+        ops.resize(n_ops_max);
+        h_bufs.resize(n_bufs_max);
+        h_tens.resize(n_tens_max);
+        h_ops.resize(n_ops_max);
+        b_map.reserve(n_bufs_max);
+        t_map.reserve(n_tens_max);
+        d_map.reserve(n_tens_max);
+        GGML_LOG_INFO("ggml-hex: %s op batching: n-bufs %u n-tensors %u n-ops %u vmem %zu\n",
+                sess->c_name(), n_bufs_max, n_tens_max, n_ops_max, b_vmem_max);
+        reset();
+    }
+    bool empty() const { return n_ops == 0; }
+    // add buffer and return its index (buffers are de-duplicated by fd)
+    int add_buffer(ggml_hexagon_shared_buffer * sbuf) {
+        // Lookup by fd
+        auto it = b_map.find(sbuf->fd);
+        if (it != b_map.end()) { return it->second; }
+        // Add new buffer to the batch.
+        // fit_op() admits an op as long as the total stays <= n_bufs_max, and h_bufs holds n_bufs_max
+        // entries, so a completely full table is legal (same bound style as add_tensor()).
+        int bi = n_bufs++;
+        GGML_ASSERT(n_bufs <= n_bufs_max);
+        b_map.insert({sbuf->fd, bi});
+        htp_buf_desc &b = h_bufs[bi];
+        b.base = (uint64_t) sbuf->base;
+        b.fd   = sbuf->fd;
+        b.size = sbuf->size;
+        b_vmem += b.size;
+        HEX_VERBOSE("ggml-hex: add-buffer #%u : fd %d base %p size %zu : vmem %zu\n", bi, b.fd, (void*) sbuf->base, (size_t) b.size, b_vmem);
+        return bi;
+    }
+    // true when the staged descriptor has exactly the same extents (ne) and strides (nb) as t
+    bool same_shape(const htp_tensor * h, const ggml_tensor * t) const {
+        return (h->ne[0] == t->ne[0]) && (h->ne[1] == t->ne[1]) && (h->ne[2] == t->ne[2]) && (h->ne[3] == t->ne[3]) &&
+               (h->nb[0] == t->nb[0]) && (h->nb[1] == t->nb[1]) && (h->nb[2] == t->nb[2]) && (h->nb[3] == t->nb[3]);
+    }
+    // add tensor and return its index
+    int add_tensor(const ggml_tensor * t) {
+        auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
+        // First lookup by tensor data: a different ggml_tensor that points at the same data with the
+        // same extents/strides reuses the descriptor that is already staged
+        auto range = d_map.equal_range(t->data);
+        for (auto it = range.first; it != range.second; ++it) {
+            htp_tensor * h = &h_tens[it->second];
+            if (same_shape(h, t)) { return it->second; }
+        }
+        // Lookup by tensor ptr
+        auto it = t_map.find(t);
+        if (it != t_map.end()) { return it->second; }
+        // Add new tensor to the batch
+        int ti = n_tens++;
+        GGML_ASSERT(n_tens <= n_tens_max);
+        t_map.insert({t,       ti});
+        d_map.insert({t->data, ti});
+        // the descriptor stores the tensor location as an offset from the base of its buffer
+        uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
+        size_t   t_size   = ggml_nbytes(t);
+        htp_tensor &h = h_tens[ti];
+        h.bi    = add_buffer(sbuf);
+        h.data  = t_offset;
+        h.size  = t_size;
+        h.type  = t->type;
+        h.ne[0] = t->ne[0]; h.ne[1] = t->ne[1]; h.ne[2] = t->ne[2]; h.ne[3] = t->ne[3];
+        h.nb[0] = t->nb[0]; h.nb[1] = t->nb[1]; h.nb[2] = t->nb[2]; h.nb[3] = t->nb[3];
+        h.flags = 0;
+        if (ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+            h.flags |= HTP_TENSOR_COMPUTE;
+        }
+        HEX_VERBOSE("ggml-hex: add-tensor #%u %s : bi %d data %p offset %zu size %zu flags 0x%x : %zu:%zu:%zu:%zu\n",
+                ti, t->name, h.bi, (void*) t->data, (size_t) t_offset, t_size, h.flags,
+                (size_t) t->ne[0], (size_t) t->ne[1], (size_t) t->ne[2], (size_t) t->ne[3]);
+        return ti;
+    }
+    // Check whether node can be added without exceeding the op, tensor, buffer or vmem limits.
+    // NOTE: the estimate is conservative. Tensors are only matched by pointer (add_tensor() may
+    // also reuse a descriptor via its data pointer), and a not-yet-staged buffer shared by several
+    // new tensors of the same op is counted once per tensor.
+    bool fit_op(const htp_opnode & node) const {
+        if (n_ops >= n_ops_max ) return false;
+        // check how much extras we will need
+        size_t extra_bufs = 0;
+        size_t extra_vmem = 0;
+        size_t extra_tens = 0;
+        auto fit_tensor = [&](const ggml_tensor *t) {
+            if (!t) return;
+            if (!t_map.count(t)) {
+                extra_tens++;
+                auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
+                if (!b_map.count(sbuf->fd)) {
+                    extra_vmem += sbuf->size;
+                    extra_bufs += 1;
+                }
+            }
+        };
+        for (const auto * src : node.get_inputs()) {
+            fit_tensor(src);
+        }
+        fit_tensor(node.dst());
+        if ((extra_bufs + n_bufs) > n_bufs_max) return false;
+        if ((extra_tens + n_tens) > n_tens_max) return false;
+        if ((extra_vmem + b_vmem) > b_vmem_max) return false;
+        return true;
+    }
+    // assumes that fit_op() was called first and returned true
+    void add_op(const htp_opnode & node) {
+        // Add new op
+        unsigned int n = n_ops++;
+        GGML_ASSERT(n_ops <= n_ops_max);
+        ops[n] = node;
+        htp_op_desc &o = h_ops[n];
+        memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params));
+        o.opcode = node.opcode;
+        o.flags  = 0;
+        if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
+            o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
+        }
+        ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags);
+        auto inputs = node.get_inputs();
+        // missing / null inputs are encoded as index 0xffff
+        for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
+            o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff;
+        }
+        o.dst = add_tensor(node.dst());
+    }
+};
+// Fixed-depth ring of batch slots backed by one shared buffer.
+// Each slot (shm_blk_size bytes) holds, in this order: buffer descriptors, tensor descriptors,
+// op descriptors and one htp_prof_desc per op. push() claims a free slot and fills in the
+// descriptors; pop() returns the slot to the free list and optionally dumps profiling data.
+struct ggml_hexagon_opqueue {
+    // Shared buffer for storing batches
+    ggml_hexagon_shared_buffer *shm_buf;
+    size_t                      shm_blk_size;
+    using opvec = std::vector<htp_opnode>;
+    std::queue<unsigned int>    done;       // completed batch ids
+    std::vector<opvec>          op_cache;   // per batch op cache
+    std::vector<uint64_t>       start_usec; // per batch start time
+    // sess       : owning session
+    // batch_size : max number of ops per batch (determines the slot size)
+    // depth      : number of slots, i.e. max number of batches in flight
+    ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
+        size_t n_bufs    = HTP_OP_MAX_BUFS;
+        size_t n_ops     = batch_size;
+        size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
+        shm_blk_size = sizeof(htp_buf_desc)  * n_bufs    +
+                       sizeof(htp_tensor)    * n_tensors +
+                       sizeof(htp_op_desc)   * n_ops     +
+                       sizeof(htp_prof_desc) * n_ops;
+        shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
+        op_cache.resize(depth);
+        start_usec.resize(depth, 0);
+        // init done queue
+        for (unsigned int i = 0; i < depth; i++) { done.push(i); }
+        if (opt_verbose) {
+            GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
+                    sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
+        }
+    }
+    // shm_buf is owned through a raw pointer, so a copy would delete it twice
+    ggml_hexagon_opqueue(const ggml_hexagon_opqueue &)             = delete;
+    ggml_hexagon_opqueue & operator=(const ggml_hexagon_opqueue &) = delete;
+    ~ggml_hexagon_opqueue() {
+        delete shm_buf;
+    }
+    // push new batch
+    // Fills req and dbuf for the claimed slot, copies the descriptors of op_batch into it and
+    // resets op_batch. Returns false, leaving all arguments untouched, when no slot is free.
+    bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
+        static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
+        static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
+        static_assert(sizeof(htp_buf_desc)    % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
+        static_assert(sizeof(htp_tensor)      % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
+        static_assert(sizeof(htp_op_desc)     % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
+        static_assert(sizeof(htp_prof_desc)   % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
+        if (done.empty()) { return false; }
+        req.id        = done.front(); done.pop(); // batch id
+        req.n_bufs    = op_batch->n_bufs;
+        req.n_tensors = op_batch->n_tens;
+        req.n_ops     = op_batch->n_ops;
+        // keep only the ops that are part of this batch (op_batch->ops is sized for a full batch)
+        op_cache[req.id].assign(op_batch->ops.begin(), op_batch->ops.begin() + req.n_ops);
+        start_usec[req.id] = ggml_time_us();
+        const size_t b_size = sizeof(htp_buf_desc)  * req.n_bufs;
+        const size_t t_size = sizeof(htp_tensor)    * req.n_tensors;
+        const size_t o_size = sizeof(htp_op_desc)   * req.n_ops;
+        const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
+        dbuf.ptr      = shm_buf->base + (req.id * shm_blk_size);
+        dbuf.fd       = shm_buf->fd;
+        dbuf.flags    = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
+        dbuf.offset   = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
+        dbuf.size     = b_size + t_size + o_size + p_size;
+        GGML_ASSERT(dbuf.size <= shm_blk_size);
+        // the profile region (p_size bytes after the op descriptors) is not written by the host
+        uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
+        uint8_t * b_ptr = m_ptr; m_ptr += b_size;
+        uint8_t * t_ptr = m_ptr; m_ptr += t_size;
+        uint8_t * o_ptr = m_ptr;
+        memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
+        memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
+        memcpy(o_ptr, (void *) op_batch->h_ops.data(),  o_size);
+        HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
+                shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
+                b_size, t_size, o_size, (size_t) dbuf.size);
+        op_batch->reset();
+        if (opt_verbose > 1) {
+            htp_buf_desc *b = (htp_buf_desc*) b_ptr;
+            for (unsigned int i=0; i < req.n_bufs; i++) {
+                GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
+                            b[i].fd, (void *) b[i].base, (size_t) b[i].size);
+            }
+            htp_tensor *t = (htp_tensor*) t_ptr;
+            for (unsigned int i=0; i < req.n_tensors; i++) {
+                GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
+                            shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
+                            (size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
+            }
+        }
+        return true;
+    }
+    // Retire the batch identified by rsp.id: its slot becomes available to push() again and, when
+    // profiling is enabled, the per-op profile descriptors found in dbuf are dumped.
+    void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
+        GGML_ASSERT(rsp.id < op_cache.size());
+        done.push(rsp.id);
+        const size_t b_size = sizeof(htp_buf_desc)  * rsp.n_bufs;
+        const size_t t_size = sizeof(htp_tensor)    * rsp.n_tensors;
+        const size_t o_size = sizeof(htp_op_desc)   * rsp.n_ops;
+        const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
+        const size_t m_size = b_size + t_size + o_size + p_size;
+        GGML_ASSERT(m_size <= shm_blk_size);
+        HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
+                shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
+                (size_t) dbuf.size, b_size, t_size, o_size);
+        // profile descriptors follow the buffer, tensor and op descriptors inside the slot
+        uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
+        uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
+        if (opt_profile && rsp.n_ops > 0) {
+            auto & ops = op_cache[rsp.id];
+            uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
+            uint32_t htp_usec   = 0;
+            GGML_ASSERT(rsp.n_ops <= ops.size());
+            const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
+            for (uint32_t i = 0; i < rsp.n_ops; i++) {
+                htp_usec += pd[i].usecs;
+                ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
+            }
+            GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
+                           shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
+        }
+    }
+};
+// Drain the HTP response queue: loops while batches are outstanding (op_pending != 0); waits for every one of them when 'all' is true, otherwise returns after the first response
+void ggml_hexagon_session::flush_pending(bool all) {
+    while (this->op_pending) {
+        struct htp_opbatch_rsp rsp;
+        uint32_t               rsp_size;
+        uint32_t               flags;
+        struct dspqueue_buffer dbuf;
+        uint32_t               n_dbufs;
+        // Read one response packet from the queue (zero timeout when opt_oppoll is set)
+        const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT;
+        int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo);
+        if (err == AEE_EEXPIRED) {
+            continue; // presumably the read timed out without a packet: try again
+        }
+        if (err != 0) {
+            GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
+        }
+        // Basic sanity checks: exactly one full-size response message with one attached buffer
+        if (rsp_size != sizeof(rsp) || n_dbufs != 1) {
+            GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
+        }
+        if (rsp.status != HTP_STATUS_OK) {
+            GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
+            // TODO: handle errors (a failed batch is currently only logged and then retired like a successful one)
+        }
+        op_queue->pop(rsp, dbuf); // return the batch slot to the op-queue (dumps profiling data when enabled)
+        this->op_pending--;  // atomic dec
+        if (!all) break; // caller only needs a single response
+    }
+}
+// Submit the currently staged op batch to the HTP request queue; does nothing when the batch is empty.
+// When every op-queue slot is still in flight, waits for one response to free a slot first.
+// Aborts if no slot can be obtained or if the queue write fails.
+void ggml_hexagon_session::flush_batch() {
+    if (op_batch->empty()) { return; }
+    htp_opbatch_req req {};
+    dspqueue_buffer dbuf{};
+    if (!op_queue->push(req, dbuf, op_batch)) {
+        // No free slot: retire one outstanding batch, then retry
+        flush_pending(false);
+        if (!op_queue->push(req, dbuf, op_batch)) {
+            // Without a slot req/dbuf are still zero-initialized; sending them would submit a bogus batch
+            GGML_ABORT("ggml-hex: %s op-queue push failed: no free batch slots\n", this->c_name());
+        }
+    }
+    // Bump pending counter (decremented in session::flush_pending once we get the response)
+    this->op_pending++;  // atomic inc
+    HEX_VERBOSE("ggml-hex: %s queue-opbatch: %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
+    int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
+    if (err != 0) {
+        GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
+    }
+}
+// Stage one op for execution; the current batch is submitted first when the op does not fit into it
+void ggml_hexagon_session::enqueue_op(const htp_opnode & node) {
+    const bool has_room = op_batch->fit_op(node);
+    if (!has_room) { flush_batch(); }
+    op_batch->add_op(node);
+}
+// Submit the staged batch (if any), then wait for responses: every outstanding batch when 'all' is true, otherwise at most one
+void ggml_hexagon_session::flush(bool all) {
+    flush_batch();
+    flush_pending(all);
+}
+// Probe how much memory can be mapped for this session by allocating pinned shared buffers until
+// an allocation throws. All probe buffers are released again before returning.
+// Returns the total size successfully allocated minus one 256 MiB step, or 0 if that total does
+// not exceed one step (e.g. when not even the first allocation succeeded).
+static size_t ggml_hexagon_measure_max_vmem(ggml_hexagon_session *sess) {
+    // Allocate a bunch of pinned buffers till failure.
+    // This is kind of expensive but handy for figuring out exactly how much we can mmap on a specific device.
+    // Typically we're going to allocate all/most of these buffers anyway for the model weights.
+    std::vector<ggml_hexagon_shared_buffer *> sbufs;
+    const size_t MiB = 1024 * 1024;
+    const size_t GiB = MiB  * 1024;
+    size_t vmem = 0;
+    size_t step = 256u * MiB;
+    try {
+        // start with three 1 GiB buffers, then continue in smaller steps
+        sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
+        sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
+        sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
+        while (1) {
+            sbufs.push_back(new ggml_hexagon_shared_buffer(sess, step, true));
+            vmem += step;
+        }
+    } catch (...) { } // an allocation failure is the expected way out of the probe loop
+    for (auto b : sbufs) { delete b; }
+    // backoff to account for overhead from internal mappings; clamp so that the unsigned
+    // subtraction cannot wrap around to a huge value when nothing could be allocated
+    return (vmem > step) ? (vmem - step) : 0;
+}
 void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
     this->valid_session = false;
     this->valid_handle  = false;
@@ -1566,11 +2237,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
     this->name       = std::string("HTP") + std::to_string(dev_id);
     this->op_pending  = 0;
-    this->prof_usecs  = 0;
-    this->prof_cycles = 0;
-    this->prof_pkts   = 0;
-    GGML_LOG_INFO("ggml-hex: allocating new session: %s\n", this->name.c_str());
+    GGML_LOG_DEBUG("ggml-hex: %s allocating new session\n", this->name.c_str());
     domain * my_domain = get_domain(this->domain_id);
     if (my_domain == NULL) {
@@ -1646,9 +2314,6 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
     this->valid_handle = true;
-    GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
-                  this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
     // Enable FastRPC QoS mode
     {
         struct remote_rpc_control_latency l;
@@ -1660,11 +2325,17 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
         }
     }
+    GGML_LOG_INFO("ggml-hex: %s new session : session-id %d domain-id %d uri %s handle 0x%lx\n", this->c_name(),
+                  this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
+    const size_t req_q_size = (sizeof(htp_opbatch_req) * opt_opqueue * 2) + 1024;
+    const size_t rsp_q_size = (sizeof(htp_opbatch_rsp) * opt_opqueue * 2) + 1024;
     // Now let's setup the DSP queue
     err = dspqueue_create(this->domain_id,
                           0,              // Flags
-                          128 * 1024,     // Request  queue size (in bytes)
-                          64 * 1024,      // Response queue size (in bytes)
+                          req_q_size,     // Request  queue size (in bytes)
+                          rsp_q_size,     // Response queue size (in bytes)
                           nullptr,        // Read packet callback (we handle reads explicitly)
                           nullptr,        // Error callback (we handle errors during reads)
                           (void *) this,  // Callback context
@@ -1684,18 +2355,36 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
     }
     if (opt_etm) {
-        err = htp_iface_enable_etm(this->handle);
+        err = htp_iface_etm(this->handle, 1);
         if (err != 0) {
             GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
         }
     }
-    // Start the DSP-side service. We need to pass the queue ID to the
-    // DSP in a FastRPC call; the DSP side will import the queue and start
-    // listening for packets in a callback.
-    err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx);
+    if (opt_profile) {
+        htp_iface_pmu_conf pmu_conf{};
+        std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
+        err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
+        if (err != 0) {
+            GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
+        }
+    }
+    // Allocate buffers and state for op batching
+    this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
+    if (!opt_vmem) {
+        opt_vmem = ggml_hexagon_measure_max_vmem(this);
+        GGML_LOG_INFO("ggml-hex: %s measured max vmem %zu\n", this->c_name(), opt_vmem);
+    }
+    this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch, opt_vmem);
+    // Start dspqueue/opbatch processing
+    err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx, opt_vmem);
     if (err != 0) {
-        GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
+        GGML_LOG_ERROR("ggml-hex: %s failed to start session: 0x%08x\n", this->c_name(), (unsigned) err);
         throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
     }
     this->valid_iface = true;
@@ -1706,21 +2395,32 @@ void ggml_hexagon_session::release() noexcept(true) {
     int err;
-    // Stop the DSP-side service and close the queue
     if (this->valid_iface) {
+        // Stop dspqueue/opbatch processing
         err = htp_iface_stop(this->handle);
         if (err != 0) {
             GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err);
         }
     }
+    delete this->op_batch;
+    delete this->op_queue;
     if (opt_etm) {
-        err = htp_iface_disable_etm(this->handle);
+        err = htp_iface_etm(this->handle, 0);
         if (err != 0) {
             GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
         }
     }
+    if (opt_profile) {
+        htp_iface_pmu_conf pmu_conf{};
+        err = htp_iface_profiler(this->handle, 0, &pmu_conf);
+        if (err != 0) {
+            GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
+        }
+    }
     if (this->valid_queue) {
         err = dspqueue_close(queue);
         if (err != 0) {
@@ -1737,6 +2437,9 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
     buffer_type.device        = dev;
     repack_buffer_type.device = dev;
+    op_batch = nullptr;
+    op_queue = nullptr;
     try {
         allocate(dev_id);
@@ -1799,9 +2502,66 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
         return false;
     }
-    return opt_experimental;
+    if (dst->ne[3] != 1) {
+        return false;
+    }
+    return true;
 }
+static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { // support check: true only when every test below passes
+    const struct ggml_tensor * q     = op->src[0];
+    const struct ggml_tensor * k     = op->src[1];
+    const struct ggml_tensor * v     = op->src[2];
+    const struct ggml_tensor * g     = op->src[3];
+    const struct ggml_tensor * beta  = op->src[4];
+    const struct ggml_tensor * state = op->src[5];
+    const struct ggml_tensor * dst   = op; // the op node itself is the output tensor
+    if (!q || !k || !v || !g || !beta || !state) { // all six sources are mandatory
+        return false;
+    }
+    if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 ||
+        g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 ||
+        dst->type != GGML_TYPE_F32) { // F32 only, inputs and output alike
+        return false;
+    }
+    if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) ||
+        !ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) ||
+        !ggml_is_contiguous(dst)) { // q/k/v need contiguous rows only; g, beta, state and dst must be fully contiguous
+        return false;
+    }
+    const int64_t S_v      = v->ne[0]; // all reference dims are taken from v
+    const int64_t H        = v->ne[1];
+    const int64_t n_tokens = v->ne[2];
+    const int64_t n_seqs   = v->ne[3];
+    const int64_t K        = ggml_get_op_params_i32(op, 0); // only used in the dst row count below
+    if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) { // positive dims, S_v capped at 128
+        return false;
+    }
+    if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 ||
+        q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 ||
+        (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) { // q/k match v in ne[0] and ne[2]; n_seqs must be a multiple of their ne[3]
+        return false;
+    }
+    if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) { // g->ne[0] is 1 or S_v; beta->ne[0] is 1
+        return false;
+    }
+    // state holds s0 only [S_v, S_v, H, n_seqs]; K is op param 0.
+    if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
+        return false;
+    }
+    if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) { // NOTE(review): K is not range-checked before use here - confirm callers guarantee a sane value
+        return false;
+    }
+    GGML_UNUSED(sess); // no check depends on the session
+    return true;
+}
 static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
@@ -1817,7 +2577,9 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_MXFP4:
             if (src0->ne[0] % 32) {
                 return false;
@@ -1842,6 +2604,27 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
                 GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
                 return false;
             }
+            if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
+                GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
+                return false;
+            }
+            if (ggml_nrows(src1) > 1024) {
+                return false;  // no huge batches (for now)
+            }
+            break;
+        case GGML_TYPE_F32:
+            if (src1->type != GGML_TYPE_F32) {
+                return false;
+            }
+            if (src0->nb[1] < src0->nb[0]) {
+                GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F32 src0 not supported\n");
+                return false;
+            }
+            if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) {
+                GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n");
+                return false;
+            }
             if (ggml_nrows(src1) > 1024) {
                 return false;  // no huge batches (for now)
             }
@@ -1866,7 +2649,9 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_MXFP4:
             if ((src0->ne[0] % 32)) {
                 return false;
@@ -1960,8 +2745,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
         return false;
     }
-    // TODO: add support for non-contigiuos tensors
-    if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
+    // dst must be contiguous; src0 may be non-contiguous
+    if (!ggml_is_contiguous(dst)) {
         return false;
     }
@@ -2064,8 +2849,25 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
         }
     }
-    return true;
-}
+    // Reject non-HVX-aligned sizes when ne[0] > HVX_F32_LANES
+    // The HVX softmax implementation has issues with tail handling for larger non-aligned sizes
+    // Small sizes (ne[0] <= 32) work correctly with tail-only processing
+    const int64_t ne0 = src0->ne[0];
+    if (ne0 > 32 && (ne0 & (32 - 1)) != 0) {
+        return false;
+    }
+    // HVX vector size constraints for softmax
+    #define SOFTMAX_MAX_ROW_SIZE 131072  // 128K elements max for numerical precision
+    // Reject very large row sizes to avoid numerical precision issues
+    // Softmax accumulation over many elements can lead to precision loss
+    if (ne0 > SOFTMAX_MAX_ROW_SIZE) {
+        return false;
+    }
+    return true;
+}
 static bool ggml_hexagon_supported_set_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
     const struct ggml_tensor * src0 = op->src[0]; // values
@@ -2132,7 +2934,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
     int mode = op_params[2];
-    if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
+    if (mode == GGML_ROPE_TYPE_VISION) {
         return false;
     }
     if (mode & 1) {
@@ -2206,486 +3008,238 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
     if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
         return false;
     }
-    // TODO: add support for non-contiguous tensors
-    if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
+    if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) {
+        return false;
+    }
+    if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) {
         return false;
     }
     return true;
 }
-enum dspqbuf_type {
-    DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0,
-    DSPQBUF_TYPE_CPU_WRITE_DSP_READ,
-    DSPQBUF_TYPE_CONSTANT,
-};
-static void dspqbuf_dump(dspqueue_buffer * d, const struct ggml_tensor * t, dspqbuf_type type) {
-    if (opt_verbose < 2) return;
-    auto buf  = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
-    auto sess = buf->sess;
-    GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
-                t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
-                (unsigned int) d->size);
-}
-// Init hexagon tensor from GGML tensor and Hexagon buffer
-static void htp_req_tensor_init(htp_tensor * h, const ggml_tensor * t) {
-    h->data  = 0;  // updated by the receiver
-    h->type  = t->type;
-    h->ne[0] = t->ne[0];
-    h->ne[1] = t->ne[1];
-    h->ne[2] = t->ne[2];
-    h->ne[3] = t->ne[3];
-    h->nb[0] = t->nb[0];
-    h->nb[1] = t->nb[1];
-    h->nb[2] = t->nb[2];
-    h->nb[3] = t->nb[3];
-}
+static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { // accepts the op only when src0 and dst are both F32
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * dst  = op;
-static size_t htp_req_buff_init(htp_tensor *h, dspqueue_buffer * d, const ggml_tensor * t, dspqbuf_type type) {
-    if (!t) {
-        return 0;
+    if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { // NOTE(review): type is the only thing tested; the cumsum and tri checks also test contiguity - confirm this is intentional
+        return false;
     }
-    auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
+    GGML_UNUSED(sess); // session is not consulted
+    return true;
+}
-    memset(d, 0, sizeof(*d));
-    d->fd     = buf->fd;
-    d->ptr    = t->data;
-    d->offset = (uint8_t *) t->data - buf->base;
-    d->size   = ggml_nbytes(t);
+static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { // accepts only contiguous F32 src0 and dst
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * dst  = op;
-    if (!d->size) {
-        // Some requests contain srcs where ggml_nbytes() returns 0 but the rest of the op is non-empty
-        d->size = 64;
+    if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { // F32 in, F32 out
+        return false;
     }
-    switch (type) {
-        case DSPQBUF_TYPE_DSP_WRITE_CPU_READ:
-            // Flush CPU
-            d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER;
-            break;
-        case DSPQBUF_TYPE_CPU_WRITE_DSP_READ:
-            // Flush CPU, Invalidate DSP
-            d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
-            break;
-        default:
-            // Constant buffer, no cache maintenance
-            d->flags = 0;
-            break;
+    if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { // strided or permuted tensors are rejected
+        return false;
     }
-    htp_req_tensor_init(h, t);
-    dspqbuf_dump(d, t, type);
-    return 1;
+    GGML_UNUSED(sess); // session is not consulted
+    return true;
 }
-typedef size_t (*htp_req_init_func_t)(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * op);
-template <htp_req_init_func_t _init_req_func>
-static inline void ggml_hexagon_dispatch_op(ggml_hexagon_session *sess, const struct ggml_tensor * op, uint32_t flags) {
-    uint64_t t = ggml_time_us();
-    // Construct HTP request
-    htp_general_req req;
-    memset(&req, 0, sizeof(req));
+static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { // accepts F32 diag where src0->ne[1] is 1 and dst is square with side src0->ne[0]
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * dst  = op;
-    req.flags = flags;
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
+    // diag only supports F32 currently
+    if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
+        return false;
     }
-    ggml_hexagon_dump_op_exec(sess->name, op, req.flags);
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
-        size_t n_bufs = _init_req_func(&req, bufs, op);
-        sess->enqueue(req, bufs, n_bufs, opt_opsync);
+    // Input must have ne[1] == 1 (vector input)
+    if (src0->ne[1] != 1) {
+        return false;
     }
-    t = ggml_time_us() - t;
+    // Output must be square in first two dimensions
+    if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) { // the square side must also equal src0->ne[0]
+        return false;
+    }
-    ggml_hexagon_dump_op_prof(sess->name, op, sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, t);
+    GGML_UNUSED(sess); // session is not consulted
+    return true;
 }
-template <bool _is_src0_constant>
-static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    switch (t->op) {
-        case GGML_OP_MUL_MAT:
-            req->op = HTP_OP_MUL_MAT;
-            break;
-        case GGML_OP_MUL:
-            req->op = HTP_OP_MUL;
-            break;
-        case GGML_OP_ADD:
-            req->op = HTP_OP_ADD;
-            break;
-        case GGML_OP_SUB:
-            req->op = HTP_OP_SUB;
-            break;
-        case GGML_OP_DIV:
-            req->op = HTP_OP_DIV;
-            break;
-        default:
-            GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op);
-            break;
+static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { // type and shape check with A = src0, B = src1, X = dst
+    const struct ggml_tensor * src0 = op->src[0]; // A
+    const struct ggml_tensor * src1 = op->src[1]; // B
+    const struct ggml_tensor * dst  = op;         // X
+    if (!src0 || !src1) { // both operands are required
+        return false;
     }
-    // src0: Weights (mulmat) or First Operand (binary op).
-    // If constant (e.g. weights), no cache management is needed.
-    // src1: Input Activations (mulmat) or Second Operand (binary op).
+    if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { // F32 throughout
+        return false;
+    }
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
+    if (src0->ne[0] != src0->ne[1]) { // A must be square in its first two dims
+        return false;
+    }
-    return n_bufs;
-}
+    if (src0->ne[1] != src1->ne[1]) { // A and B must agree in ne[1]
+        return false;
+    }
-static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    req->op = HTP_OP_CPY;
+    if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) { // A and B must agree in ne[2] and ne[3]
+        return false;
+    }
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
+    if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) { // X has exactly the shape of B
+        return false;
+    }
-    return n_bufs;
+    GGML_UNUSED(sess); // session is not consulted
+    return true;
 }
-static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    req->op = HTP_OP_GET_ROWS;
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
+static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { // accepts only contiguous, same-shape F32 src0 and dst
-    return n_bufs;
-}
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * dst  = op;
-static inline size_t init_argsort_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    req->op = HTP_OP_ARGSORT;
-    memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
+    if (src0->type != GGML_TYPE_F32) { return false; }
+    if (dst->type  != GGML_TYPE_F32) { return false; }
+    if (!ggml_are_same_shape(src0, dst)) { return false; } // output shape must equal input shape
+    if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; } // strided or permuted tensors are rejected
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
+    return true;
-    return n_bufs;
+    GGML_UNUSED(sess); // NOTE(review): sits after the return above, so it is never executed; the sibling checks place it before the return
 }
-template <bool _is_src0_constant>
-static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    switch (t->op) {
-        case GGML_OP_MUL_MAT_ID:
-            req->op = HTP_OP_MUL_MAT_ID;
-            break;
-        case GGML_OP_ADD_ID:
-            req->op = HTP_OP_ADD_ID;
-            break;
-        default:
-            GGML_ABORT("ggml-hex: unsupported op: %d\n", t->op);
-    }
-    // src0: Weights (mulmat) or Input Activations (other op).
-    // If constant, no cache management is needed.
-    // src1: Input Activations (mulmat) or Second Operand (binary op).
-    // src2: Expert IDs (mulmat) or Activated Experts (other op).
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
-    return n_bufs;
+static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { // returns c_name() of the session stored in the backend
+    auto sess = static_cast<ggml_hexagon_session *>(backend->context); // backend->context is used as the session pointer
+    return sess->c_name(); // NOTE(review): lifetime of the returned string is whatever c_name() guarantees - not visible here
 }
-static inline size_t init_set_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    req->op = HTP_OP_SET_ROWS;
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
-    return n_bufs;
+static void ggml_backend_hexagon_free(ggml_backend_t backend) { // frees only the backend object itself
+    // we just need to delete the backend here
+    // the sessions are allocated & freed as part of the registry
+    delete backend; // backend->context is left alone, in line with the comment above
 }
-static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
-    bool supported = false;
+static htp_op_code op_remap_to_htp(const ggml_tensor * t) { // maps t->op (plus the unary/GLU sub-op) to an HTP opcode
     switch (t->op) {
-        case GGML_OP_RMS_NORM:
-            req->op   = HTP_OP_RMS_NORM;
-            supported = true;
-            break;
-        case GGML_OP_SCALE:
-            req->op   = HTP_OP_SCALE;
-            supported = true;
-            break;
-        case GGML_OP_SQR:
-            req->op   = HTP_OP_SQR;
-            supported = true;
-            break;
-        case GGML_OP_SQRT:
-            req->op   = HTP_OP_SQRT;
-            supported = true;
-            break;
+        case GGML_OP_FLASH_ATTN_EXT:  return HTP_OP_FLASH_ATTN_EXT;
+        case GGML_OP_MUL_MAT:         return HTP_OP_MUL_MAT;
+        case GGML_OP_MUL_MAT_ID:      return HTP_OP_MUL_MAT_ID;
+        case GGML_OP_MUL:             return HTP_OP_MUL;
+        case GGML_OP_ADD:             return HTP_OP_ADD;
+        case GGML_OP_ADD_ID:          return HTP_OP_ADD_ID;
+        case GGML_OP_SUB:             return HTP_OP_SUB;
+        case GGML_OP_DIV:             return HTP_OP_DIV;
+        case GGML_OP_CPY:             return HTP_OP_CPY;
+        case GGML_OP_CONT:            return HTP_OP_CPY; // CONT shares the CPY opcode
+        case GGML_OP_GET_ROWS:        return HTP_OP_GET_ROWS;
+        case GGML_OP_SET_ROWS:        return HTP_OP_SET_ROWS;
+        case GGML_OP_SUM_ROWS:        return HTP_OP_SUM_ROWS;
+        case GGML_OP_ARGSORT:         return HTP_OP_ARGSORT;
+        case GGML_OP_NORM:            return HTP_OP_NORM;
+        case GGML_OP_L2_NORM:         return HTP_OP_L2_NORM;
+        case GGML_OP_RMS_NORM:        return HTP_OP_RMS_NORM;
+        case GGML_OP_CONCAT:          return HTP_OP_CONCAT;
+        case GGML_OP_SCALE:           return HTP_OP_SCALE;
+        case GGML_OP_SQR:             return HTP_OP_SQR;
+        case GGML_OP_SQRT:            return HTP_OP_SQRT;
+        case GGML_OP_SOFT_MAX:        return HTP_OP_SOFTMAX;
+        case GGML_OP_SSM_CONV:        return HTP_OP_SSM_CONV;
+        case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
+        case GGML_OP_ROPE:            return HTP_OP_ROPE;
+        case GGML_OP_REPEAT:          return HTP_OP_REPEAT;
+        case GGML_OP_CUMSUM:          return HTP_OP_CUMSUM;
+        case GGML_OP_FILL:            return HTP_OP_FILL;
+        case GGML_OP_DIAG:            return HTP_OP_DIAG;
+        case GGML_OP_SOLVE_TRI:       return HTP_OP_SOLVE_TRI;
+        case GGML_OP_TRI:             return HTP_OP_TRI;
+        case GGML_OP_PAD:             return HTP_OP_PAD;
         case GGML_OP_UNARY:
-            if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
-                req->op   = HTP_OP_UNARY_SILU;
-                supported = true;
-            } else if (ggml_get_unary_op(t) == GGML_UNARY_OP_GELU) {
-                req->op   = HTP_OP_UNARY_GELU;
-                supported = true;
+            switch (ggml_get_unary_op(t)) {
+                case GGML_UNARY_OP_SILU:       return HTP_OP_UNARY_SILU;
+                case GGML_UNARY_OP_GELU:       return HTP_OP_UNARY_GELU;
+                case GGML_UNARY_OP_GELU_QUICK: return HTP_OP_UNARY_GELU; // NOTE(review): GELU_QUICK is sent as plain GELU - confirm this is intended
+                case GGML_UNARY_OP_SIGMOID:    return HTP_OP_UNARY_SIGMOID;
+                case GGML_UNARY_OP_NEG:        return HTP_OP_UNARY_NEG;
+                case GGML_UNARY_OP_EXP:        return HTP_OP_UNARY_EXP;
+                case GGML_UNARY_OP_SOFTPLUS:   return HTP_OP_UNARY_SOFTPLUS;
+                case GGML_UNARY_OP_TANH:       return HTP_OP_UNARY_TANH;
+            default:
+                break; // unmapped unary sub-op: leaves the switch and ends at HTP_OP_INVALID, no abort
             }
             break;
         case GGML_OP_GLU:
-            if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU) {
-                req->op   = HTP_OP_GLU_SWIGLU;
-                supported = true;
-            } else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) {
-                req->op   = HTP_OP_GLU_SWIGLU_OAI;
-                supported = true;
-            } else if (ggml_get_glu_op(t) == GGML_GLU_OP_GEGLU) {
-                req->op   = HTP_OP_GLU_GEGLU;
-                supported = true;
+            switch (ggml_get_glu_op(t)) {
+                case GGML_GLU_OP_SWIGLU:     return HTP_OP_GLU_SWIGLU;
+                case GGML_GLU_OP_SWIGLU_OAI: return HTP_OP_GLU_SWIGLU_OAI;
+                case GGML_GLU_OP_GEGLU:      return HTP_OP_GLU_GEGLU;
+                default: break; // unmapped GLU sub-op: same silent path to HTP_OP_INVALID
             }
             break;
-        case GGML_OP_SOFT_MAX:
-            req->op   = HTP_OP_SOFTMAX;
-            supported = true;
-            break;
         default:
-            break;
+            GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(t)); // an op with no case above goes through GGML_ABORT
     }
-    if (!supported) {
-        GGML_ABORT("ggml-hex: unary : unsupported op: %d\n", t->op);
-    }
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
-    return n_bufs;
-}
-static inline size_t init_sum_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
-    req->op = HTP_OP_SUM_ROWS;
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
-    return n_bufs;
+    return HTP_OP_INVALID; // reached for unmapped unary/GLU sub-ops; NOTE(review): confirm callers handle this value
 }
-static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
-    req->op = HTP_OP_ROPE;
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
-    return n_bufs;
-}
-static inline size_t init_flash_attn_ext_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
-    req->op = HTP_OP_FLASH_ATTN_EXT;
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src3, &bufs[n_bufs], t->src[3], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src4, &bufs[n_bufs], t->src[4], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
-    return n_bufs;
-}
-static inline size_t init_ssm_conv_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
-    req->op = HTP_OP_SSM_CONV;
-    size_t n_bufs = 0;
-    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
-    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CONSTANT);
-    n_bufs += htp_req_buff_init(&req->dst,  &bufs[n_bufs], t,         DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
-    return n_bufs;
-}
-static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
-    auto sess = static_cast<ggml_hexagon_session *>(backend->context);
-    return sess->name.c_str();
-}
-static void ggml_backend_hexagon_free(ggml_backend_t backend) {
-    // we just need to delete the backend here
-    // the sessions are allocated & freed as part of the registry
-    delete backend;
-}
-static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
-    return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
-}
-static inline bool is_compute_op(ggml_tensor *node)
+static inline bool op_is_compute(ggml_tensor *node) // true when the op is not an empty op, the tensor is not empty, and GGML_TENSOR_FLAG_COMPUTE is set
 {
     return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
 }
-// scan the graph and figure out last compute op index
-static inline int last_compute_op(ggml_cgraph * graph) {
-    int last = 0;
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        if (is_compute_op(graph->nodes[i])) {
-            last = i;
-        }
-    }
-    return last;
-}
 static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
     auto sess = static_cast<ggml_hexagon_session *>(backend->context);
-    HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes);
-    const int last = last_compute_op(graph);
+    HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
-    const struct ggml_tensor * prev_op = nullptr;  // prev executed op
+    std::vector<htp_opnode> nodes;
+    nodes.reserve(graph->n_nodes);
+    // Fusion
     for (int i = 0; i < graph->n_nodes; ++i) {
-        ggml_tensor * node = graph->nodes[i];
-        if (!is_compute_op(node)) {
+        ggml_tensor * n = graph->nodes[i];
+        if (!op_is_compute(n)) {
             continue;
         }
-        uint32_t flags = 0;
-        // skip quantizer if src1 is reused
-        if (op_reuse_src1(node, prev_op)) {
-            flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-        }
+        ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr;
-        prev_op = node;
+        htp_opnode node = {
+            /*.node =*/ n,
+            /*.fused =*/ {},
+            /*.opcode =*/ HTP_OP_INVALID
+        };
-        // ask for early notification for the last Op
-        if (i == last) {
-            flags |= HTP_OPFLAGS_EARLY_WAKEUP;
+        if (n->op == GGML_OP_RMS_NORM && next_node) {
+            if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
+                node.add_fused(next_node);
+                node.opcode = HTP_OP_RMS_NORM_MUL;
+                i++; // skip the fused MUL node
+            }
         }
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                if (ggml_is_quantized(node->src[0]->type)) {
-                    ggml_hexagon_dispatch_op<init_binary_req<true>>(sess, node, flags);
-                } else {
-                    ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
-                }
-                break;
-            case GGML_OP_MUL_MAT_ID:
-                if (ggml_is_quantized(node->src[0]->type)) {
-                    ggml_hexagon_dispatch_op<init_binary_id_req<true>>(sess, node, flags);
-                } else {
-                    ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
-                }
-                break;
-            case GGML_OP_MUL:
-            case GGML_OP_ADD:
-            case GGML_OP_SUB:
-            case GGML_OP_DIV:
-                ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
-                break;
-            case GGML_OP_ADD_ID:
-                ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
-                break;
-            case GGML_OP_RMS_NORM:
-            case GGML_OP_SCALE:
-                ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
-                break;
-            case GGML_OP_SQR:
-            case GGML_OP_SQRT:
-                ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
-                break;
-            case GGML_OP_SUM_ROWS:
-                ggml_hexagon_dispatch_op<init_sum_rows_req>(sess, node, flags);
-                break;
-            case GGML_OP_UNARY:
-                if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
-                        (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
-                    ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
-                }
-                break;
-            case GGML_OP_GLU:
-                if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
-                        (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI) ||
-                        (ggml_get_glu_op(node) == GGML_GLU_OP_GEGLU)) {
-                    ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
-                }
-                break;
-            case GGML_OP_SOFT_MAX:
-                ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
-                break;
-            case GGML_OP_ROPE:
-                ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
-                break;
-            case GGML_OP_FLASH_ATTN_EXT:
-                ggml_hexagon_dispatch_op<init_flash_attn_ext_req>(sess, node, flags);
-                break;
-            case GGML_OP_SET_ROWS:
-                ggml_hexagon_dispatch_op<init_set_rows_req>(sess, node, flags);
-                break;
-            case GGML_OP_GET_ROWS:
-                ggml_hexagon_dispatch_op<init_get_rows_req>(sess, node, flags);
-                break;
-            case GGML_OP_CPY:
-                ggml_hexagon_dispatch_op<init_cpy_req>(sess, node, flags);
-                break;
-            case GGML_OP_ARGSORT:
-                ggml_hexagon_dispatch_op<init_argsort_req>(sess, node, flags);
-                break;
+        if (node.opcode == HTP_OP_INVALID) {
+            node.opcode = op_remap_to_htp(n);
+        }
-            case GGML_OP_SSM_CONV:
-                ggml_hexagon_dispatch_op<init_ssm_conv_req>(sess, node, flags);
-                break;
+        nodes.push_back(std::move(node));
+    }
-            default:
-                GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
+    // Queue and execute
+    if (opt_opstage & HTP_OPSTAGE_QUEUE) {
+        for (const auto & node : nodes) {
+            sess->enqueue_op(node);
         }
     }
@@ -2698,57 +3252,13 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
     auto sess = static_cast<ggml_hexagon_session *>(backend->context);
-    HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str());
+    HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->c_name());
     // Wait until all pending ops complete
     sess->flush();
 }
-struct node_info {
-    ggml_tensor * node;
-    std::vector<ggml_tensor *> fused;
-    ggml_op op() const {
-        return node->op;
-    }
-    const ggml_tensor * dst() const {
-        return fused.empty() ? node : fused.back();
-    }
-    const ggml_tensor * src0() const {
-        return node->src[0];
-    }
-    const ggml_tensor * src1() const {
-        return node->src[1];
-    }
-    bool is_empty() const {
-        return ggml_op_is_empty(node->op);
-    }
-    void add_fused(ggml_tensor * t) {
-        fused.push_back(t);
-    }
-    bool stackable() const {
-        switch (this->op()) {
-            case GGML_OP_MUL_MAT:
-            case GGML_OP_MUL_MAT_ID:
-                return ggml_is_quantized(this->src0()->type);
-            default:
-                return false;
-        }
-    }
-    bool same_input(const node_info& n) const {
-        return n.src1() == this->src1();
-    }
-};
-static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
+static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<htp_opnode> & nodes) {
     const int n = nodes.size();
     std::vector<int> res;
@@ -2802,14 +3312,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
     enum ggml_op ops[MAX_FUSE];
-    std::vector<node_info> nodes;
+    std::vector<htp_opnode> nodes;
     nodes.reserve(gf->n_nodes);
     // fuse nodes:
     // we don't want to make reorders that break fusing, so we first pack all fusable tensors
     //   and perform the reorder over the fused nodes. after the reorder is done, we unfuse
     for (int i = 0; i < n; i++) {
-        node_info node = {
+        htp_opnode node = {
             /*.node =*/gf->nodes[i],
             /*.fused =*/{},
         };
@@ -2876,6 +3386,8 @@ static struct ggml_backend_i hexagon_backend_i = {
     /* .free                    = */ ggml_backend_hexagon_free,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
+    /* .set_tensor_2d_async     = */ NULL,
+    /* .get_tensor_2d_async     = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
     /* .synchronize             = */ ggml_backend_hexagon_synchronize,
     /* .graph_plan_create       = */ NULL,
@@ -2915,7 +3427,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, c
 static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
     auto sess = static_cast<ggml_hexagon_session *>(dev->context);
-    return sess->name.c_str();
+    return sess->c_name();
     GGML_UNUSED(dev);
 }
@@ -2926,8 +3438,7 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
 }
 static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // ~2GB per session for now
-    *free  = 2ULL * 1024 * 1024 * 1024;
+    *free  = 0;
     *total = *free;
     GGML_UNUSED(dev);
@@ -3006,9 +3517,77 @@ static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess,
     return true;
 }
+// Reports whether a CONT op can be offloaded: accepted for f32 and f16 sources only.
+// Only the type of src[0] is inspected here; the session is not consulted.
+static bool ggml_hexagon_supported_cont(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    GGML_UNUSED(sess);
+    const auto src_type = op->src[0]->type;
+    return src_type == GGML_TYPE_F32 || src_type == GGML_TYPE_F16;
+}
+// Reports whether a REPEAT op can be offloaded.
+// Requirements: src0 and dst are both f32 or both f16, every dst dimension is a whole
+// multiple of the matching (non-empty) src0 dimension, and neither tensor is transposed.
+static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    GGML_UNUSED(sess);
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * dst  = op;
+    // Support f32 and f16
+    if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
+    // src and dst must be the same type
+    if (src0->type != dst->type) return false;
+    // dst dims must be multiples of src dims
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        // an empty src dim would make the modulo below divide by zero, so reject it up front
+        if (src0->ne[i] == 0) return false;
+        if (dst->ne[i] % src0->ne[i] != 0) return false;
+    }
+    // reject transposed tensors (this is the only layout check performed here)
+    if (ggml_is_transposed(src0) || ggml_is_transposed(dst)) return false;
+    return true;
+}
+// Reports whether a CONCAT op can be offloaded: the concat dimension stored in op_params[0]
+// must be a valid dimension index, and every non-null source must be f32, i32 or f16.
+static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    // sess is not consulted; mark it used the same way the other supported_* helpers do
+    GGML_UNUSED(sess);
+    const int dim = ((const int32_t *) op->op_params)[0];
+    if (dim < 0 || dim >= GGML_MAX_DIMS) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_SRC; ++i) {
+        const struct ggml_tensor * src = op->src[i];
+        if (!src) {
+            continue; // unused source slot
+        }
+        if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) {
+            return false;
+        }
+    }
+    return true;
+}
+// Reports whether a FILL op can be offloaded: the destination (the op tensor itself)
+// must be f32 or f16. The session is not consulted.
+static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    GGML_UNUSED(sess);
+    const auto dst_type = op->type;
+    return dst_type == GGML_TYPE_F32 || dst_type == GGML_TYPE_F16;
+}
 static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     auto sess = static_cast<ggml_hexagon_session *>(dev->context);
+    // reject ops that match the filter
+    if (opt_opfilter && std::regex_match(ggml_op_desc(op), *opt_opfilter)) {
+        return false;
+    }
     // all srcs & dsts must be mapped to the same session
     if (!ggml_hexagon_supported_buffers(sess, op)) {
         ggml_hexagon_dump_op_supp(sess->name, op, false);
@@ -3025,6 +3604,13 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             supp = true;
             break;
+        case GGML_OP_MUL:
+        case GGML_OP_ADD:
+        case GGML_OP_SUB:
+        case GGML_OP_DIV:
+            supp = ggml_hexagon_supported_binary(sess, op);
+            break;
         case GGML_OP_MUL_MAT:
             supp = ggml_hexagon_supported_mul_mat(sess, op);
             break;
@@ -3033,17 +3619,12 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             supp = ggml_hexagon_supported_mul_mat_id(sess, op);
             break;
-        case GGML_OP_MUL:
-        case GGML_OP_ADD:
-        case GGML_OP_SUB:
-        case GGML_OP_DIV:
-            supp = ggml_hexagon_supported_binary(sess, op);
-            break;
         case GGML_OP_ADD_ID:
             supp = ggml_hexagon_supported_add_id(sess, op);
             break;
+        case GGML_OP_NORM:
+        case GGML_OP_L2_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SCALE:
             supp = ggml_hexagon_supported_unary(sess, op);
@@ -3063,21 +3644,36 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             break;
         case GGML_OP_UNARY:
-            {
-                const auto unary_op = ggml_get_unary_op(op);
-                if (unary_op == GGML_UNARY_OP_SILU || unary_op == GGML_UNARY_OP_GELU) {
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_NEG:
+                case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_SIGMOID:
+                case GGML_UNARY_OP_SOFTPLUS:
+                case GGML_UNARY_OP_TANH:
+                    supp = ggml_hexagon_supported_unary(sess, op);
+                    break;
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_QUICK:
                     supp = ggml_hexagon_supported_activations(sess, op);
-                }
-                break;
+                    break;
+                default:
+                    break;
             }
+            break;
         case GGML_OP_GLU:
-            {
-                const auto glu_op = ggml_get_glu_op(op);
-                if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI) || (glu_op == GGML_GLU_OP_GEGLU)) {
+            switch (ggml_get_glu_op(op)) {
+                case GGML_GLU_OP_SWIGLU:
+                case GGML_GLU_OP_SWIGLU_OAI:
+                case GGML_GLU_OP_GEGLU:
                     supp = ggml_hexagon_supported_activations(sess, op);
-                }
-                break;
+                    break;
+                default:
+                    break;
             }
+            break;
         case GGML_OP_ROPE:
             supp = ggml_hexagon_supported_rope(sess, op);
             break;
@@ -3098,6 +3694,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             supp = ggml_hexagon_supported_cpy(sess, op);
             break;
+        case GGML_OP_CONT:
+            supp = ggml_hexagon_supported_cont(sess, op);
+            break;
+        case GGML_OP_REPEAT:
+            supp = ggml_hexagon_supported_repeat(sess, op);
+            break;
         case GGML_OP_ARGSORT:
             supp = ggml_hexagon_supported_argsort(sess, op);
             break;
@@ -3106,6 +3710,38 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             supp = ggml_hexagon_supported_ssm_conv(sess, op);
             break;
+        case GGML_OP_GATED_DELTA_NET:
+            supp = ggml_hexagon_supported_gated_delta_net(sess, op);
+            break;
+        case GGML_OP_CUMSUM:
+            supp = ggml_hexagon_supported_cumsum(sess, op);
+            break;
+        case GGML_OP_CONCAT:
+            supp = ggml_hexagon_supported_concat(sess, op);
+            break;
+        case GGML_OP_FILL:
+            supp = ggml_hexagon_supported_fill(sess, op);
+            break;
+        case GGML_OP_DIAG:
+            supp = ggml_hexagon_supported_diag(sess, op);
+            break;
+        case GGML_OP_SOLVE_TRI:
+            supp = ggml_hexagon_supported_solve_tri(sess, op);
+            break;
+        case GGML_OP_TRI:
+            supp = ggml_hexagon_supported_tri(sess, op);
+            break;
+        case GGML_OP_PAD:
+            supp = ggml_hexagon_supported_pad(sess, op);
+            break;
         default:
             break;
     }
@@ -3172,21 +3808,6 @@ struct ggml_hexagon_registry {
 ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
     GGML_LOG_INFO("ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev);
-    if (!opt_arch) {
-        int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
-        if (err != 0) {
-            GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
-            opt_arch = 73;
-        }
-    }
-#if defined(__ANDROID__)
-    if (opt_arch < 75) {
-        opt_ndev = 1;
-        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
-    }
-#endif
     GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
     // Create devices / sessions
@@ -3241,53 +3862,117 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
     return NULL;
 }
+// Parses a comma-separated list of unsigned integers (e.g. "1,0x10,7") into a vector of T.
+// Each token goes through std::stoul with base 0, so the base is auto-detected from the
+// token's prefix. A null str yields an empty vector.
+// std::stoul throws std::invalid_argument when a token does not start with a number (this
+// includes an empty token such as in "1,,2") and std::out_of_range when the value does not
+// fit in unsigned long; those exceptions propagate to the caller.
+template<typename T> std::vector<T> str_to_vec(const char* str) {
+    std::vector<T> v;
+    if (!str) {
+        return v; // building the stream from a null pointer would be undefined behavior
+    }
+    std::stringstream ss(str);
+    std::string       t;
+    while (std::getline(ss, t, ',')) {
+        // stoul returns unsigned long; make the conversion to T explicit
+        v.push_back(static_cast<T>(std::stoul(t, nullptr, 0)));
+    }
+    return v;
+}
+// Formats the elements of v as a comma-separated string, written in the given BASE with
+// std::showbase enabled. An empty vector produces an empty string.
+template<typename T, int BASE=10> std::string vec_to_str(const std::vector<T> & v) {
+    std::stringstream ss;
+    ss << std::setbase(BASE) << std::showbase;
+    // Emit the separator before every element except the first, so there is no trailing
+    // comma to remove afterwards (pop_back() on an empty string is undefined behavior).
+    const char * sep = "";
+    for (const auto & i : v) {
+        ss << sep << i;
+        sep = ",";
+    }
+    return ss.str();
+}
 static void ggml_hexagon_init(ggml_backend_reg * reg) {
     // Basic sanity checks to make sure definitions match
     static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
                   "please update hexagon_type to match ggml_type");
+    static_assert((unsigned int) HTP_TYPE_Q4_1 == (unsigned int) GGML_TYPE_Q4_1,
+                  "please update hexagon_type to match ggml_type");
     static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
                   "please update hexagon_type to match ggml_type");
     static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
                   "please update hexagon_type to match ggml_type");
+    static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
+                  "please update hexagon_type to match ggml_type");
-    const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL");
-    const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
-    const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
-    const char * str_opmask  = getenv("GGML_HEXAGON_OPMASK");
-    const char * str_opsync  = getenv("GGML_HEXAGON_OPSYNC");
-    const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
-    const char * str_etm     = getenv("GGML_HEXAGON_ETM");
-    const char * str_nhvx    = getenv("GGML_HEXAGON_NHVX");
-    const char * str_ndev    = getenv("GGML_HEXAGON_NDEV");
-    const char * str_arch    = getenv("GGML_HEXAGON_ARCH");
-    opt_experimental = str_experimental ? atoi(str_experimental) : 0;
-    opt_verbose      = str_verbose ? atoi(str_verbose) : 0;
-    opt_hostbuf      = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
-    opt_opmask       = str_opmask  ? strtoul(str_opmask, NULL, 0) : opt_opmask;
-    opt_opsync       = str_opsync  ? atoi(str_opsync)  : 0;
-    opt_profile      = str_profile ? atoi(str_profile) : 0;
-    opt_etm          = str_etm     ? atoi(str_etm) : 0;
-    opt_nhvx         = str_nhvx    ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
-    opt_ndev         = str_ndev    ? strtoul(str_ndev, NULL, 0) : opt_ndev;
+    const char * str_verbose  = getenv("GGML_HEXAGON_VERBOSE");
+    const char * str_hostbuf  = getenv("GGML_HEXAGON_HOSTBUF");
+    const char * str_opstage  = getenv("GGML_HEXAGON_OPSTAGE");
+    const char * str_opbatch  = getenv("GGML_HEXAGON_OPBATCH");
+    const char * str_opqueue  = getenv("GGML_HEXAGON_OPQUEUE");
+    const char * str_oppoll   = getenv("GGML_HEXAGON_OPPOLL");
+    const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
+    const char * str_profile  = getenv("GGML_HEXAGON_PROFILE");
+    const char * str_etm      = getenv("GGML_HEXAGON_ETM");
+    const char * str_nhvx     = getenv("GGML_HEXAGON_NHVX");
+    const char * str_use_hmx  = getenv("GGML_HEXAGON_USE_HMX");
+    const char * str_ndev     = getenv("GGML_HEXAGON_NDEV");
+    const char * str_arch     = getenv("GGML_HEXAGON_ARCH");
+    const char * str_vmem     = getenv("GGML_HEXAGON_VMEM");
+    const char * str_mbuf     = getenv("GGML_HEXAGON_MBUF");
+    // Init Arch first since it affects other defaults
+    if (!str_arch) {
+        int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
+        if (err != 0) {
+            GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
+            opt_arch = 73;
+        }
+    } else {
+        if (str_arch[0] == 'v' || str_arch[0] == 'V') {
+            str_arch++;
+        }
+        opt_arch = strtoul(str_arch, NULL, 0);
+    }
+    size_t MiB = 1024 * 1024;
+    // Update vmem default
+    opt_vmem = opt_arch >= 75 ? HTP_OP_MAX_VMEM_DEFAULT : 3000 * MiB;
+    auto RE_ICASE = std::regex_constants::icase;
+    opt_opfilter  = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
+    opt_verbose   = str_verbose  ? atoi(str_verbose)                      : 0;
+    opt_hostbuf   = str_hostbuf  ? atoi(str_hostbuf)                      : opt_hostbuf;
+    opt_opstage   = str_opstage  ? strtoul(str_opstage, NULL, 0)          : opt_opstage;
+    opt_opbatch   = str_opbatch  ? strtoul(str_opbatch, NULL, 0)          : opt_opbatch;
+    opt_opqueue   = str_opqueue  ? strtoul(str_opqueue, NULL, 0)          : opt_opqueue;
+    opt_oppoll    = str_oppoll   ? strtoul(str_oppoll,  NULL, 0)          : opt_oppoll;
+    opt_profile   = str_profile  ? atoi(str_profile)                      : 0;
+    opt_etm       = str_etm      ? atoi(str_etm)                          : 0;
+    opt_nhvx      = str_nhvx     ? strtoul(str_nhvx, NULL, 0)             : opt_nhvx;
+    opt_use_hmx   = str_use_hmx  ? atoi(str_use_hmx)                      : opt_use_hmx;
+    opt_ndev      = str_ndev     ? strtoul(str_ndev, NULL, 0)             : opt_ndev;
+    opt_hostbuf   = str_hostbuf  ? atoi(str_hostbuf)                      : opt_hostbuf;
+    opt_mbuf      = str_mbuf     ? strtoul(str_mbuf, NULL, 0) * MiB       : opt_mbuf;
+    opt_vmem      = str_vmem     ? strtoul(str_vmem, NULL, 0) * MiB       : opt_vmem;
     if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
         opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
     }
-    if (str_arch) {
-        if (str_arch[0] == 'v') {
-            str_arch++;
-        }
-        opt_arch = strtoul(str_arch, NULL, 0);
+#if defined(__ANDROID__)
+    if (opt_arch < 75) {
+        opt_ndev = 1;
+        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
     }
+#endif
-    opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;
+    if (str_profile) {
+        opt_pmu_evt = [&]() -> std::vector<uint32_t> {
+            auto v  = str_to_vec<uint32_t>(str_profile);
+            switch (v.size()) {
+                case 1:  opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
+                case 8:  opt_profile = 2;    return v;           // mode with custom  pmu events
+                default: opt_profile = 0;    return {};          // garbage input
+            }}();
+        if (opt_profile == 1) opt_pmu_evt = {};
+        GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
+                vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
+    }
     reg->context = new ggml_hexagon_registry(reg);
-    HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
-                sizeof(struct htp_general_rsp));
 }
 static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = {