npm - @fugood/llama.node - Versions diffs - 0.3.13 → 0.3.15 - Mend

@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (184) hide show

package/bin/darwin/arm64/llama-node.node +0 -0
package/bin/darwin/x64/llama-node.node +0 -0
package/bin/linux/arm64/llama-node.node +0 -0
package/bin/linux/x64/llama-node.node +0 -0
package/bin/linux-cuda/arm64/llama-node.node +0 -0
package/bin/linux-cuda/x64/llama-node.node +0 -0
package/bin/linux-vulkan/arm64/llama-node.node +0 -0
package/bin/linux-vulkan/x64/llama-node.node +0 -0
package/bin/win32/arm64/llama-node.node +0 -0
package/bin/win32/arm64/node.lib +0 -0
package/bin/win32/x64/llama-node.node +0 -0
package/bin/win32/x64/node.lib +0 -0
package/bin/win32-vulkan/arm64/llama-node.node +0 -0
package/bin/win32-vulkan/arm64/node.lib +0 -0
package/bin/win32-vulkan/x64/llama-node.node +0 -0
package/bin/win32-vulkan/x64/node.lib +0 -0
package/lib/binding.ts +1 -1
package/package.json +1 -1
package/src/LlamaContext.cpp +98 -76
package/src/LlamaContext.h +1 -1
package/src/common.hpp +1 -2
package/src/llama.cpp/.github/workflows/build.yml +89 -10
package/src/llama.cpp/.github/workflows/server.yml +2 -0
package/src/llama.cpp/CMakeLists.txt +9 -1
package/src/llama.cpp/cmake/common.cmake +2 -0
package/src/llama.cpp/common/CMakeLists.txt +3 -3
package/src/llama.cpp/common/arg.cpp +132 -13
package/src/llama.cpp/common/chat.cpp +960 -266
package/src/llama.cpp/common/chat.h +135 -0
package/src/llama.cpp/common/common.cpp +33 -174
package/src/llama.cpp/common/common.h +27 -67
package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
package/src/llama.cpp/common/ngram-cache.cpp +1 -0
package/src/llama.cpp/common/sampling.cpp +45 -7
package/src/llama.cpp/common/speculative.cpp +10 -9
package/src/llama.cpp/common/speculative.h +1 -1
package/src/llama.cpp/docs/build.md +45 -7
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
package/src/llama.cpp/examples/infill/infill.cpp +2 -2
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
package/src/llama.cpp/examples/llava/clip.cpp +373 -107
package/src/llama.cpp/examples/llava/clip.h +19 -3
package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
package/src/llama.cpp/examples/llava/llava.cpp +4 -2
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
package/src/llama.cpp/examples/main/main.cpp +79 -34
package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
package/src/llama.cpp/examples/run/run.cpp +196 -108
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
package/src/llama.cpp/examples/server/server.cpp +113 -101
package/src/llama.cpp/examples/server/utils.hpp +94 -105
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
package/src/llama.cpp/examples/tts/tts.cpp +263 -151
package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
package/src/llama.cpp/ggml/include/ggml.h +29 -1
package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
package/src/llama.cpp/ggml/src/ggml.c +93 -5
package/src/llama.cpp/include/llama.h +105 -27
package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
package/src/llama.cpp/requirements/requirements-all.txt +1 -0
package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
package/src/llama.cpp/requirements.txt +1 -0
package/src/llama.cpp/src/CMakeLists.txt +5 -2
package/src/llama.cpp/src/llama-adapter.cpp +19 -20
package/src/llama.cpp/src/llama-adapter.h +11 -9
package/src/llama.cpp/src/llama-arch.cpp +123 -16
package/src/llama.cpp/src/llama-arch.h +19 -0
package/src/llama.cpp/src/llama-batch.h +2 -2
package/src/llama.cpp/src/llama-chat.cpp +1 -0
package/src/llama.cpp/src/llama-context.cpp +2253 -1222
package/src/llama.cpp/src/llama-context.h +214 -77
package/src/llama.cpp/src/llama-cparams.h +1 -0
package/src/llama.cpp/src/llama-grammar.cpp +182 -182
package/src/llama.cpp/src/llama-grammar.h +12 -3
package/src/llama.cpp/src/llama-graph.cpp +1662 -0
package/src/llama.cpp/src/llama-graph.h +574 -0
package/src/llama.cpp/src/llama-hparams.cpp +8 -0
package/src/llama.cpp/src/llama-hparams.h +9 -0
package/src/llama.cpp/src/llama-io.cpp +15 -0
package/src/llama.cpp/src/llama-io.h +35 -0
package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
package/src/llama.cpp/src/llama-kv-cache.h +178 -109
package/src/llama.cpp/src/llama-memory.cpp +1 -0
package/src/llama.cpp/src/llama-memory.h +21 -0
package/src/llama.cpp/src/llama-mmap.cpp +11 -1
package/src/llama.cpp/src/llama-model.cpp +8230 -122
package/src/llama.cpp/src/llama-model.h +34 -1
package/src/llama.cpp/src/llama-quant.cpp +10 -1
package/src/llama.cpp/src/llama-sampling.cpp +43 -10
package/src/llama.cpp/src/llama-vocab.cpp +12 -0
package/src/llama.cpp/src/llama.cpp +51 -9837
package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
package/src/llama.cpp/tests/test-chat.cpp +593 -395
package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
package/src/llama.cpp/Sources/llama/llama.h +0 -4
package/src/llama.cpp/common/chat.hpp +0 -55
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0

package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp ADDED Viewed

@@ -0,0 +1,11 @@
+#ifndef GGML_SYCL_CPY_HPP
+#define GGML_SYCL_CPY_HPP
+#include "common.hpp"
+typedef void (*cpy_kernel_t)(const char * cx, char * cdst); // element copy kernel: cx is read-only input bytes, cdst is writable output bytes
+void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); // declared only; presumably copies src0's data into src1's buffer - confirm in cpy.cpp
+void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst); // declared only; presumably duplicates dst's source tensor into dst - confirm in cpy.cpp
+#endif // GGML_SYCL_CPY_HPP

package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp CHANGED Viewed

@@ -16,6 +16,8 @@
 #include "common.hpp"
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
+typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs, // reorder variant of dequantize_kernel_t: d and qs are passed as two separate pointers instead of one vx
+                                            const int iqs, dfloat2 &v); // v receives the two dequantized values
 static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
@@ -40,6 +42,29 @@ static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
 #endif // GGML_SYCL_F16
 }
+static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int64_t ib, const void *qs, // d_ptr: array of sycl::half scales, ib: index into that array, qs: packed quant bytes
+                                            const int iqs, dfloat2 &v) { // iqs: byte offset into qs, v: output pair
+    // No block_q4_0 struct is dereferenced here: the scale is read from d_ptr and the quant byte from qs.
+    const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib); // scale at index ib, converted to dfloat
+    const int vui = *((const uint8_t *)qs+iqs); // one byte carrying two 4-bit values
+    v.x() = vui & 0xF; // low nibble
+    v.y() = vui >> 4; // high nibble
+#ifdef GGML_SYCL_F16
+    // v = v - {8.0f, 8.0f};
+    // v = v * {d, d};
+    v.s0() = (v.s0() - 8.0f) * d; // same formula as the #else branch, written with the s0()/s1() accessors
+    v.s1() = (v.s1() - 8.0f) * d;
+#else
+    v.x() = (v.x() - 8.0f) * d; // recentre the 0..15 nibble by 8, then scale
+    v.y() = (v.y() - 8.0f) * d;
+#endif // GGML_SYCL_F16
+}
 static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
                                             const int iqs, dfloat2 &v) {
     const block_q4_1 * x = (const block_q4_1 *) vx;
@@ -167,6 +192,36 @@ static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restri
     }
 }
+template<typename dst_t> // Each work-item dequantizes one block of 4-bit quants from the reordered buffer vx into yy.
+static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const int64_t i = item_ct1.get_group(2); // work-group index along dimension 2
+    auto k=nb32; // NOTE(review): despite the name, this value is used below as an element count (k / QK4_0 blocks, k / 2 quant bytes) - confirm with the caller
+    // assume 32 threads (the block index below is nevertheless computed with WARP_SIZE)
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int lane_ib = i * WARP_SIZE + tid; // block handled by this work-item; int64_t operands are narrowed to int
+    if (lane_ib >= k / QK4_0) { // beyond the last block: nothing to write
+        return;
+    }
+    dst_t * y_ptr = yy + lane_ib * QK4_0; // first output element of this block
+    auto qs = (const uint8_t*)vx + lane_ib * QK4_0 / 2; // this block's QK4_0 / 2 packed quant bytes
+    auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k / 2) + lane_ib; // scales start k / 2 bytes into vx, one sycl::half per block
+    const float d = float(*s_ptr);
+#pragma unroll
+    for (int l = 0; l < QK4_0 / 2; ++l) {
+        int vq = qs[l];
+        y_ptr[l + 0] = d * ((vq & 0xF) - 8); // low nibble -> first half of the block
+        y_ptr[l + 16] = d * ((vq >> 4) - 8); // high nibble -> second half; the literal 16 presumably equals QK4_0 / 2 - confirm
+    }
+}
 template<typename dst_t>
 static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
                                   const sycl::nd_item<3> &item_ct1) {

package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp CHANGED Viewed

@@ -3,7 +3,6 @@
 #include "dequantize.hpp"
 #include "presets.hpp"
 static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const sycl::half *x = (const sycl::half *)vx;
@@ -91,6 +90,112 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
     }
 }
+template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_reorder> // Computes dst[row] as the dot product of one row of the reordered quantized matrix vx with the vector y.
+static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
+                                   const sycl::nd_item<3> &item_ct1) {
+    // qk = quantized weights per x block
+    // qr = number of quantized weights per data value in x block
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1); // row index derived from the group id and the local id along dimension 1
+    if (row >= nrows) { // rows past nrows produce no output
+        return;
+    }
+    const int tid = item_ct1.get_local_id(2); // lane index along dimension 2
+    const int ncols_left = ncols % (QK4_0*WARP_SIZE); // columns left over after whole QK4_0*WARP_SIZE chunks; NOTE(review): QK4_0 is used here rather than the template parameter qk
+    const int ncols_align = ncols - ncols_left; // prefix covered by the main loop
+    const int iter_stride = 8*2*GGML_SYCL_DMMV_X; // columns advanced per loop iteration
+    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter //64/16=4, 512/16/2= 16
+    const int y_offset = qr == 1 ? 1 : qk/2; // distance in y between the operands paired with v.x() and v.y()
+// partial sum for each thread
+#ifdef GGML_SYCL_F16
+    sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_SYCL_F16
+    const char *d_ptr = (const char*)vx+ncols*nrows/2; // scales are read from here, after ncols*nrows/2 quant bytes; NOTE(review): the product is evaluated in int - confirm it cannot overflow
+    int i=0; // shared by both loops so the tail loop resumes where the main loop stopped
+    for (i = 0; i < ncols_align; i += iter_stride) { // main loop over the aligned prefix
+        const int col = i + vals_per_iter*tid;
+        const int ib = (row*ncols + col)/qk; // x block index
+        const int iqs = (col%qk)/qr; // x quant index
+        const int iybs = col - col%qk; // y block start index
+// processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v); // 4th argument: start of block ib (ib * QK4_0 / 2) plus the quant index within it
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_SYCL_F16
+            dfloat2 t1{y[iybs + iqs + j / qr + 0],
+                        y[iybs + iqs + j / qr + y_offset]};
+            tmp += v * t1;
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_SYCL_F16
+        }
+    }
+    for (; i < ncols; i += iter_stride) { // tail loop over the remaining ncols_left columns
+        if (tid>=ncols_left/QK4_0) continue; // only the first ncols_left / QK4_0 lanes take part; NOTE(review): lanes advance by vals_per_iter columns, so this presumably relies on vals_per_iter == QK4_0 - confirm
+        const int col = i + vals_per_iter*tid;
+        const int ib = (row*ncols + col)/qk; // x block index
+        const int iqs = (col%qk)/qr; // x quant index
+        const int iybs = col - col%qk; // y block start index
+// processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v); // same addressing as in the main loop
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_SYCL_F16
+            dfloat2 t1{y[iybs + iqs + j / qr + 0],
+                        y[iybs + iqs + j / qr + y_offset]};
+            tmp += v * t1;
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_SYCL_F16
+        }
+    }
+    // sum up partial sums and write back result
+    const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2; // the reduction starts with a smaller mask when ncols <= GGML_SYCL_DMMV_X
+    for (int mask = mask_start; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); // presumably the partial sum held by lane (id ^ mask) - see the dpct helper
+    }
+    if (tid == 0) { // a single lane writes the row result
+#ifdef GGML_SYCL_F16
+        dst[row] = tmp.x() + tmp.y(); // fold the two half-precision accumulators
+#else
+        dst[row] = tmp;
+#endif // GGML_SYCL_F16
+    }
+}
 static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
                                          float *dst, const int ncols,
                                          const int nrows,
@@ -105,7 +210,7 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
                                                           nrows, item_ct1);
             });
@@ -759,6 +864,28 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
     }
 }
+static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y, // Submits dequantize_mul_mat_vec_reorder, instantiated for Q4_0, to stream.
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); // ncols must be a multiple of GGML_SYCL_DMMV_X
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; // nrows / GGML_SYCL_MMV_Y, rounded up
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); // GGML_SYCL_MMV_Y rows per work-group, WARP_SIZE work-items per row
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16}); // the device must report the fp16 aspect
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
 static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
                                              float *dst, const int ncols,
@@ -775,7 +902,7 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });
@@ -796,7 +923,7 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });
@@ -817,7 +944,7 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });
@@ -838,7 +965,7 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });
@@ -859,7 +986,7 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });
@@ -877,7 +1004,7 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }
@@ -893,7 +1020,7 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }
@@ -909,7 +1036,7 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }
@@ -922,7 +1049,7 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
         });
 }
@@ -938,7 +1065,7 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
     const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }
@@ -953,7 +1080,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_SYCL_F16
@@ -967,7 +1093,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
     if (src1_convert_f16) {
         src1_dfloat = src1_dfloat_a.alloc(ne00);
-        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
         GGML_ASSERT(to_fp16_sycl != nullptr);
         to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
     }
@@ -977,7 +1103,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            } else {
+                dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            }
             break;
         case GGML_TYPE_Q4_1:
             dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
@@ -1012,7 +1143,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
         default:
             printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
             GGML_ABORT("fatal error");
-            break;
     }
     GGML_UNUSED(src1);
@@ -1020,4 +1150,5 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
     GGML_UNUSED(src1_ddq_i);
     GGML_UNUSED(src1_ncols);
     GGML_UNUSED(src1_padded_row_size);
+    GGML_UNUSED(ctx);
 }