mlx-cpu 0.30.1 (mlx_cpu-0.30.1-py3-none-manylinux_2_35_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. mlx/__main__.py +27 -0
  2. mlx/_reprlib_fix.py +16 -0
  3. mlx/extension.py +88 -0
  4. mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
  5. mlx/include/mlx/allocator.h +73 -0
  6. mlx/include/mlx/array.h +645 -0
  7. mlx/include/mlx/backend/common/binary.h +97 -0
  8. mlx/include/mlx/backend/common/broadcasting.h +11 -0
  9. mlx/include/mlx/backend/common/buffer_cache.h +157 -0
  10. mlx/include/mlx/backend/common/compiled.h +77 -0
  11. mlx/include/mlx/backend/common/copy.h +50 -0
  12. mlx/include/mlx/backend/common/hadamard.h +109 -0
  13. mlx/include/mlx/backend/common/matmul.h +67 -0
  14. mlx/include/mlx/backend/common/reduce.h +59 -0
  15. mlx/include/mlx/backend/common/slicing.h +20 -0
  16. mlx/include/mlx/backend/common/ternary.h +85 -0
  17. mlx/include/mlx/backend/common/unary.h +29 -0
  18. mlx/include/mlx/backend/common/utils.h +205 -0
  19. mlx/include/mlx/backend/cpu/arange.h +28 -0
  20. mlx/include/mlx/backend/cpu/available.h +9 -0
  21. mlx/include/mlx/backend/cpu/binary.h +517 -0
  22. mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
  23. mlx/include/mlx/backend/cpu/binary_two.h +166 -0
  24. mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
  25. mlx/include/mlx/backend/cpu/copy.h +36 -0
  26. mlx/include/mlx/backend/cpu/encoder.h +67 -0
  27. mlx/include/mlx/backend/cpu/eval.h +12 -0
  28. mlx/include/mlx/backend/cpu/gemm.h +26 -0
  29. mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
  30. mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
  31. mlx/include/mlx/backend/cpu/lapack.h +80 -0
  32. mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
  33. mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
  34. mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
  35. mlx/include/mlx/backend/cpu/simd/math.h +193 -0
  36. mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
  37. mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
  38. mlx/include/mlx/backend/cpu/simd/type.h +11 -0
  39. mlx/include/mlx/backend/cpu/slicing.h +21 -0
  40. mlx/include/mlx/backend/cpu/ternary.h +154 -0
  41. mlx/include/mlx/backend/cpu/threefry.h +21 -0
  42. mlx/include/mlx/backend/cpu/unary.h +281 -0
  43. mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
  44. mlx/include/mlx/backend/cuda/allocator.h +89 -0
  45. mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
  46. mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
  47. mlx/include/mlx/backend/cuda/cuda.h +10 -0
  48. mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
  49. mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
  50. mlx/include/mlx/backend/cuda/device/config.h +12 -0
  51. mlx/include/mlx/backend/cuda/device.h +189 -0
  52. mlx/include/mlx/backend/cuda/event.h +78 -0
  53. mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
  54. mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
  55. mlx/include/mlx/backend/cuda/jit_module.h +119 -0
  56. mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
  57. mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
  58. mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
  59. mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
  60. mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
  61. mlx/include/mlx/backend/cuda/utils.h +46 -0
  62. mlx/include/mlx/backend/cuda/worker.h +55 -0
  63. mlx/include/mlx/backend/gpu/available.h +9 -0
  64. mlx/include/mlx/backend/gpu/copy.h +57 -0
  65. mlx/include/mlx/backend/gpu/eval.h +18 -0
  66. mlx/include/mlx/backend/gpu/slicing.h +36 -0
  67. mlx/include/mlx/backend/metal/allocator.h +79 -0
  68. mlx/include/mlx/backend/metal/binary.h +33 -0
  69. mlx/include/mlx/backend/metal/device.h +283 -0
  70. mlx/include/mlx/backend/metal/jit/includes.h +57 -0
  71. mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
  72. mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
  73. mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
  74. mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
  75. mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
  76. mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
  77. mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
  78. mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
  79. mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
  80. mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
  81. mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
  82. mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
  83. mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
  84. mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
  85. mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
  86. mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
  87. mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
  88. mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
  89. mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
  90. mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
  91. mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
  92. mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
  93. mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
  94. mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
  95. mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
  96. mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
  97. mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
  98. mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
  99. mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
  100. mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
  101. mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
  102. mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
  103. mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
  104. mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
  105. mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
  106. mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
  107. mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
  108. mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
  109. mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
  110. mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
  111. mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
  112. mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
  113. mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
  114. mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
  115. mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
  116. mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
  117. mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
  118. mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
  119. mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
  120. mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
  121. mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
  122. mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
  123. mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
  124. mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
  125. mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
  126. mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
  127. mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
  128. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
  129. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
  130. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
  131. mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
  132. mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
  133. mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
  134. mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
  135. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
  136. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
  137. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
  138. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
  139. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
  140. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
  141. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
  142. mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
  143. mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
  144. mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
  145. mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
  146. mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
  147. mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
  148. mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
  149. mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
  150. mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
  151. mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
  152. mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
  153. mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
  154. mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
  155. mlx/include/mlx/backend/metal/matmul.h +144 -0
  156. mlx/include/mlx/backend/metal/metal.h +22 -0
  157. mlx/include/mlx/backend/metal/reduce.h +41 -0
  158. mlx/include/mlx/backend/metal/resident.h +32 -0
  159. mlx/include/mlx/backend/metal/scan.h +17 -0
  160. mlx/include/mlx/backend/metal/ternary.h +21 -0
  161. mlx/include/mlx/backend/metal/unary.h +21 -0
  162. mlx/include/mlx/backend/metal/utils.h +84 -0
  163. mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
  164. mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
  165. mlx/include/mlx/compile.h +44 -0
  166. mlx/include/mlx/compile_impl.h +69 -0
  167. mlx/include/mlx/device.h +31 -0
  168. mlx/include/mlx/distributed/distributed.h +60 -0
  169. mlx/include/mlx/distributed/distributed_impl.h +59 -0
  170. mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
  171. mlx/include/mlx/distributed/mpi/mpi.h +12 -0
  172. mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
  173. mlx/include/mlx/distributed/nccl/nccl.h +12 -0
  174. mlx/include/mlx/distributed/ops.h +56 -0
  175. mlx/include/mlx/distributed/primitives.h +156 -0
  176. mlx/include/mlx/distributed/reduction_ops.h +38 -0
  177. mlx/include/mlx/distributed/ring/ring.h +12 -0
  178. mlx/include/mlx/distributed/utils.h +67 -0
  179. mlx/include/mlx/dtype.h +115 -0
  180. mlx/include/mlx/dtype_utils.h +119 -0
  181. mlx/include/mlx/einsum.h +22 -0
  182. mlx/include/mlx/event.h +58 -0
  183. mlx/include/mlx/export.h +136 -0
  184. mlx/include/mlx/export_impl.h +98 -0
  185. mlx/include/mlx/fast.h +102 -0
  186. mlx/include/mlx/fast_primitives.h +427 -0
  187. mlx/include/mlx/fence.h +39 -0
  188. mlx/include/mlx/fft.h +167 -0
  189. mlx/include/mlx/graph_utils.h +66 -0
  190. mlx/include/mlx/io/gguf.h +20 -0
  191. mlx/include/mlx/io/load.h +175 -0
  192. mlx/include/mlx/io.h +61 -0
  193. mlx/include/mlx/linalg.h +111 -0
  194. mlx/include/mlx/memory.h +78 -0
  195. mlx/include/mlx/mlx.h +25 -0
  196. mlx/include/mlx/ops.h +1627 -0
  197. mlx/include/mlx/primitives.h +2524 -0
  198. mlx/include/mlx/random.h +282 -0
  199. mlx/include/mlx/scheduler.h +188 -0
  200. mlx/include/mlx/small_vector.h +540 -0
  201. mlx/include/mlx/stream.h +41 -0
  202. mlx/include/mlx/threadpool.h +133 -0
  203. mlx/include/mlx/transforms.h +229 -0
  204. mlx/include/mlx/transforms_impl.h +86 -0
  205. mlx/include/mlx/types/bf16.h +187 -0
  206. mlx/include/mlx/types/complex.h +113 -0
  207. mlx/include/mlx/types/fp16.h +234 -0
  208. mlx/include/mlx/types/half_types.h +58 -0
  209. mlx/include/mlx/types/limits.h +70 -0
  210. mlx/include/mlx/utils.h +175 -0
  211. mlx/include/mlx/version.h +20 -0
  212. mlx/lib/libmlx.so +0 -0
  213. mlx/py.typed +1 -0
  214. mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
  215. mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
  216. mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
  217. mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
  218. mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
  219. mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
  220. mlx/share/cmake/MLX/extension.cmake +50 -0
  221. mlx/utils.py +325 -0
  222. mlx_cpu-0.30.1.dist-info/METADATA +142 -0
  223. mlx_cpu-0.30.1.dist-info/RECORD +231 -0
  224. mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
  225. mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
  226. mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
  227. mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
  228. mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
  229. mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
  230. mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
  231. mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0
mlx/include/mlx/backend/metal/kernels/gemv_masked.h
@@ -0,0 +1,827 @@
+ // Copyright © 2023-2024 Apple Inc.
+
+ #include "mlx/backend/metal/kernels/steel/utils.h"
+
+ using namespace metal;
+
+ #define MLX_MTL_CONST static constant constexpr const
+ #define MLX_MTL_PRAGMA_UNROLL _Pragma("clang loop unroll(full)")
+
+ struct _NoMask {
+   char x;
+
+   constexpr METAL_FUNC operator bool() {
+     return true;
+   }
+   constexpr METAL_FUNC operator bool() const threadgroup {
+     return true;
+   }
+   constexpr METAL_FUNC operator bool() const device {
+     return true;
+   }
+   constexpr METAL_FUNC operator bool() const constant {
+     return true;
+   }
+ };
+
+ typedef struct _NoMask nomask_t;
+
+ template <typename OutT, typename InT = OutT>
+ struct ScaleOp {
+   OutT scale;
+
+   METAL_FUNC OutT apply(InT x) const {
+     return static_cast<OutT>(x) * scale;
+   }
+ };
+
+ template <
+     typename T,
+     typename out_mask_t,
+     typename op_mask_t,
+     const int BM, /* Threadgroup rows (in simdgroups) */
+     const int BN, /* Threadgroup cols (in simdgroups) */
+     const int SM, /* Simdgroup rows (in threads) */
+     const int SN, /* Simdgroup cols (in threads) */
+     const int TM, /* Thread rows (in elements) */
+     const int TN, /* Thread cols (in elements) */
+     typename AccT = float>
+ struct GEMVKernel {
+   MLX_MTL_CONST int threadsM = BM * SM;
+   MLX_MTL_CONST int threadsN = BN * SN;
+
+   MLX_MTL_CONST int blockM = threadsM * TM;
+   MLX_MTL_CONST int blockN = threadsN * TN;
+
+   static_assert(SM * SN == 32, "simdgroup can only have 32 threads");
+
+   static_assert(
+       SN == 8 || SN == 16 || SN == 32,
+       "gemv block must have a width of 8, 16, or 32");
+
+   static_assert(blockN >= blockM, "Masked gemv must have blockN >= blockM");
+
+   MLX_MTL_CONST bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
+   MLX_MTL_CONST bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;
+
+   MLX_MTL_CONST bool has_mul_operand_mask =
+       has_operand_mask && !metal::is_same_v<op_mask_t, bool>;
+   MLX_MTL_CONST bool has_mul_output_mask =
+       has_output_mask && !metal::is_same_v<out_mask_t, bool>;
+
+   // - The matrix of size (M = out_vec_size, K = in_vec_size) is divided up
+   //   into blocks of (blockM, blockN) divided among threadgroups
+   // - Every thread works on a block of (TM, TN)
+   // - We assume each threadgroup has (threadsN, threadsM, 1) threads
+   //
+   // 1. A thread loads TN elements each from mat along TM rows
+   //    and the corresponding scalar from the vector
+   // 2. The thread then multiplies and adds to accumulate its local result for
+   //    the block
+   // 3. At the end, each thread has accumulated results over all blocks across
+   //    the rows. These are then summed up across the threadgroup
+   // 4. Each threadgroup writes its accumulated blockM outputs
+   //
+   // Edge case handling:
+   // - The threadgroup with the largest tid has blocks that exceed the matrix
+   //   * The blocks that start outside the matrix are never read (thread results
+   //     remain zero)
+   //   * The last thread that partially overlaps with the matrix is shifted
+   //     inwards such that the thread block fits exactly in the matrix
+
+   MLX_MTL_CONST short tgp_mem_size = BN > 1 ? BN*(blockM + TM) : 0;
+   MLX_MTL_CONST bool needs_tgp_reduction = BN > 1;
+
+   template <typename U = T>
+   static METAL_FUNC void
+   load_unsafe(const device T* src, thread U dst[TN], const int src_offset = 0) {
+     MLX_MTL_PRAGMA_UNROLL
+     for (int tn = 0; tn < TN; tn++) {
+       dst[tn] = static_cast<U>(src[src_offset + tn]);
+     }
+   }
+
+   template <typename U = T>
+   static METAL_FUNC void load_safe(
+       const device T* src,
+       thread U dst[TN],
+       const int src_offset = 0,
+       const int src_size = TN) {
+     if (src_offset + TN <= src_size) {
+       MLX_MTL_PRAGMA_UNROLL
+       for (int tn = 0; tn < TN; tn++) {
+         dst[tn] = static_cast<U>(src[src_offset + tn]);
+       }
+     } else { // Edgecase
+       MLX_MTL_PRAGMA_UNROLL
+       for (int tn = 0; tn < TN; tn++) {
+         dst[tn] = src_offset + tn < src_size
+             ? static_cast<U>(src[src_offset + tn])
+             : U(0);
+       }
+     }
+   }
+
+   static METAL_FUNC void run(
+       const device T* mat [[buffer(0)]],
+       const device T* in_vec [[buffer(1)]],
+       device T* out_vec [[buffer(3)]],
+       const constant int& in_vec_size [[buffer(4)]],
+       const constant int& out_vec_size [[buffer(5)]],
+       const constant int& matrix_ld [[buffer(6)]],
+       const device out_mask_t* out_mask [[buffer(20)]],
+       const device op_mask_t* mat_mask [[buffer(21)]],
+       const device op_mask_t* vec_mask [[buffer(22)]],
+       const constant int* mask_strides [[buffer(23)]],
+       threadgroup AccT* tgp_memory [[threadgroup(0)]],
+       uint3 tid [[threadgroup_position_in_grid]],
+       uint3 lid [[thread_position_in_threadgroup]],
+       uint simd_gid [[simdgroup_index_in_threadgroup]],
+       uint simd_lid [[thread_index_in_simdgroup]]) {
+     // Appease compiler
+     (void)lid;
+
+     // Thread local accumulation results
+     thread AccT result[TM] = {0};
+     thread T inter[TN];
+     thread AccT v_coeff[TN];
+
+     const int thrM = SN != 32 ? simd_lid / SN : 0;
+     const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);
+
+     const int sgN = BN != 1 ? (simd_gid % BN) : 0;
+
+     const int simdM = BN != 1 ? SM * (simd_gid / BN) : int(SM * simd_gid);
+     const int simdN = BN != 1 ? SN * (simd_gid % BN) : 0;
+
+     int bm = (simdM + thrM) * TM;
+     int bn = (simdN + thrN) * TN;
+
+     // Block position
+     int out_row = tid.x * blockM + bm;
+
+     // Exit simdgroup if rows out of bound
+     if (out_row >= out_vec_size)
+       return;
+
+     // Adjust tail simdgroup to ensure in bound reads
+     out_row = out_row + TM <= out_vec_size ? out_row : out_vec_size - TM;
+
+     // Prepare mask offsets
+     const constant int* out_mask_strides = mask_strides;
+     const constant int* mat_mask_strides =
+         mask_strides + (has_output_mask ? 2 : 0);
+     const constant int* vec_mask_strides =
+         mat_mask_strides + (has_operand_mask ? 2 : 0);
+
+     const int m_block_idx = blockN > blockM ? out_row / blockN : int(tid.x);
+
+     const int out_mask_offset =
+         !has_output_mask ? 0 : m_block_idx * out_mask_strides[1];
+
+     int mat_mask_offset =
+         !has_operand_mask ? 0 : m_block_idx * mat_mask_strides[1];
+     int vec_mask_offset = 0;
+     const int mat_mask_step = !has_operand_mask ? 0 : mat_mask_strides[0];
+     const int vec_mask_step = !has_operand_mask ? 0 : vec_mask_strides[1];
+
+     T out_scale{1};
+
+     // Check output mask
+     if (has_output_mask) {
+       auto mask_out = out_mask[out_mask_offset];
+
+       // Write zeros and return if mask is 0
+       if (!mask_out) {
+         if (simdN == 0 && thrN == 0) {
+           MLX_MTL_PRAGMA_UNROLL
+           for (int tm = 0; tm < TM; tm++) {
+             out_vec[out_row + tm] = T(0.);
+           }
+         }
+
+         return;
+       }
+
+       // Store scalar if multiplicative mask
+       if (has_mul_output_mask) {
+         out_scale = T(mask_out);
+       }
+     }
+
+     // Advance matrix
+     mat += out_row * matrix_ld;
+
+     // Prepare for loop
+     constexpr const uniform<int> loop_stride = make_uniform(blockN);
+     const uniform<int> in_size = make_uniform(in_vec_size);
+     const uniform<int> n_iter = in_size / loop_stride;
+     const uniform<int> last_iter = loop_stride * n_iter;
+     const uniform<int> leftover = in_size - last_iter;
+
+     // Loop over in_vec in blocks of blockN
+     for (int i = 0; i < n_iter; ++i) {
+       if (!has_operand_mask ||
+           (bool(mat_mask[mat_mask_offset]) &&
+            bool(vec_mask[vec_mask_offset]))) {
+         T block_scale{1};
+         if (has_mul_operand_mask) {
+           block_scale =
+               T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
+         }
+
+         load_unsafe<AccT>(in_vec, v_coeff, bn);
+
+         // Apply scale
+         if (has_mul_operand_mask) {
+           MLX_MTL_PRAGMA_UNROLL
+           for (int tn = 0; tn < TN; tn++) {
+             v_coeff[tn] *= block_scale;
+           }
+         }
+
+         // Per thread work loop
+         int mat_offset = 0;
+         MLX_MTL_PRAGMA_UNROLL
+         for (int tm = 0; tm < TM; tm++) {
+           // Load for the row
+           load_unsafe(mat, inter, mat_offset + bn);
+
+           // Accumulate results
+           MLX_MTL_PRAGMA_UNROLL
+           for (int tn = 0; tn < TN; tn++) {
+             result[tm] += inter[tn] * v_coeff[tn];
+           }
+
+           mat_offset += matrix_ld;
+         }
+       }
+
+       bn += blockN;
+       mat_mask_offset += mat_mask_step;
+       vec_mask_offset += vec_mask_step;
+     }
+
+     if (leftover > 0) {
+       if (!has_operand_mask ||
+           (bool(mat_mask[mat_mask_offset]) &&
+            bool(vec_mask[vec_mask_offset]))) {
+         T block_scale{1};
+         if (has_mul_operand_mask) {
+           block_scale =
+               T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
+         }
+
+         load_safe<AccT>(in_vec, v_coeff, bn, in_size);
+
+         // Apply scale
+         if (has_mul_operand_mask) {
+           MLX_MTL_PRAGMA_UNROLL
+           for (int tn = 0; tn < TN; tn++) {
+             v_coeff[tn] *= block_scale;
+           }
+         }
+
+         // Per thread work loop
+         MLX_MTL_PRAGMA_UNROLL
+         for (int tm = 0; tm < TM; tm++) {
+           // Load for the row
+           load_safe(&mat[tm * matrix_ld], inter, bn, in_size);
+
+           // Accumulate results
+           MLX_MTL_PRAGMA_UNROLL
+           for (int tn = 0; tn < TN; tn++) {
+             result[tm] += inter[tn] * v_coeff[tn];
+           }
+         }
+       }
+     }
+
+     // Apply out scale
+     if (has_mul_output_mask) {
+       MLX_MTL_PRAGMA_UNROLL
+       for (int tm = 0; tm < TM; tm++) {
+         result[tm] *= out_scale;
+       }
+     }
+
+     // Simdgroup accumulations
+     MLX_MTL_PRAGMA_UNROLL
+     for (int tm = 0; tm < TM; tm++) {
+       MLX_MTL_PRAGMA_UNROLL
+       for (ushort sn = (SN / 2); sn >= 1; sn >>= 1) {
+         result[tm] += simd_shuffle_down(result[tm], sn);
+       }
+     }
+
+     // Threadgroup accumulation results
+     if (needs_tgp_reduction) {
+       threadgroup AccT* tgp_results = tgp_memory + sgN * (blockM + TM) + bm;
+       if (thrN == 0) {
+         MLX_MTL_PRAGMA_UNROLL
+         for (int tm = 0; tm < TM; tm++) {
+           tgp_results[tm] = result[tm];
+         }
+
+         threadgroup_barrier(mem_flags::mem_none);
+
+         if (sgN == 0) {
+           MLX_MTL_PRAGMA_UNROLL
+           for (int sgn = 1; sgn < BN; sgn++) {
+             MLX_MTL_PRAGMA_UNROLL
+             for (int tm = 0; tm < TM; tm++) {
+               result[tm] += tgp_results[sgn * (blockM + TM) + tm];
+             }
+           }
+         }
+       }
+     }
+
+     // Write outputs
+     if (simdN == 0 && thrN == 0) {
+       MLX_MTL_PRAGMA_UNROLL
+       for (int tm = 0; tm < TM; tm++) {
+         out_vec[out_row + tm] = static_cast<T>(result[tm]);
+       }
+     }
+   }
+ };
+
+ ///////////////////////////////////////////////////////////////////////////////
+ /// Vector matrix multiplication
+ ///////////////////////////////////////////////////////////////////////////////
+
+ template <
+     typename T,
+     typename out_mask_t,
+     typename op_mask_t,
+     const int BM, /* Threadgroup rows (in simdgroups) */
+     const int BN, /* Threadgroup cols (in simdgroups) */
+     const int SM, /* Simdgroup rows (in threads) */
+     const int SN, /* Simdgroup cols (in threads) */
+     const int TM, /* Thread rows (in elements) */
+     const int TN, /* Thread cols (in elements) */
+     typename AccT = float>
+ struct GEMVTKernel {
+   MLX_MTL_CONST int threadsM = BM * SM;
+   MLX_MTL_CONST int threadsN = BN * SN;
+
+   MLX_MTL_CONST int blockM = threadsM * TM;
+   MLX_MTL_CONST int blockN = threadsN * TN;
+
+   static_assert(SM * SN == 32, "simdgroup can only have 32 threads");
+
+   MLX_MTL_CONST bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
+   MLX_MTL_CONST bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;
+
+   MLX_MTL_CONST bool has_mul_operand_mask =
+       has_operand_mask && !metal::is_same_v<op_mask_t, bool>;
+   MLX_MTL_CONST bool has_mul_output_mask =
+       has_output_mask && !metal::is_same_v<out_mask_t, bool>;
+
+   // - The matrix of size (M = in_vec_size, N = out_vec_size) is divided up
+   //   into blocks of (blockM, blockN) divided among threadgroups
+   // - Every thread works on a block of (TM, TN)
+   // - We assume each threadgroup has (threadsN, threadsM, 1) threads
+   //
+   // 1. A thread loads TN elements each from mat along TM contiguous rows
+   //    and the corresponding scalar from the vector
+   // 2. The thread then accumulates its local result for the block
+   // 3. At the end, each thread has accumulated results over all blocks across
+   //    the rows. These are then summed up across the threadgroup
+   // 4. Each threadgroup writes its accumulated BN * TN outputs
+   //
+   // Edge case handling:
+   // - The threadgroup with the largest tid has blocks that exceed the matrix
+   //   * The blocks that start outside the matrix are never read (thread results
+   //     remain zero)
+   //   * The last thread that partially overlaps with the matrix is shifted
+   //     inwards such that the thread block fits exactly in the matrix
+
+   MLX_MTL_CONST short tgp_mem_size = BM > 1 ? BM*(blockN + TN) : 0;
+   MLX_MTL_CONST bool needs_tgp_reduction = BM > 1;
+
+   static METAL_FUNC void run(
+       const device T* mat [[buffer(0)]],
+       const device T* in_vec [[buffer(1)]],
+       device T* out_vec [[buffer(3)]],
+       const constant int& in_vec_size [[buffer(4)]],
+       const constant int& out_vec_size [[buffer(5)]],
+       const constant int& marix_ld [[buffer(6)]],
+       const device out_mask_t* out_mask [[buffer(20)]],
+       const device op_mask_t* mat_mask [[buffer(21)]],
+       const device op_mask_t* vec_mask [[buffer(22)]],
+       const constant int* mask_strides [[buffer(23)]],
+       threadgroup AccT* tgp_memory [[threadgroup(0)]],
+       uint3 tid [[threadgroup_position_in_grid]],
+       uint3 lid [[thread_position_in_threadgroup]],
+       uint simd_gid [[simdgroup_index_in_threadgroup]],
+       uint simd_lid [[thread_index_in_simdgroup]]) {
+     // Appease compiler
+     (void)lid;
+
+     // Thread local accumulation results
+     AccT result[TN] = {0};
+     T inter[TN];
+     AccT v_coeff[TM];
+
+     const int thrM = SN != 32 ? simd_lid / SN : 0;
+     const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);
+
+     const int sgM = BN != 1 ? (simd_gid / BN) : int(simd_gid);
+     const int sgN = BN != 1 ? (simd_gid % BN) : 0;
+
+     const int simdM = SM * sgM;
+     const int simdN = SN * sgN;
+
+     int cm = (simdM + thrM);
+     int cn = (simdN + thrN);
+
+     int bm = cm * TM;
+     int bn = cn * TN;
+
+     int out_col = tid.x * blockN + bn;
+
+     // Prepare mask offsets
+     const constant int* out_mask_strides = mask_strides;
+     const constant int* mat_mask_strides =
+         out_mask_strides + (has_output_mask ? 2 : 0);
+     const constant int* vec_mask_strides =
+         mat_mask_strides + (has_operand_mask ? 2 : 0);
+
+     const int n_block_idx = blockM > blockN ? out_col / blockM : int(tid.x);
+
+     const int out_mask_offset =
+         !has_output_mask ? 0 : n_block_idx; // * out_mask_strides[0];
+
+     int mat_mask_offset =
+         !has_operand_mask ? 0 : n_block_idx * mat_mask_strides[0];
+     int vec_mask_offset = 0;
+     const int mat_mask_step = !has_operand_mask ? 0 : mat_mask_strides[1];
+     const int vec_mask_step = !has_operand_mask ? 0 : vec_mask_strides[0];
+
+     T out_scale{1};
+
+     // Check output mask
+     if (has_output_mask) {
+       auto mask_out = out_mask[out_mask_offset];
+
+       // Write zeros and return if mask is 0
+       if (!mask_out) {
+         if (cm == 0 && out_col < out_vec_size) {
+           if (out_col + TN <= out_vec_size) {
+             MLX_MTL_PRAGMA_UNROLL
+             for (int tn = 0; tn < TN; tn++) {
+               out_vec[out_col + tn] = T(0.);
+             }
+           } else {
+             for (int tn = 0; tn < TN && (out_col + tn) < out_vec_size; tn++) {
+               out_vec[out_col + tn] = T(0.);
+             }
+           }
+         }
+
+         return;
+       }
+
+       // Store scalar if multiplicative mask
+       if (has_mul_output_mask) {
+         out_scale = T(mask_out);
+       }
+     }
+
+     // Prepare for loop
+     constexpr const uniform<int> loop_stride = make_uniform(blockM);
+     const uniform<int> in_size = make_uniform(in_vec_size);
+     const uniform<int> n_iter = in_size / loop_stride;
+     const uniform<int> last_iter = loop_stride * n_iter;
+     const uniform<int> leftover = in_size - last_iter;
+
+     // Edgecase handling
+     if (out_col < out_vec_size) {
+       out_col = (out_col + TN) <= out_vec_size ? out_col : out_vec_size - TN;
+
+       // Per thread accumulation main loop
+       for (int i = 0; i < n_iter; ++i) {
+         // Adding a threadgroup_barrier improves performance slightly
+         // This is possibly it may help exploit cache better
+         threadgroup_barrier(mem_flags::mem_none);
+
+         if (!has_operand_mask ||
+             (bool(mat_mask[mat_mask_offset]) &&
+              bool(vec_mask[vec_mask_offset]))) {
+           T block_scale{1};
+           if (has_mul_operand_mask) {
+             block_scale =
+                 T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
+           }
+
+           MLX_MTL_PRAGMA_UNROLL
+           for (int tm = 0; tm < TM; tm++) {
+             v_coeff[tm] = static_cast<AccT>(in_vec[bm + tm]);
+           }
+
+           // Apply scale
+           if (has_mul_operand_mask) {
+             MLX_MTL_PRAGMA_UNROLL
+             for (int tm = 0; tm < TM; tm++) {
+               v_coeff[tm] *= block_scale;
+             }
+           }
+
+           MLX_MTL_PRAGMA_UNROLL
+           for (int tm = 0; tm < TM; tm++) {
+             for (int tn = 0; tn < TN; tn++) {
+               inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
+             }
+             for (int tn = 0; tn < TN; tn++) {
+               result[tn] += v_coeff[tm] * inter[tn];
+             }
+           }
+         }
+
+         bm += blockM;
+         mat_mask_offset += mat_mask_step;
+         vec_mask_offset += vec_mask_step;
+       }
+
+       if (leftover > 0) {
+         if (!has_operand_mask ||
+             (bool(mat_mask[mat_mask_offset]) &&
+              bool(vec_mask[vec_mask_offset]))) {
+           T block_scale{1};
+           if (has_mul_operand_mask) {
+             block_scale =
+                 T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
+           }
+
+           for (int tm = 0; tm < TM && bm + tm < in_vec_size; tm++) {
+             v_coeff[tm] = static_cast<AccT>(in_vec[bm + tm]);
+
+             if (has_mul_operand_mask) {
+               v_coeff[tm] *= block_scale;
+             }
+
+             MLX_MTL_PRAGMA_UNROLL
+             for (int tn = 0; tn < TN; tn++) {
+               inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
+             }
+
+             MLX_MTL_PRAGMA_UNROLL
+             for (int tn = 0; tn < TN; tn++) {
+               result[tn] += v_coeff[tm] * inter[tn];
+             }
+           }
+         }
+       }
+     }
+
+     // Apply out scale
+     if (has_mul_output_mask) {
+       MLX_MTL_PRAGMA_UNROLL
+       for (int tn = 0; tn < TN; tn++) {
+         result[tn] *= out_scale;
+       }
+     }
+
+     // Simdgroup accumulations
+     MLX_MTL_PRAGMA_UNROLL
+     for (int tn = 0; tn < TN; tn++) {
+       MLX_MTL_PRAGMA_UNROLL
+       for (ushort sm = (SM / 2); sm >= 1; sm >>= 1) {
+         result[tn] += simd_shuffle_down(result[tn], SN * sm);
+       }
+     }
+
+     // Threadgroup accumulation results
+     if (needs_tgp_reduction) {
+       threadgroup AccT* tgp_results = tgp_memory + sgM * (blockN + TN) + bn;
+       if (thrM == 0) {
+         MLX_MTL_PRAGMA_UNROLL
+         for (int tn = 0; tn < TN; tn++) {
+           tgp_results[tn] = result[tn];
+         }
+
+         threadgroup_barrier(mem_flags::mem_none);
+
+         if (sgM == 0) {
+           MLX_MTL_PRAGMA_UNROLL
+           for (int sgm = 1; sgm < BM; sgm++) {
+             MLX_MTL_PRAGMA_UNROLL
+             for (int tn = 0; tn < TN; tn++) {
+               result[tn] += tgp_results[sgm * (blockN + TN) + tn];
+             }
+           }
+         }
+       }
+     }
+
+     // Threadgroup accumulation and writing out results
+     if (cm == 0 && out_col < out_vec_size) {
+       MLX_MTL_PRAGMA_UNROLL
+       for (int j = 0; j < TN; j++) {
+         out_vec[out_col + j] = static_cast<T>(result[j]);
+       }
+     }
+   }
+ };
+
+ ///////////////////////////////////////////////////////////////////////////////
+ /// Matrix vector multiplication
+ ///////////////////////////////////////////////////////////////////////////////
+
+ template <
+     typename T,
+     typename out_mask_t,
+     typename op_mask_t,
+     const int BM, /* Threadgroup rows (in simdgroups) */
+     const int BN, /* Threadgroup cols (in simdgroups) */
+     const int SM, /* Simdgroup rows (in threads) */
+     const int SN, /* Simdgroup cols (in threads) */
+     const int TM, /* Thread rows (in elements) */
+     const int TN, /* Thread cols (in elements) */
+     const bool kDoNCBatch> /* Batch ndim > 1 */
+ [[kernel, max_total_threads_per_threadgroup(BM* BN * 32)]] void gemv_masked(
+     const device T* mat [[buffer(0)]],
+     const device T* in_vec [[buffer(1)]],
+     device T* out_vec [[buffer(3)]],
+     const constant int& in_vec_size [[buffer(4)]],
+     const constant int& out_vec_size [[buffer(5)]],
+     const constant int& marix_ld [[buffer(6)]],
+     const constant int& batch_ndim [[buffer(9)]],
+     const constant int* batch_shape [[buffer(10)]],
+     const constant int64_t* vector_batch_stride [[buffer(11)]],
+     const constant int64_t* matrix_batch_stride [[buffer(12)]],
+     const device out_mask_t* out_mask [[buffer(20)]],
+     const device op_mask_t* mat_mask [[buffer(21)]],
+     const device op_mask_t* vec_mask [[buffer(22)]],
+     const constant int* mask_strides [[buffer(23)]],
+     const constant int64_t* mask_batch_strides [[buffer(24)]],
+     uint3 tid [[threadgroup_position_in_grid]],
+     uint3 lid [[thread_position_in_threadgroup]],
+     uint simd_gid [[simdgroup_index_in_threadgroup]],
+     uint simd_lid [[thread_index_in_simdgroup]]) {
+   using gemv_kernel =
+       GEMVKernel<T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN>;
+   threadgroup float tgp_memory
+       [gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];
+
+   constexpr bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
+   constexpr bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;
+
+   // Update batch offsets
+   if (kDoNCBatch) {
+     in_vec += elem_to_loc(tid.z, batch_shape, vector_batch_stride, batch_ndim);
+     mat += elem_to_loc(tid.z, batch_shape, matrix_batch_stride, batch_ndim);
+
+     if (has_output_mask) {
+       out_mask +=
+           elem_to_loc(tid.z, batch_shape, mask_batch_strides, batch_ndim);
+       mask_batch_strides += batch_ndim;
+     }
+
+     if (has_operand_mask) {
+       const constant auto* mask_strides_mat = mask_batch_strides;
+       const constant auto* mask_strides_vec = mask_strides_mat + batch_ndim;
+
+       ulong2 batch_offsets = elem_to_loc_broadcast(
+           tid.z, batch_shape, mask_strides_mat, mask_strides_vec, batch_ndim);
+
+       mat_mask += batch_offsets.x;
+       vec_mask += batch_offsets.y;
+     }
+
+   } else {
+     in_vec += tid.z * vector_batch_stride[0];
+     mat += tid.z * matrix_batch_stride[0];
+
+     if (has_output_mask) {
+       out_mask += tid.z * mask_batch_strides[0];
+       mask_batch_strides += batch_ndim;
+     }
+
+     if (has_operand_mask) {
+       mat_mask += tid.z * mask_batch_strides[0];
+       vec_mask += tid.z * mask_batch_strides[batch_ndim];
+     }
+   }
+
+   out_vec += tid.z * out_vec_size;
+
+   gemv_kernel::run(
+       mat,
+       in_vec,
+       out_vec,
+       in_vec_size,
+       out_vec_size,
+       marix_ld,
+       out_mask,
+       mat_mask,
+       vec_mask,
+       mask_strides,
+       gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
+       tid,
+       lid,
+       simd_gid,
+       simd_lid);
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////
+ /// Vector matrix multiplication
+ ///////////////////////////////////////////////////////////////////////////////
+
+ template <
+     typename T,
+     typename out_mask_t,
+     typename op_mask_t,
+     const int BM, /* Threadgroup rows (in simdgroups) */
+     const int BN, /* Threadgroup cols (in simdgroups) */
+     const int SM, /* Simdgroup rows (in threads) */
+     const int SN, /* Simdgroup cols (in threads) */
+     const int TM, /* Thread rows (in elements) */
+     const int TN, /* Thread cols (in elements) */
+     const bool kDoNCBatch> /* Batch ndim > 1 */
+ [[kernel, max_total_threads_per_threadgroup(BM* BN * 32)]] void gemv_t_masked(
+     const device T* mat [[buffer(0)]],
+     const device T* in_vec [[buffer(1)]],
+     device T* out_vec [[buffer(3)]],
+     const constant int& in_vec_size [[buffer(4)]],
+     const constant int& out_vec_size [[buffer(5)]],
+     const constant int& marix_ld [[buffer(6)]],
+     const constant int& batch_ndim [[buffer(9)]],
+     const constant int* batch_shape [[buffer(10)]],
+     const constant int64_t* vector_batch_stride [[buffer(11)]],
+     const constant int64_t* matrix_batch_stride [[buffer(12)]],
+     const device out_mask_t* out_mask [[buffer(20)]],
+     const device op_mask_t* mat_mask [[buffer(21)]],
+     const device op_mask_t* vec_mask [[buffer(22)]],
+     const constant int* mask_strides [[buffer(23)]],
+     const constant int64_t* mask_batch_strides [[buffer(24)]],
+     uint3 tid [[threadgroup_position_in_grid]],
+     uint3 lid [[thread_position_in_threadgroup]],
+     uint simd_gid [[simdgroup_index_in_threadgroup]],
+     uint simd_lid [[thread_index_in_simdgroup]]) {
+   using gemv_kernel =
+       GEMVTKernel<T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN>;
+   threadgroup float tgp_memory
+       [gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];
+
+   constexpr bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
+   constexpr bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;
+
+   // Update batch offsets
+   if (kDoNCBatch) {
+     in_vec += elem_to_loc(tid.z, batch_shape, vector_batch_stride, batch_ndim);
+     mat += elem_to_loc(tid.z, batch_shape, matrix_batch_stride, batch_ndim);
+
+     if (has_output_mask) {
+       out_mask +=
+           elem_to_loc(tid.z, batch_shape, mask_batch_strides, batch_ndim);
+       mask_batch_strides += batch_ndim;
+     }
+
+     if (has_operand_mask) {
+       const constant auto* mask_strides_mat = mask_batch_strides;
+       const constant auto* mask_strides_vec = mask_strides_mat + batch_ndim;
+
+       ulong2 batch_offsets = elem_to_loc_broadcast(
+           tid.z, batch_shape, mask_strides_mat, mask_strides_vec, batch_ndim);
+
+       mat_mask += batch_offsets.x;
+       vec_mask += batch_offsets.y;
+     }
+
+   } else {
+     in_vec += tid.z * vector_batch_stride[0];
+     mat += tid.z * matrix_batch_stride[0];
+
+     if (has_output_mask) {
+       out_mask += tid.z * mask_batch_strides[0];
+       mask_batch_strides += batch_ndim;
+     }
+
+     if (has_operand_mask) {
+       mat_mask += tid.z * mask_batch_strides[0];
+       vec_mask += tid.z * mask_batch_strides[batch_ndim];
+     }
+   }
+
+   out_vec += tid.z * out_vec_size;
+
+   gemv_kernel::run(
+       mat,
+       in_vec,
+       out_vec,
+       in_vec_size,
+       out_vec_size,
+       marix_ld,
+       out_mask,
+       mat_mask,
+       vec_mask,
+       mask_strides,
+       gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
+       tid,
+       lid,
+       simd_gid,
+       simd_lid);
+ }
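
The comment block inside GEMVKernel above describes how the masked GEMV tiles work: a threadgroup of BM x BN simdgroups (32 threads each, arranged SM x SN) covers blockM output rows, and each thread accumulates a TM x TN sub-tile before the simdgroup and threadgroup reductions. The standalone C++ sketch below (not part of the wheel) simply replays that index arithmetic on the host so the mapping from `simd_gid`/`simd_lid` to row/column offsets is easier to see; the parameters BM=2, BN=2, SM=8, SN=4, TM=4, TN=4 are hypothetical values chosen only so that `SM * SN == 32`, as the kernel's static_assert requires.

```cpp
// Host-side illustration of GEMVKernel's thread-to-tile mapping.
// Assumed, illustrative tile parameters; real dispatch values are chosen
// by the MLX Metal backend at runtime.
#include <cstdio>

int main() {
  constexpr int BM = 2, BN = 2, SM = 8, SN = 4, TM = 4, TN = 4;
  constexpr int blockM = BM * SM * TM; // output rows covered by one threadgroup
  constexpr int blockN = BN * SN * TN; // input columns consumed per iteration

  // Each of the BM * BN simdgroups has SM * SN == 32 threads.
  for (int simd_gid = 0; simd_gid < BM * BN; ++simd_gid) {
    for (int simd_lid = 0; simd_lid < SM * SN; ++simd_lid) {
      // Same expressions as in GEMVKernel::run.
      const int thrM = SN != 32 ? simd_lid / SN : 0;
      const int thrN = SN != 32 ? simd_lid % SN : simd_lid;
      const int simdM = BN != 1 ? SM * (simd_gid / BN) : SM * simd_gid;
      const int simdN = BN != 1 ? SN * (simd_gid % BN) : 0;
      const int bm = (simdM + thrM) * TM; // first of TM output rows
      const int bn = (simdN + thrN) * TN; // first of TN input columns
      if (simd_lid < 2) { // show a couple of threads per simdgroup
        std::printf("simd_gid=%d simd_lid=%d -> rows [%d, %d), cols [%d, %d)\n",
                    simd_gid, simd_lid, bm, bm + TM, bn, bn + TN);
      }
    }
  }
  std::printf("threadgroup tile: blockM=%d rows, blockN=%d cols\n",
              blockM, blockN);
}
```

GEMVTKernel uses the same arithmetic with the roles of rows and columns swapped (it strides the vector by blockM and writes TN output columns per thread), which is why the two structs share the mask-offset and reduction logic almost line for line.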