@lgrammel/ds4-provider 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +96 -0
  2. package/binding.gyp +75 -0
  3. package/dist/ds4-language-model.d.ts +71 -0
  4. package/dist/ds4-language-model.d.ts.map +1 -0
  5. package/dist/ds4-language-model.js +888 -0
  6. package/dist/ds4-language-model.js.map +1 -0
  7. package/dist/ds4-provider.d.ts +13 -0
  8. package/dist/ds4-provider.d.ts.map +1 -0
  9. package/dist/ds4-provider.js +20 -0
  10. package/dist/ds4-provider.js.map +1 -0
  11. package/dist/index.d.ts +4 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +4 -0
  14. package/dist/index.js.map +1 -0
  15. package/dist/native-binding.d.ts +42 -0
  16. package/dist/native-binding.d.ts.map +1 -0
  17. package/dist/native-binding.js +157 -0
  18. package/dist/native-binding.js.map +1 -0
  19. package/ds4/LICENSE +22 -0
  20. package/ds4/ds4.c +18268 -0
  21. package/ds4/ds4.h +196 -0
  22. package/ds4/ds4_gpu.h +804 -0
  23. package/ds4/ds4_metal.m +14657 -0
  24. package/ds4/metal/argsort.metal +266 -0
  25. package/ds4/metal/bin.metal +192 -0
  26. package/ds4/metal/concat.metal +62 -0
  27. package/ds4/metal/cpy.metal +57 -0
  28. package/ds4/metal/dense.metal +1121 -0
  29. package/ds4/metal/dsv4_hc.metal +861 -0
  30. package/ds4/metal/dsv4_kv.metal +227 -0
  31. package/ds4/metal/dsv4_misc.metal +1088 -0
  32. package/ds4/metal/dsv4_rope.metal +155 -0
  33. package/ds4/metal/flash_attn.metal +1426 -0
  34. package/ds4/metal/get_rows.metal +54 -0
  35. package/ds4/metal/glu.metal +36 -0
  36. package/ds4/metal/moe.metal +1737 -0
  37. package/ds4/metal/norm.metal +153 -0
  38. package/ds4/metal/repeat.metal +52 -0
  39. package/ds4/metal/set_rows.metal +55 -0
  40. package/ds4/metal/softmax.metal +241 -0
  41. package/ds4/metal/sum_rows.metal +102 -0
  42. package/ds4/metal/unary.metal +312 -0
  43. package/native/binding.cpp +621 -0
  44. package/package.json +66 -0
  45. package/scripts/postinstall.cjs +13 -0
  46. package/scripts/vendor-ds4.cjs +67 -0
package/ds4/metal/norm.metal
@@ -0,0 +1,153 @@
+ struct ds4_metal_args_norm {
+     int32_t ne00;
+     int32_t ne00_t;
+     uint64_t nb1;
+     uint64_t nb2;
+     uint64_t nb3;
+     float eps;
+     int32_t nef1[3];
+     int32_t nef2[3];
+     int32_t nef3[3];
+     uint64_t nbf1[3];
+     uint64_t nbf2[3];
+     uint64_t nbf3[3];
+ };
+
+ // RMSNorm over one activation row, optionally fusing the learned weight
+ // multiply. DS4 calls this before attention, before the FFN, and for plain
+ // diagnostics that need normalized but unweighted rows.
+ template <typename T, short F>
+ kernel void kernel_rms_norm_fuse_impl(
+         constant ds4_metal_args_norm & args,
+         device const char * src0,
+         device const char * src1_0,
+         device const char * src1_1,
+         device char * dst,
+         threadgroup float * shmem_f32 [[threadgroup(0)]],
+         uint3 tgpig[[threadgroup_position_in_grid]],
+         ushort3 tpitg[[thread_position_in_threadgroup]],
+         ushort sgitg[[simdgroup_index_in_threadgroup]],
+         ushort tiisg[[thread_index_in_simdgroup]],
+         ushort3 ntg[[threads_per_threadgroup]]) {
+     if (sgitg == 0) {
+         shmem_f32[tiisg] = 0.0f;
+     }
+
+     const int i01 = tgpig.x;
+     const int i02 = tgpig.y;
+     const int i03 = tgpig.z;
+
+     device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
+
+     device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
+     device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
+
+     float sumf = 0.0f;
+
+     // parallel sum
+     for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
+         sumf += dot(x[i00], x[i00]);
+     }
+     sumf = simd_sum(sumf);
+
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     if (tiisg == 0) {
+         shmem_f32[sgitg] = sumf;
+     }
+
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     sumf = shmem_f32[tiisg];
+     sumf = simd_sum(sumf);
+
+     const float mean  = sumf/args.ne00;
+     const float scale = 1.0f/sqrt(mean + args.eps);
+
+     device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
+     for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
+         if (F == 1) {
+             y[i00] = (x[i00]*scale);
+         }
+         if (F == 2) {
+             y[i00] = (x[i00]*scale)*f0[i00];
+         }
+         if (F == 3) {
+             y[i00] = (x[i00]*scale)*f0[i00] + f1[i00];
+         }
+     }
+ }
+
+ typedef decltype(kernel_rms_norm_fuse_impl<float4, 1>) kernel_rms_norm_fuse_t;
+
+ // Host-visible RMSNorm variants: plain norm and norm multiplied by weight.
+ template [[host_name("kernel_rms_norm_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 1>;
+ template [[host_name("kernel_rms_norm_mul_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 2>;
+
+ struct ds4_metal_args_qkv_rms_norm {
+     int32_t q_n;
+     int32_t q_n4;
+     int32_t kv_n;
+     int32_t kv_n4;
+     uint64_t q_row_stride;
+     uint64_t kv_row_stride;
+     float eps;
+ };
+
+ // Normalizes DS4's q-lora row and KV row in one dispatch. The two reductions
+ // deliberately mirror kernel_rms_norm_mul_f32_4: Q uses the full 256-thread
+ // row shape for 1024 floats, while KV only has work in the first 128 lanes for
+ // its 512 floats. This keeps the q/kv normalization math aligned with the
+ // standalone kernels while removing one tiny launch from the attention setup.
+ kernel void kernel_dsv4_qkv_rms_norm_f32_4(
+         constant ds4_metal_args_qkv_rms_norm & args,
+         device const float4 * q_src,
+         device const float4 * q_weight,
+         device       float4 * q_dst,
+         device const float4 * kv_src,
+         device const float4 * kv_weight,
+         device       float4 * kv_dst,
+         threadgroup float * shmem_f32 [[threadgroup(0)]],
+         uint3 tgpig[[threadgroup_position_in_grid]],
+         ushort3 tpitg[[thread_position_in_threadgroup]],
+         ushort sgitg[[simdgroup_index_in_threadgroup]],
+         ushort tiisg[[thread_index_in_simdgroup]],
+         ushort3 ntg[[threads_per_threadgroup]]) {
+     if (sgitg == 0) {
+         shmem_f32[tiisg] = 0.0f;
+     }
+
+     const uint row     = tgpig.x;
+     const bool kv_task = tgpig.y != 0;
+     const int  n  = kv_task ? args.kv_n  : args.q_n;
+     const int  n4 = kv_task ? args.kv_n4 : args.q_n4;
+     const uint64_t row_stride4 = (kv_task ? args.kv_row_stride : args.q_row_stride) / sizeof(float4);
+
+     device const float4 * x = kv_task ? kv_src + row * row_stride4 : q_src + row * row_stride4;
+     device const float4 * w = kv_task ? kv_weight : q_weight;
+     device       float4 * y = kv_task ? kv_dst + row * row_stride4 : q_dst + row * row_stride4;
+
+     float sumf = 0.0f;
+     for (int i = tpitg.x; i < n4; i += ntg.x) {
+         const float4 v = x[i];
+         sumf += dot(v, v);
+     }
+     sumf = simd_sum(sumf);
+
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     if (tiisg == 0) {
+         shmem_f32[sgitg] = sumf;
+     }
+
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     sumf = shmem_f32[tiisg];
+     sumf = simd_sum(sumf);
+
+     const float scale = rsqrt(sumf / float(n) + args.eps);
+
+     for (int i = tpitg.x; i < n4; i += ntg.x) {
+         y[i] = (x[i] * scale) * w[i];
+     }
+ }
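To make the fusion modes concrete, here is a minimal CPU reference of what the `F` template parameter selects; the fused `kernel_dsv4_qkv_rms_norm_f32_4` applies the same per-row math (its `F == 2` form) to the Q and KV rows. This is an illustrative sketch, not package code: `rms_norm_fuse_ref` is a hypothetical name, and it assumes contiguous float rows instead of the strided device buffers above.

```cpp
#include <cmath>
#include <cstddef>

// CPU reference for the three fusion modes selected by F.
// f1 is only read when F == 3.
void rms_norm_fuse_ref(const float *x, const float *f0, const float *f1,
                       float *y, size_t n, float eps, int F) {
    float sumsq = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        sumsq += x[i] * x[i];            // the kernel's "parallel sum", serialized
    }
    const float scale = 1.0f / std::sqrt(sumsq / n + eps);
    for (size_t i = 0; i < n; ++i) {
        if (F == 1) y[i] = x[i] * scale;                  // plain norm
        if (F == 2) y[i] = x[i] * scale * f0[i];          // norm * weight
        if (F == 3) y[i] = x[i] * scale * f0[i] + f1[i];  // norm * weight + bias
    }
}
```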
package/ds4/metal/repeat.metal
@@ -0,0 +1,52 @@
+ // DS4 Metal repeat kernel used for HC embedding expansion.
+
+ struct ds4_metal_args_repeat {
+     int32_t ne00;
+     int32_t ne01;
+     int32_t ne02;
+     int32_t ne03;
+     uint64_t nb00;
+     uint64_t nb01;
+     uint64_t nb02;
+     uint64_t nb03;
+     int32_t ne0;
+     int32_t ne1;
+     int32_t ne2;
+     int32_t ne3;
+     uint64_t nb0;
+     uint64_t nb1;
+     uint64_t nb2;
+     uint64_t nb3;
+ };
+
+ // Repeats a source row into the HC channel dimension. DS4 uses this when the
+ // token embedding has to become an HC activation block before layer 0.
+ template<typename T>
+ kernel void kernel_repeat(
+         constant ds4_metal_args_repeat & args,
+         device const char * src0,
+         device char * dst,
+         uint3 tgpig[[threadgroup_position_in_grid]],
+         ushort3 tpitg[[thread_position_in_threadgroup]],
+         ushort3 ntg[[threads_per_threadgroup]]) {
+     const int i3 = tgpig.z;
+     const int i2 = tgpig.y;
+     const int i1 = tgpig.x;
+
+     const int i03 = i3%args.ne03;
+     const int i02 = i2%args.ne02;
+     const int i01 = i1%args.ne01;
+
+     device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
+     device       char * dst_ptr  = dst  + i3*args.nb3   + i2*args.nb2   + i1*args.nb1;
+
+     for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+         const int i00 = i0%args.ne00;
+         *((device T *)(dst_ptr + i0*args.nb0)) = *((device T *)(src0_ptr + i00*args.nb00));
+     }
+ }
+
+ typedef decltype(kernel_repeat<float>) kernel_repeat_t;
+
+ // Host-visible F32 repeat used for HC expansion of embeddings.
+ template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
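The kernel's broadcast rule is plain modulo indexing on every axis: each destination coordinate wraps into the (smaller) source extent. A minimal 2-D CPU sketch of the same rule, under assumed dense row-major float buffers (`repeat_ref` is a hypothetical helper, not package code):

```cpp
#include <cstdint>

// CPU reference for the modulo-broadcast indexing of kernel_repeat,
// reduced to two dimensions for clarity.
void repeat_ref(const float *src, float *dst,
                int64_t ne00, int64_t ne01,   // source extents (dims 0, 1)
                int64_t ne0,  int64_t ne1) {  // destination extents
    for (int64_t i1 = 0; i1 < ne1; ++i1) {
        for (int64_t i0 = 0; i0 < ne0; ++i0) {
            // mirrors `i01 = i1 % args.ne01` and `i00 = i0 % args.ne00` above
            dst[i1*ne0 + i0] = src[(i1 % ne01)*ne00 + (i0 % ne00)];
        }
    }
}
```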
package/ds4/metal/set_rows.metal
@@ -0,0 +1,55 @@
+ // DS4 Metal set-rows kernel used for KV writes.
+
+ struct ds4_metal_args_set_rows {
+     int32_t nk0;
+     int32_t ne01;
+     uint64_t nb01;
+     uint64_t nb02;
+     uint64_t nb03;
+     int32_t ne11;
+     int32_t ne12;
+     uint64_t nb10;
+     uint64_t nb11;
+     uint64_t nb12;
+     uint64_t nb1;
+     uint64_t nb2;
+     uint64_t nb3;
+ };
+
+ // Scatters rows into the KV cache by token position. DS4 uses this after Q/K/V
+ // preparation so decode and later prefill chunks can attend to previous tokens.
+ template<typename T, typename TI>
+ kernel void kernel_set_rows_f(
+         constant ds4_metal_args_set_rows & args,
+         device const char * src0,
+         device const char * src1,
+         device float * dst,
+         uint3 tgpig[[threadgroup_position_in_grid]],
+         uint tiitg[[thread_index_in_threadgroup]],
+         uint3 tptg[[threads_per_threadgroup]]) {
+     const int32_t i03 = tgpig.z;
+     const int32_t i02 = tgpig.y;
+
+     const int32_t i12 = i03%args.ne12;
+     const int32_t i11 = i02%args.ne11;
+
+     const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x;
+     if (i01 >= args.ne01) {
+         return;
+     }
+
+     const int32_t i10 = i01;
+     const TI i1 = ((const device TI *) (src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0];
+
+     device       T     * dst_row = (device T *) ((device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3);
+     const device float * src_row = (const device float *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
+
+     for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) {
+         dst_row[ind] = (T) src_row[ind];
+     }
+ }
+
+ typedef decltype(kernel_set_rows_f<float, int64_t>) set_rows_f_t;
+
+ // Host-visible F32/I32 scatter variant used by KV-cache writes.
+ template [[host_name("kernel_set_rows_f32_i32")]] kernel set_rows_f_t kernel_set_rows_f<float, int32_t>;
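In effect `src1` supplies one destination row id per source row, and each source row is copied (with a float-to-T cast) into that slot of the cache. A minimal CPU sketch of the same scatter, under assumed dense 2-D layouts and with `T = float`, `int32_t` indices as in the instantiated variant (`set_rows_ref` is an illustrative name, not package code):

```cpp
#include <cstdint>

// CPU reference for the row scatter in kernel_set_rows_f: row i01 of src
// lands at row row_idx[i01] of dst; both rows are nk0 floats long.
void set_rows_ref(const float *src, const int32_t *row_idx,
                  float *dst, int64_t n_rows, int64_t nk0) {
    for (int64_t i01 = 0; i01 < n_rows; ++i01) {
        const int32_t i1 = row_idx[i01];   // token position in the cache
        for (int64_t k = 0; k < nk0; ++k) {
            dst[i1*nk0 + k] = src[i01*nk0 + k];
        }
    }
}
```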
package/ds4/metal/softmax.metal
@@ -0,0 +1,241 @@
+ // DS4 Metal softmax kernel used by the compressor pooling compatibility path.
+ // The single-compressed-row path is intentionally left as soft_max -> mul ->
+ // sum_rows instead of using the fused dsv4_softmax_pool kernel.
+
+ struct ds4_metal_args_soft_max {
+     int32_t ne00;
+     int32_t ne01;
+     int32_t ne02;
+     uint64_t nb01;
+     uint64_t nb02;
+     uint64_t nb03;
+     int32_t ne11;
+     int32_t ne12;
+     int32_t ne13;
+     uint64_t nb11;
+     uint64_t nb12;
+     uint64_t nb13;
+     uint64_t nb1;
+     uint64_t nb2;
+     uint64_t nb3;
+     float scale;
+     float max_bias;
+     float m0;
+     float m1;
+     int32_t n_head_log2;
+ };
+
+ // Row softmax for score matrices. DS4 uses it in the literal one-compressor-row
+ // path where preserving the original graph operation boundary avoids drift.
+ template<typename T>
+ kernel void kernel_soft_max(
+         constant ds4_metal_args_soft_max & args,
+         device const char * src0,
+         device const char * src1,
+         device const char * src2,
+         device char * dst,
+         threadgroup float * buf [[threadgroup(0)]],
+         uint3 tgpig[[threadgroup_position_in_grid]],
+         uint3 tpitg[[thread_position_in_threadgroup]],
+         uint sgitg[[simdgroup_index_in_threadgroup]],
+         uint tiisg[[thread_index_in_simdgroup]],
+         uint3 tptg[[threads_per_threadgroup]]) {
+     const int32_t i03 = tgpig.z;
+     const int32_t i02 = tgpig.y;
+     const int32_t i01 = tgpig.x;
+
+     const int32_t i13 = i03%args.ne13;
+     const int32_t i12 = i02%args.ne12;
+     const int32_t i11 = i01;
+
+     device const float * psrc0 = (device const float *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
+     device const T     * pmask = src1 != src0 ? (device const T *) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr;
+     device const float * psrc2 = src2 != src0 ? (device const float *) (src2) : nullptr;
+     device       float * pdst  = (device float *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3);
+
+     float slope = 1.0f;
+
+     if (args.max_bias > 0.0f) {
+         const int32_t h = i02;
+
+         const float base = h < args.n_head_log2 ? args.m0 : args.m1;
+         const int   exp  = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
+
+         slope = pow(base, exp);
+     }
+
+     float lmax = psrc2 ? psrc2[i02] : -INFINITY;
+
+     for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) {
+         lmax = MAX(lmax, psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f));
+     }
+
+     float max_val = simd_max(lmax);
+     if (tptg.x > N_SIMDWIDTH) {
+         if (sgitg == 0) {
+             buf[tiisg] = -INFINITY;
+         }
+
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+
+         if (tiisg == 0) {
+             buf[sgitg] = max_val;
+         }
+
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+
+         max_val = buf[tiisg];
+         max_val = simd_max(max_val);
+     }
+
+     float lsum = 0.0f;
+     for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) {
+         const float exp_psrc0 = exp((psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val);
+         lsum += exp_psrc0;
+         pdst[i00] = exp_psrc0;
+     }
+
+     threadgroup_barrier(mem_flags::mem_none);
+
+     float sum = simd_sum(lsum);
+
+     if (tptg.x > N_SIMDWIDTH) {
+         if (sgitg == 0) {
+             buf[tiisg] = 0.0f;
+         }
+
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+
+         if (tiisg == 0) {
+             buf[sgitg] = sum;
+         }
+
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+
+         sum = buf[tiisg];
+         sum = simd_sum(sum);
+     }
+
+     if (psrc2) {
+         sum += exp(psrc2[i02] - max_val);
+     }
+
+     const float inv_sum = 1.0f/sum;
+
+     for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) {
+         pdst[i00] *= inv_sum;
+     }
+ }
+
+ // Vectorized float4 row softmax for contiguous score rows whose length is a
+ // multiple of four; used by the same DS4 compressor/indexer graph path.
+ template<typename T>
+ kernel void kernel_soft_max_4(
+         constant ds4_metal_args_soft_max & args,
+         device const char * src0,
+         device const char * src1,
+         device const char * src2,
+         device char * dst,
+         threadgroup float * buf [[threadgroup(0)]],
+         uint3 tgpig[[threadgroup_position_in_grid]],
+         uint3 tpitg[[thread_position_in_threadgroup]],
+         uint sgitg[[simdgroup_index_in_threadgroup]],
+         uint tiisg[[thread_index_in_simdgroup]],
+         uint3 tptg[[threads_per_threadgroup]]) {
+     const int32_t i03 = tgpig.z;
+     const int32_t i02 = tgpig.y;
+     const int32_t i01 = tgpig.x;
+
+     const int32_t i13 = i03%args.ne13;
+     const int32_t i12 = i02%args.ne12;
+     const int32_t i11 = i01;
+
+     device const float4 * psrc4 = (device const float4 *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
+     device const T      * pmask = src1 != src0 ? (device const T *) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr;
+     device const float  * psrc2 = src2 != src0 ? (device const float *) (src2) : nullptr;
+     device       float4 * pdst4 = (device float4 *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3);
+
+     float slope = 1.0f;
+
+     if (args.max_bias > 0.0f) {
+         const int32_t h = i02;
+
+         const float base = h < args.n_head_log2 ? args.m0 : args.m1;
+         const int   exp  = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
+
+         slope = pow(base, exp);
+     }
+
+     float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY;
+
+     for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) {
+         lmax4 = fmax(lmax4, psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f)));
+     }
+
+     const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
+
+     float max_val = simd_max(lmax);
+     if (tptg.x > N_SIMDWIDTH) {
+         if (sgitg == 0) {
+             buf[tiisg] = -INFINITY;
+         }
+
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+
+         if (tiisg == 0) {
+             buf[sgitg] = max_val;
+         }
+
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+
+         max_val = buf[tiisg];
+         max_val = simd_max(max_val);
+     }
+
+     float4 lsum4 = 0.0f;
+     for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) {
+         const float4 exp_psrc4 = exp((psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val);
+         lsum4 += exp_psrc4;
+         pdst4[i00] = exp_psrc4;
+     }
+
+     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
+
+     threadgroup_barrier(mem_flags::mem_none);
+
+     float sum = simd_sum(lsum);
+
+     if (tptg.x > N_SIMDWIDTH) {
+         if (sgitg == 0) {
+             buf[tiisg] = 0.0f;
+         }
+
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+
+         if (tiisg == 0) {
+             buf[sgitg] = sum;
+         }
+
+         threadgroup_barrier(mem_flags::mem_threadgroup);
+
+         sum = buf[tiisg];
+         sum = simd_sum(sum);
+     }
+
+     if (psrc2) {
+         sum += exp(psrc2[i02] - max_val);
+     }
+
+     const float inv_sum = 1.0f/sum;
+
+     for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) {
+         pdst4[i00] *= inv_sum;
+     }
+ }
+
+ typedef decltype(kernel_soft_max<float>) kernel_soft_max_t;
+ typedef decltype(kernel_soft_max_4<float4>) kernel_soft_max_4_t;
+
+ // Host-visible F32 softmax variants used by compressor pooling.
+ template [[host_name("kernel_soft_max_f32")]] kernel kernel_soft_max_t kernel_soft_max<float>;
+ template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<float4>;
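Both variants implement the standard numerically stable softmax: scale the logits, add the (optionally ALiBi-sloped) mask, subtract the row max, exponentiate, normalize. The `psrc2` operand, when present, appears to contribute an extra per-head logit that joins both the max and the denominator without being written out. A minimal scalar CPU sketch of the same row computation (hypothetical helper, not package code; `sink` stands in for `psrc2` under that reading):

```cpp
#include <cmath>
#include <cstddef>

// CPU reference for one softmax row: y = softmax(x*scale + slope*mask),
// with an optional extra logit ("sink") folded into max and denominator.
void soft_max_ref(const float *x, const float *mask, const float *sink,
                  float *y, size_t n, float scale, float slope) {
    float max_val = sink ? *sink : -INFINITY;
    for (size_t i = 0; i < n; ++i) {
        const float v = x[i]*scale + (mask ? slope*mask[i] : 0.0f);
        max_val = std::fmax(max_val, v);
    }
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        const float e = std::exp(x[i]*scale + (mask ? slope*mask[i] : 0.0f) - max_val);
        y[i] = e;
        sum += e;
    }
    if (sink) {
        sum += std::exp(*sink - max_val);  // mirrors `sum += exp(psrc2[i02] - max_val)`
    }
    for (size_t i = 0; i < n; ++i) {
        y[i] /= sum;
    }
}
```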
package/ds4/metal/sum_rows.metal
@@ -0,0 +1,102 @@
+ // DS4 Metal row-sum kernel.
+
+ #define FC_SUM_ROWS 1400
+
+ #define OP_SUM_ROWS_NUM_SUM_ROWS 10
+ #define OP_SUM_ROWS_NUM_MEAN     11
+
+ struct ds4_metal_args_sum_rows {
+     int64_t ne00;
+     int64_t ne01;
+     int64_t ne02;
+     int64_t ne03;
+     uint64_t nb00;
+     uint64_t nb01;
+     uint64_t nb02;
+     uint64_t nb03;
+     int64_t ne0;
+     int64_t ne1;
+     int64_t ne2;
+     int64_t ne3;
+     uint64_t nb0;
+     uint64_t nb1;
+     uint64_t nb2;
+     uint64_t nb3;
+ };
+
+ static inline float sum(float x) {
+     return x;
+ }
+
+ static inline float sum(float4 x) {
+     return x[0] + x[1] + x[2] + x[3];
+ }
+
+ constant short FC_sum_rows_op [[function_constant(FC_SUM_ROWS + 0)]];
+
+ // Reduces each row to a sum or mean. DS4 mainly uses the sum form to preserve
+ // the compressor-pooling graph boundary in the single-compressor-row case.
+ template <typename T0, typename T>
+ kernel void kernel_sum_rows_impl(
+         constant ds4_metal_args_sum_rows & args,
+         device const char * src0,
+         device char * dst,
+         threadgroup char * shmem [[threadgroup(0)]],
+         uint3 tgpig[[threadgroup_position_in_grid]],
+         ushort3 tpitg[[thread_position_in_threadgroup]],
+         ushort sgitg[[simdgroup_index_in_threadgroup]],
+         ushort tiisg[[thread_index_in_simdgroup]],
+         ushort3 ntg[[threads_per_threadgroup]]) {
+ #define FC_OP FC_sum_rows_op
+
+     const int i3 = tgpig.z;
+     const int i2 = tgpig.y;
+     const int i1 = tgpig.x;
+
+     threadgroup T0 * shmem_t = (threadgroup T0 *) shmem;
+
+     if (sgitg == 0) {
+         shmem_t[tiisg] = 0.0f;
+     }
+
+     device const T0 * src_row = (device const T0 *) (src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
+     device       T  * dst_row = (device T *) (dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3);
+
+     T0 sumf = T0(0.0f);
+
+     for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
+         sumf += src_row[i0];
+     }
+
+     sumf = simd_sum(sumf);
+
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     if (tiisg == 0) {
+         shmem_t[sgitg] = sumf;
+     }
+
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     sumf = shmem_t[tiisg];
+     sumf = simd_sum(sumf);
+
+     if (tpitg.x == 0) {
+         if (FC_OP == OP_SUM_ROWS_NUM_MEAN) {
+             if (is_same<float4, T0>::value) {
+                 dst_row[0] = sum(sumf) / (4*args.ne00);
+             } else {
+                 dst_row[0] = sum(sumf) / args.ne00;
+             }
+         } else {
+             dst_row[0] = sum(sumf);
+         }
+     }
+
+ #undef FC_OP
+ }
+
+ typedef decltype(kernel_sum_rows_impl<float, float>) kernel_sum_rows_t;
+
+ // Host-visible F32 row reduction used by compressor pooling.
+ template [[host_name("kernel_sum_rows_f32_f32")]] kernel kernel_sum_rows_t kernel_sum_rows_impl<float, float>;
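The `FC_sum_rows_op` function constant specializes the same kernel body into a row sum or a row mean at pipeline-creation time (note the mean divides by `4*ne00` for the float4 specialization, since `ne00` then counts float4 elements). A minimal CPU sketch of the two ops over a dense float row (`sum_rows_ref` and `RowOp` are illustrative names, not package code):

```cpp
#include <cstddef>

// Stands in for the OP_SUM_ROWS_NUM_SUM_ROWS / OP_SUM_ROWS_NUM_MEAN ids.
enum class RowOp { Sum, Mean };

// CPU reference for kernel_sum_rows_impl on one row of n floats.
float sum_rows_ref(const float *row, size_t n, RowOp op) {
    float s = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        s += row[i];
    }
    return op == RowOp::Mean ? s / n : s;
}
```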