npm - @novastera-oss/llamarn - Versions diffs - 0.4.1 → 0.4.3-beta4 - Mend

@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (976)

package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh CHANGED

@@ -3,7 +3,7 @@
 #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
 void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
+    const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
 void ggml_cuda_op_mul_mat_vec_q(
     ggml_backend_cuda_context & ctx,

package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu CHANGED

@@ -105,29 +105,29 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
 }
 template <int block_size, bool do_multiply = false, bool do_add = false>
-static __global__ void rms_norm_f32(const float * x, float *       dst,
+static __global__ void rms_norm_f32(const float * x,
+                                    float *       dst,
                                     const int     ncols,
                                     const int64_t stride_row,
                                     const int64_t stride_channel,
                                     const int64_t stride_sample,
                                     const float   eps,
-                                    const float * mul                = nullptr,
-                                    const int64_t mul_stride_row     = 0,
-                                    const int64_t mul_stride_channel = 0,
-                                    const int64_t mul_stride_sample  = 0,
-                                    const int     mul_ncols          = 0,
-                                    const int     mul_nrows          = 0,
-                                    const int     mul_nchannels      = 0,
-                                    const int     mul_nsamples       = 0,
-                                    const float * add                = nullptr,
-                                    const int64_t add_stride_row     = 0,
-                                    const int64_t add_stride_channel = 0,
-                                    const int64_t add_stride_sample  = 0,
-                                    const int     add_ncols          = 0,
-                                    const int     add_nrows          = 0,
-                                    const int     add_nchannels      = 0,
-                                    const int     add_nsamples       = 0) {
+                                    const float * mul                  = nullptr,
+                                    const int64_t mul_stride_row       = 0,
+                                    const int64_t mul_stride_channel   = 0,
+                                    const int64_t mul_stride_sample    = 0,
+                                    const uint3   mul_ncols_packed     = make_uint3(0, 0, 0),
+                                    const uint3   mul_nrows_packed     = make_uint3(0, 0, 0),
+                                    const uint3   mul_nchannels_packed = make_uint3(0, 0, 0),
+                                    const uint3   mul_nsamples_packed  = make_uint3(0, 0, 0),
+                                    const float * add                  = nullptr,
+                                    const int64_t add_stride_row       = 0,
+                                    const int64_t add_stride_channel   = 0,
+                                    const int64_t add_stride_sample    = 0,
+                                    const uint3   add_ncols_packed     = make_uint3(0, 0, 0),
+                                    const uint3   add_nrows_packed     = make_uint3(0, 0, 0),
+                                    const uint3   add_nchannels_packed = make_uint3(0, 0, 0),
+                                    const uint3   add_nsamples_packed  = make_uint3(0, 0, 0)) {
     const int nrows     = gridDim.x;
     const int nchannels = gridDim.y;
@@ -142,16 +142,16 @@ static __global__ void rms_norm_f32(const float * x, float *       dst,
     dst += ((sample*nchannels + channel)*nrows + row)*ncols;
     if constexpr (do_multiply) {
-        const int mul_row = row % mul_nrows;
-        const int mul_channel = channel % mul_nchannels;
-        const int mul_sample = sample % mul_nsamples;
-        mul += mul_sample*mul_stride_sample + mul_channel*mul_stride_channel + mul_row*mul_stride_row;
+        const uint32_t mul_row     = fastmodulo(row, mul_nrows_packed);
+        const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
+        const uint32_t mul_sample  = fastmodulo(sample, mul_nsamples_packed);
+        mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
     }
     if constexpr (do_add) {
-        const int add_row     = row % add_nrows;
-        const int add_channel = channel % add_nchannels;
-        const int add_sample  = sample % add_nsamples;
+        const int add_row     = fastmodulo(row, add_nrows_packed);
+        const int add_channel = fastmodulo(channel, add_nchannels_packed);
+        const int add_sample  = fastmodulo(sample, add_nsamples_packed);
         add += add_sample * add_stride_sample + add_channel * add_stride_channel + add_row * add_stride_row;
     }
@@ -165,15 +165,18 @@ static __global__ void rms_norm_f32(const float * x, float *       dst,
     // sum up partial sums
     tmp = warp_reduce_sum(tmp);
     if constexpr (block_size > WARP_SIZE) {
-        static_assert(block_size == 1024, "unexpected block_size");
+        static_assert((block_size <= 1024) && (block_size % 32 == 0), "unexpected block_size");
         __shared__ float s_sum[32];
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
+        const int        warp_id = tid / WARP_SIZE;
+        const int        lane_id = tid % WARP_SIZE;
         if (lane_id == 0) {
             s_sum[warp_id] = tmp;
         }
         __syncthreads();
-        tmp = s_sum[lane_id];
+        tmp = 0.0f;
+        if (lane_id < (block_size / WARP_SIZE)) {
+            tmp = s_sum[lane_id];
+        }
         tmp = warp_reduce_sum(tmp);
     }
@@ -182,12 +185,12 @@ static __global__ void rms_norm_f32(const float * x, float *       dst,
     for (int col = tid; col < ncols; col += block_size) {
         if constexpr (do_multiply && do_add) {
-            const int mul_col = col % mul_ncols;
-            const int add_col = col % add_ncols;
-            dst[col] = scale * x[col] * mul[mul_col] + add[add_col];
+            const int mul_col = fastmodulo(col, mul_ncols_packed);
+            const int add_col = fastmodulo(col, add_ncols_packed);
+            dst[col]          = scale * x[col] * mul[mul_col] + add[add_col];
         } else if constexpr (do_multiply) {
-            const int mul_col = col % mul_ncols;
-            dst[col] = scale * x[col] * mul[mul_col];
+            const int mul_col = fastmodulo(col, mul_ncols_packed);
+            dst[col]          = scale * x[col] * mul[mul_col];
         } else {
             dst[col] = scale * x[col];
         }
@@ -354,77 +357,86 @@ static void rms_norm_f32_cuda(
         const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) {
     const dim3 blocks_num(nrows, nchannels, nsamples);
     if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        rms_norm_f32<WARP_SIZE, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        const dim3 block_dims(256, 1, 1);
+        rms_norm_f32<256, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
         rms_norm_f32<1024, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     }
 }
-static void rms_norm_mul_f32_cuda(const float * x,
-                                  const float * mul,
-                                  const float * add,
-                                  float *       dst,
-                                  const int     ncols,
-                                  const int     nrows,
-                                  const int     nchannels,
-                                  const int     nsamples,
-                                  const int64_t stride_row,
-                                  const int64_t stride_channel,
-                                  const int64_t stride_sample,
-                                  const int64_t mul_stride_row,
-                                  const int64_t mul_stride_channel,
-                                  const int64_t mul_stride_sample,
-                                  const int     mul_ncols,
-                                  const int     mul_nrows,
-                                  const int     mul_nchannels,
-                                  const int     mul_nsamples,
-                                  const int64_t add_stride_row,
-                                  const int64_t add_stride_channel,
-                                  const int64_t add_stride_sample,
-                                  const int     add_ncols,
-                                  const int     add_nrows,
-                                  const int     add_nchannels,
-                                  const int     add_nsamples,
-                                  const float   eps,
-                                  cudaStream_t  stream) {
+static void rms_norm_mul_f32_cuda(const float *  x,
+                                  const float *  mul,
+                                  const float *  add,
+                                  float *        dst,
+                                  const int      ncols,
+                                  const int      nrows,
+                                  const int      nchannels,
+                                  const int      nsamples,
+                                  const int64_t  stride_row,
+                                  const int64_t  stride_channel,
+                                  const int64_t  stride_sample,
+                                  const int64_t  mul_stride_row,
+                                  const int64_t  mul_stride_channel,
+                                  const int64_t  mul_stride_sample,
+                                  const uint32_t mul_ncols,
+                                  const uint32_t mul_nrows,
+                                  const uint32_t mul_nchannels,
+                                  const uint32_t mul_nsamples,
+                                  const int64_t  add_stride_row,
+                                  const int64_t  add_stride_channel,
+                                  const int64_t  add_stride_sample,
+                                  const uint32_t add_ncols,
+                                  const uint32_t add_nrows,
+                                  const uint32_t add_nchannels,
+                                  const uint32_t add_nsamples,
+                                  const float    eps,
+                                  cudaStream_t   stream) {
     const dim3 blocks_num(nrows, nchannels, nsamples);
     if (mul == nullptr) {
         rms_norm_f32_cuda(x, dst, ncols, nrows, nchannels, nsamples, stride_row, stride_channel, stride_sample, eps, stream);
         return;
     }
     if (add == nullptr) {
+        const uint3 mul_ncols_packed     = init_fastdiv_values(mul_ncols);
+        const uint3 mul_nrows_packed     = init_fastdiv_values(mul_nrows);
+        const uint3 mul_nchannels_packed = init_fastdiv_values(mul_nchannels);
+        const uint3 mul_nsamples_packed  = init_fastdiv_values(mul_nsamples);
         if (ncols < 1024) {
-            const dim3 block_dims(WARP_SIZE, 1, 1);
-            rms_norm_f32<WARP_SIZE, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
-                ncols, stride_row, stride_channel, stride_sample, eps,
-                mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
-                mul_ncols, mul_nrows, mul_nchannels, mul_nsamples);
+            const dim3 block_dims(256, 1, 1);
+            rms_norm_f32<256, true><<<blocks_num, block_dims, 0, stream>>>(
+                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
+                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
         } else {
             const dim3 block_dims(1024, 1, 1);
-            rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
-                ncols, stride_row, stride_channel, stride_sample, eps,
-                mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
-                mul_ncols, mul_nrows, mul_nchannels, mul_nsamples);
+            rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(
+                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
+                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
         }
     } else {
+        const uint3 mul_ncols_packed     = init_fastdiv_values(mul_ncols);
+        const uint3 mul_nrows_packed     = init_fastdiv_values(mul_nrows);
+        const uint3 mul_nchannels_packed = init_fastdiv_values(mul_nchannels);
+        const uint3 mul_nsamples_packed  = init_fastdiv_values(mul_nsamples);
+        const uint3 add_ncols_packed     = init_fastdiv_values(add_ncols);
+        const uint3 add_nrows_packed     = init_fastdiv_values(add_nrows);
+        const uint3 add_nchannels_packed = init_fastdiv_values(add_nchannels);
+        const uint3 add_nsamples_packed  = init_fastdiv_values(add_nsamples);
         if (ncols < 1024) {
-            const dim3 block_dims(WARP_SIZE, 1, 1);
-            rms_norm_f32<WARP_SIZE, true, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
-                ncols, stride_row, stride_channel, stride_sample, eps,
-                mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
-                mul_ncols, mul_nrows, mul_nchannels, mul_nsamples,
-                add, add_stride_row, add_stride_channel, add_stride_sample,
-                add_ncols, add_nrows, add_nchannels, add_nsamples);
+            const dim3 block_dims(256, 1, 1);
+            rms_norm_f32<256, true, true><<<blocks_num, block_dims, 0, stream>>>(
+                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
+                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
+                add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
+                add_nchannels_packed, add_nsamples_packed);
         } else {
             const dim3 block_dims(1024, 1, 1);
-            rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
-                ncols, stride_row, stride_channel, stride_sample, eps,
-                mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
-                mul_ncols, mul_nrows, mul_nchannels, mul_nsamples,
-                add, add_stride_row, add_stride_channel, add_stride_sample,
-                add_ncols, add_nrows, add_nchannels, add_nsamples);
+            rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(
+                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
+                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
+                add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
+                add_nchannels_packed, add_nsamples_packed);
         }
     }
 }

package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu CHANGED

@@ -1,36 +1,50 @@
 #include "pad.cuh"
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
-    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
-    // blockIdx.y: idx of ne1
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
+static __global__ void pad_f32(const float * src, float * dst,
+                               const int lp0, const int rp0, const int lp1, const int rp1,
+                               const int lp2, const int rp2, const int lp3, const int rp3,
+                               const int ne0, const int ne1, const int ne2, const int ne3) {
+    // blockIdx.z: i3*ne2+i2
+    // blockIdx.y: i1
+    // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE
+    // gridDim.y:  ne1
+    int i0 = threadIdx.x + blockIdx.x * blockDim.x;
+    int i1 = blockIdx.y;
+    int i2 = blockIdx.z % ne2;
+    int i3 = blockIdx.z / ne2;
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
     // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < (unsigned)ne01 && blockIdx.z < (unsigned)(ne02*ne03)) {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne00 +
-            blockIdx.z * ne00 * ne01;
-        dst[offset_dst] = x[offset_src];
+    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+    if ((i0 >= lp0 && i0 < ne0 - rp0) &&
+        (i1 >= lp1 && i1 < ne1 - rp1) &&
+        (i2 >= lp2 && i2 < ne2 - rp2) &&
+        (i3 >= lp3 && i3 < ne3 - rp3)) {
+        const int64_t i00 = i0 - lp0;
+        const int64_t i01 = i1 - lp1;
+        const int64_t i02 = i2 - lp2;
+        const int64_t i03 = i3 - lp3;
+        const int64_t ne02 = ne2 - lp2 - rp2;
+        const int64_t ne01 = ne1 - lp1 - rp1;
+        const int64_t ne00 = ne0 - lp0 - rp0;
+        const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00;
+        dst[dst_idx] = src[src_idx];
     } else {
-        dst[offset_dst] = 0.0f;
+        dst[dst_idx] = 0.0f;
     }
 }
-static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02, const int ne03,
+static void pad_f32_cuda(const float * src, float * dst,
+    const int lp0, const int rp0, const int lp1, const int rp1,
+    const int lp2, const int rp2, const int lp3, const int rp3,
     const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
     int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
     dim3 gridDim(num_blocks, ne1, ne2*ne3);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3);
 }
 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -41,9 +55,18 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    const int32_t lp0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t rp0 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t lp1 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t rp1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t lp2 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t rp2 = ((const int32_t*)(dst->op_params))[5];
+    const int32_t lp3 = ((const int32_t*)(dst->op_params))[6];
+    const int32_t rp3 = ((const int32_t*)(dst->op_params))[7];
     pad_f32_cuda(src0_d, dst_d,
-        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+                 lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
+                 dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
 }

package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu CHANGED

@@ -1,82 +1,91 @@
 #include "pad_reflect_1d.cuh"
-static __global__ void pad_reflect_1d_kernel_f32(
-    const void * __restrict__ src0,
-    void * __restrict__ dst,
-    const int64_t ne0,
-    const int64_t ne00,
-    const int64_t ne01,
-    const int64_t ne02,
-    const int64_t ne03,
-    const int64_t nb00,
-    const int64_t nb01,
-    const int64_t nb02,
-    const int64_t nb03,
-    const int64_t nb0,
-    const int64_t nb1,
-    const int64_t nb2,
-    const int64_t nb3,
-    const int p0,
-    const int p1) {
+static __global__ __launch_bounds__(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1) void
+    pad_reflect_1d_kernel_f32(
+        const void * __restrict__ src0,
+        void * __restrict__       dst,
+        const int64_t             ne0,
+        const int64_t             ne00,
+        const uint3               ne01,
+        const int64_t             ne02,
+        const int64_t             ne03,
+        const int64_t             nb00,
+        const int64_t             nb01,
+        const int64_t             nb02,
+        const int64_t             nb03,
+        const int64_t             nb0,
+        const int64_t             nb1,
+        const int64_t             nb2,
+        const int64_t             nb3,
+        const int                 p0,
+        const int                 p1) {
     const int64_t i3 = blockIdx.z;
     const int64_t i2 = blockIdx.y;
-    const int64_t i1 = blockIdx.x;
-    if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
+    const uint2   div_mod_packed = fast_div_modulo(blockIdx.x, ne01);
+    const int64_t tile1          = div_mod_packed.y;  // i1
+    const int64_t tile0          = div_mod_packed.x;  // nth i0 tile
+    const int64_t i1             = tile1;
+    const int64_t i0             = threadIdx.x + tile0 * blockDim.x;
+    // ne01.z is original value of unpacked ne01 (see init_fastdiv_values in common.cuh)
+    if (i0 >= ne0 || i1 >= ne01.z || i2 >= ne02 || i3 >= ne03) {
         return;
     }
-    const char * src0_ptr = (const char *)src0 + i3*nb03 + i2*nb02 + i1*nb01;
-    char * dst_ptr = (char *)dst + i3*nb3 + i2*nb2 + i1*nb1;
+    const char * src0_ptr = (const char *) src0 + i3 * nb03 + i2 * nb02 + i1 * nb01;
+    char *       dst_ptr  = (char *) dst + i3 * nb3 + i2 * nb2 + i1 * nb1;
-    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
-        float value;
+    const int64_t rel_i0 = i0 - p0;  // relative i0 in src0
+    int64_t src_idx;
-        if (i0 < p0) {
-            // Left padding - reflect
-            value = *(const float *)(src0_ptr + (p0 - i0) * nb00);
-        } else if (i0 < ne0 - p1) {
-            // Middle - copy
-            value = *(const float *)(src0_ptr + (i0 - p0) * nb00);
-        } else {
-            // Right padding - reflect
-            int64_t src_idx = (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1;
-            value = *(const float *)(src0_ptr + src_idx * nb00);
-        }
-        *(float *)(dst_ptr + i0 * nb0) = value;
+    if (rel_i0 < 0) {
+        // Left padding - reflect
+        src_idx = -rel_i0;
+    } else if (rel_i0 < ne00) {
+        // Middle - copy
+        src_idx = rel_i0;
+    } else {
+        // Right padding - reflect
+        src_idx = 2 * ne00 - 2 - rel_i0;
     }
+    const float value               = *(const float *) (src0_ptr + src_idx * nb00);
+    *(float *) (dst_ptr + i0 * nb0) = value;
+    GGML_UNUSED(p1);
 }
 void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    cudaStream_t stream = ctx.stream();
+    const ggml_tensor * src0   = dst->src[0];
+    cudaStream_t        stream = ctx.stream();
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
     const int32_t * opts = (const int32_t *) dst->op_params;
-    const int p0 = opts[0];
-    const int p1 = opts[1];
+    const int       p0   = opts[0];
+    const int       p1   = opts[1];
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
+    const int64_t ne00        = src0->ne[0];
+    const int64_t ne01        = src0->ne[1];
+    const uint3   ne01_packed = init_fastdiv_values(ne01);
+    const int64_t ne02        = src0->ne[2];
+    const int64_t ne03        = src0->ne[3];
     const int64_t ne0 = dst->ne[0];
+    // sanity: padded length matches
     GGML_ASSERT(ne0 == ne00 + p0 + p1);
-    const dim3 block_dims(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1, 1);
-    const dim3 grid_dims(ne01, ne02, ne03);
+    constexpr int64_t bx     = CUDA_PAD_REFLECT_1D_BLOCK_SIZE;  // threads per block (x)
+    const int64_t     tiles0 = (ne0 + bx - 1) / bx;             // number of tiles along i0
+    // grid.x covers i1 and all tiles of i0: [ne01 * tiles0]
+    // grid.y covers i2: [ne02]
+    // grid.z covers i3: [ne03]
+    const dim3        grid_dims((unsigned) (ne01 * tiles0), (unsigned) ne02, (unsigned) ne03);
+    const dim3        block_dims((unsigned) bx, 1, 1);
     pad_reflect_1d_kernel_f32<<<grid_dims, block_dims, 0, stream>>>(
-        src0->data, dst->data,
-        ne0, ne00, ne01, ne02, ne03,
-        src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
-        dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
-        p0, p1
-    );
+        src0->data, dst->data, ne0, ne00, ne01_packed, ne02, ne03, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+        dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], p0, p1);
 }

package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu CHANGED

@@ -1,26 +1,27 @@
 #include "quantize.cuh"
 #include <cstdint>
+__launch_bounds__(CUDA_QUANTIZE_BLOCK_SIZE, 1)
 static __global__ void quantize_q8_1(
         const float * __restrict__ x, void * __restrict__ vy,
         const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
-        const int64_t ne0, const int ne1, const int ne2) {
+        const int64_t ne0, const uint32_t ne1, const uint3 ne2) {
     const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
     if (i0 >= ne0) {
         return;
     }
+    const int64_t i3 = fastdiv(blockIdx.z, ne2);
+    const int64_t i2 = blockIdx.z - i3*ne2.z;
     const int64_t i1 = blockIdx.y;
-    const int64_t i2 = blockIdx.z % ne2;
-    const int64_t i3 = blockIdx.z / ne2;
     const int64_t & i00 = i0;
     const int64_t & i01 = i1;
     const int64_t & i02 = i2;
     const int64_t & i03 = i3;
-    const int64_t i_cont = ((i3*ne2 + i2) * ne1 + i1) * ne0 + i0;
+    const int64_t i_cont = ((i3*ne2.z + i2) * ne1 + i1) * ne0 + i0;
     block_q8_1 * y = (block_q8_1 *) vy;
@@ -31,10 +32,10 @@ static __global__ void quantize_q8_1(
     float amax = fabsf(xi);
     float sum = xi;
-    amax = warp_reduce_max(amax);
-    sum  = warp_reduce_sum(sum);
+    amax = warp_reduce_max<QK8_1>(amax);
+    sum  = warp_reduce_sum<QK8_1>(sum);
-    const float  d = amax / 127;
+    const float  d = amax / 127.0f;
     const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
     y[ib].qs[iqs] = q;
@@ -43,8 +44,7 @@ static __global__ void quantize_q8_1(
         return;
     }
-    reinterpret_cast<half&>(y[ib].ds.x) = d;
-    reinterpret_cast<half&>(y[ib].ds.y) = sum;
+    y[ib].ds = make_half2(d, sum);
 }
 template <mmq_q8_1_ds_layout ds_layout>
@@ -152,10 +152,12 @@ void quantize_row_q8_1_cuda(
     GGML_ASSERT(!ids);
     GGML_ASSERT(ne0 % QK8_1 == 0);
+    const uint3 ne2_fastdiv = init_fastdiv_values(ne2);
     const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
     const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv);
     GGML_UNUSED(type_src0);
 }