llama_cpp 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +106 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -23
- data/ext/llama_cpp/src/ggml-metal.m +35 -11
- data/ext/llama_cpp/src/ggml-metal.metal +145 -92
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +25 -53
- data/ext/llama_cpp/src/k_quants.c +45 -12
- data/ext/llama_cpp/src/llama.cpp +146 -70
- data/ext/llama_cpp/src/llama.h +3 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
```diff
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }

-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
+    // the loop, and when that is done, buf[0] has the correct (synchronized) value
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}

-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);

     const float max = buf[0];

     // parallel sum
     buf[tpitg[0]] = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] += exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        buf[tpitg[0]] += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // whish to compute it twice.
+        pdst[i00] = exp_psrc0;
     }

     // reduce
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }

-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    // broadcast - not needed, see above
+    //// broadcast
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}

-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);

     const float sum = buf[0];

     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        pdst[i00] = exp(psrc0[i00] - max) / sum;
+        pdst[i00] /= sum;
     }
 }

```
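In plain terms, the soft_max change caches `exp(psrc0[i00] - max)` in the destination buffer while the sum is being accumulated, so the expensive `exp` runs once per element, and it drops a broadcast-plus-barrier pair that the last iteration of the reduction loop already makes redundant. A minimal scalar C sketch of the same computation (an illustration, not the Metal kernel itself):

```c
#include <math.h>

// Numerically stable softmax with the exp() results cached in dst,
// mirroring the structure of the updated kernel_soft_max.
static void soft_max_ref(const float *src, float *dst, int n) {
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        if (src[i] > max) max = src[i];
    }

    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        const float e = expf(src[i] - max); // computed once ...
        dst[i] = e;                         // ... and remembered in dst
        sum += e;
    }

    for (int i = 0; i < n; ++i) {
        dst[i] /= sum;                      // normalize in place
    }
}
```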
```diff
@@ -214,25 +220,17 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean = sum[0];
+    const float mean = sum[0] / ne00;

-    // recenter
+    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-    }
-
-    // VARIANCE
-    // parallel sum
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
         sum[tpitg] += y[i00] * y[i00];
     }
+
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {
@@ -241,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;

     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -254,7 +247,6 @@ kernel void kernel_norm(
     }
 }

-
 kernel void kernel_rms_norm(
         device const void * src0,
         device float * dst,
```
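The kernel_norm change folds the division by `ne00` into the mean and variance reads (removing two single-thread divisions and the barriers around them) and fuses the recentering loop with the variance accumulation. A single-threaded C sketch of the resulting structure (illustrative reference, not the kernel):

```c
#include <math.h>

// Layer-norm style normalization: mean, then recenter while accumulating
// the variance in the same pass, then scale by 1/sqrt(var + eps).
static void norm_ref(const float *x, float *y, int n, float eps) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += x[i];
    const float mean = sum / n;

    float var = 0.0f;
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] - mean;   // recenter ...
        var += y[i] * y[i];   // ... and accumulate variance in one pass
    }
    var /= n;

    const float scale = 1.0f / sqrtf(var + eps);
    for (int i = 0; i < n; ++i) {
        y[i] *= scale;
    }
}
```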
```diff
@@ -435,6 +427,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }

+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
         device const void * src0,
         device const float * src1,
@@ -463,30 +457,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;

-    float yl[16];
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};

-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg/4;
+    const int il = tiisg%4;

-    device const float * yb = y + ix * QK8_0 + 16*il;
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;

-    // each thread in a SIMD group deals with 16 quants at a time
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        for (int i = 0; i < 16; ++i) {
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
             yl[i] = yb[i];
         }

         for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + 16*il;
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
             float sumq = 0.f;
-            for (int iq = 0; iq < 16; ++iq) {
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                 sumq += qs[iq] * yl[iq];
             }
             sumf[row] += sumq*x[ib+row*nb].d;
         }

-        yb += 16 * nw;
+        yb += NB_Q8_0 * nw;
     }

     for (int row = 0; row < nr; ++row) {
```
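With `NB_Q8_0` set to 8, each of the 32 SIMD lanes loads 8 activations and 8 quants per step, so four lanes (`il` = 0..3) cover one 32-quant q8_0 block. For reference, the math being accelerated is a plain scaled integer dot product; a C sketch follows (the scale `d` is a half-precision value in ggml's real `block_q8_0`, a float here to keep the sketch self-contained):

```c
#include <stdint.h>

#define QK8_0 32

// Simplified q8_0 block: 32 signed 8-bit quants plus one per-block scale.
typedef struct {
    float  d;
    int8_t qs[QK8_0];
} block_q8_0_ref;

// Dot product of a quantized row against a float vector: integer products are
// summed per block and the scale is applied once, exactly like
// `sumf[row] += sumq*x[ib+row*nb].d` in the kernel.
static float vec_dot_q8_0_ref(const block_q8_0_ref *x, const float *y, int nblocks) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nblocks; ++ib) {
        float sumq = 0.0f;
        for (int i = 0; i < QK8_0; ++i) {
            sumq += (float) x[ib].qs[i] * y[ib*QK8_0 + i];
        }
        sumf += sumq * x[ib].d;
    }
    return sumf;
}
```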
```diff
@@ -497,7 +491,7 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }

-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
         device float * dst,
@@ -515,11 +509,8 @@ kernel void kernel_mul_mat_f16_f32(
         constant uint64_t & nb12,
         constant int64_t & ne0,
         constant int64_t & ne1,
-        threadgroup float * sum [[threadgroup(0)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpig[[thread_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]]) {

     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -528,42 +519,101 @@ kernel void kernel_mul_mat_f16_f32(
     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);

+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }

+}

+#define N_F16_F32 4

+kernel void kernel_mul_mat_f16_f32(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {

+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
 }

 kernel void kernel_alibi_f32(
```
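The f16 x f32 matrix-vector path is reworked here: the old kernel accumulated partial products into a `threadgroup float * sum` array (`sum[ith] += (float) x[i] * (float) y[i];`) and reduced it with barriers, while the new `kernel_mul_mat_f16_f32_1row` and `kernel_mul_mat_f16_f32` keep a per-lane `sumf` and combine lanes with `simd_sum`, processing `N_F16_F32` (4) rows of src1 per threadgroup and switching to a `half4`/`float4` path once a row has at least 128 elements. A C sketch of the per-row dot-product strategy, including the scalar tail for lengths not divisible by 4 (illustration only):

```c
// Four-wide dot product with a scalar tail, mirroring the kernel's
// half4/float4 loop plus the `for (i = 4*(ne00/4); i < ne00; ++i)` cleanup.
// x stands in for the f16 row, promoted to float for the sketch.
static float dot_f16_f32_ref(const float *x, const float *y, int n) {
    float sum = 0.0f;
    const int n4 = n / 4;

    for (int i = 0; i < n4; ++i) {
        for (int k = 0; k < 4; ++k) {
            sum += x[4*i + k] * y[4*i + k];   // vectorized body (half4 * float4 on the GPU)
        }
    }
    for (int i = 4*n4; i < n; ++i) {
        sum += x[i] * y[i];                   // tail elements
    }
    return sum;
}
```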
```diff
@@ -632,25 +682,27 @@ kernel void kernel_rope(
         constant int & mode,
         constant float & freq_base,
         constant float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];

     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);

     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;

     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);

-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

@@ -662,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);

-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;

                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
```
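Because kernel_rope is now parallelized across the threads of a threadgroup, the rotation angle can no longer be carried from one iteration to the next; it is computed in closed form as `theta_0 * pow(freq_base, inv_ndims*i0)`, which produces the same angles as the old `theta *= theta_scale` recurrence. A small C program sketching the equivalence (the values of `p`, `freq_base`, `freq_scale`, `n_dims` below are illustrative):

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    const float p = 7.0f, freq_base = 10000.0f, freq_scale = 1.0f;
    const int   n_dims = 128;

    const float theta_scale = powf(freq_base, -2.0f/n_dims); // old per-step factor
    const float theta_0     = freq_scale * (float) p;
    const float inv_ndims   = -1.0f/n_dims;

    float theta_iter = theta_0;
    for (int i0 = 0; i0 < n_dims; i0 += 2) {
        // closed form used by the parallel kernel
        const float theta_direct = theta_0 * powf(freq_base, inv_ndims*i0);
        printf("i0=%3d  iterative=%g  direct=%g\n", i0, theta_iter, theta_direct);
        theta_iter *= theta_scale; // old sequential recurrence
    }
    return 0;
}
```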
```diff
@@ -1262,7 +1314,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
```
data/ext/llama_cpp/src/ggml-opencl.cpp CHANGED
```diff
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }

-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }

@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;

     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst


@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->data;
+            d_Q = (cl_mem) src0->extra;
         } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {

     CL_CHECK(clFinish(queue));

-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
```
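The ggml-opencl.cpp changes are mechanical: the backend now stores its `cl_mem` handle in the tensor's backend-specific `extra` field rather than overloading `data`. A hedged sketch of the resulting convention (the struct below is a stand-in, not ggml's `ggml_tensor`):

```c
#include <CL/cl.h>
#include <stdbool.h>

// Stand-in tensor: `data` stays a host pointer, `extra` carries the
// backend-specific handle (a cl_mem for GPU-backed tensors).
typedef struct {
    void *data;
    void *extra;
    bool  on_gpu;
} toy_tensor;

static cl_mem toy_tensor_cl_mem(const toy_tensor *t) {
    return t->on_gpu ? (cl_mem) t->extra : NULL;
}
```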
data/ext/llama_cpp/src/ggml.c CHANGED
```diff
@@ -103,6 +103,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>

+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
 #endif

 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -192,9 +195,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -215,8 +224,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
 #define GGML_ALIGNED_FREE(ptr) free(ptr)
 #endif
+#endif

 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
```
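The `GGML_USE_CPU_HBM` additions route ggml's aligned allocations through the memkind `hbwmalloc` API, so buffers come from high-bandwidth memory and must be released with `hbw_free` rather than `free()`, which is why a dedicated `GGML_ALIGNED_FREE` branch is added. A minimal usage sketch of that allocator pairing (assumes the memkind library is installed):

```c
#include <stdio.h>
#include <hbwmalloc.h>   // memkind's high-bandwidth-memory allocator

int main(void) {
    void *buf = NULL;
    // 16-byte alignment, matching the value used in the diff above
    int err = hbw_posix_memalign(&buf, 16, 1024);
    if (err != 0 || buf == NULL) {
        fprintf(stderr, "high-bandwidth memory not available (err=%d)\n", err);
        return 1;
    }
    /* ... use buf ... */
    hbw_free(buf);       // must pair with the hbw allocator, not free()
    return 0;
}
```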
```diff
@@ -817,46 +830,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128

 #if !defined(__aarch64__)

-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
-        (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
-        (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
-        (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
-        (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
-        (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
-        (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
-        (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
-        (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +838,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }

-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -4612,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }

+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
     const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

     *ctx = (struct ggml_context) {
```
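After this change, `ggml_init` accepts a zero `mem_size` and silently bumps it to `GGML_MEM_ALIGN` instead of leaving the caller to special-case it. A minimal usage sketch (illustrative only; real code would normally size the pool explicitly):

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 0,      // now valid: bumped to GGML_MEM_ALIGN internally
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);
    if (ctx != NULL) {
        ggml_free(ctx);
    }
    return 0;
}
```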
```diff
@@ -4814,7 +4786,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(

     size_t obj_alloc_size = 0;

-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5515,7 +5487,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }

     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5558,7 +5530,7 @@ static struct ggml_tensor * ggml_div_impl(
     }

     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -20003,7 +19975,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

         struct ggml_tensor * data = NULL;

-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
             data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

             ok = ok && data != NULL;
@@ -20044,7 +20016,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }

         // point the data member to the appropriate location in the binary blob using the tensor infos
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
             //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
             cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
         }
```
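The last two hunks are purely stylistic (`== false` becomes `!`), but the `no_alloc` flag they touch is what decides whether `gguf_init_from_file` also allocates the binary blob as a tensor or only sets up tensor metadata. A hedged usage sketch (the filename is a placeholder):

```c
#include "ggml.h"   // the gguf_* declarations live here in this version

int main(void) {
    struct ggml_context * meta = NULL;
    struct gguf_init_params params = {
        /*.no_alloc =*/ false,   // false: also allocate the data blob
        /*.ctx      =*/ &meta,
    };

    struct gguf_context * gctx = gguf_init_from_file("model.gguf", params);
    if (gctx != NULL) {
        gguf_free(gctx);
    }
    if (meta != NULL) {
        ggml_free(meta);
    }
    return 0;
}
```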