npm - cui-llama.rn - Versions diffs - 1.6.1 → 1.7.1 - Mend

cui-llama.rn 1.6.1 → 1.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (196)

package/cpp/ggml-cpu/ggml-cpu.cpp CHANGED Viewed

@@ -11,24 +11,26 @@
 #include <vector>
 #ifdef LM_GGML_USE_CPU_HBM
-#include "ggml-cpu-hbm.h"
+#    include "ggml-cpu-hbm.h"
 #endif
 #ifdef LM_GGML_USE_CPU_KLEIDIAI
-#include "kleidiai/kleidiai.h"
-#endif
-#if defined(__APPLE__)
-#include <sys/types.h>
-#include <sys/sysctl.h>
+#    include "kleidiai/kleidiai.h"
 #endif
 #if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-    #define NOMINMAX
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#else
+#    include <unistd.h>
 #endif
-#include <windows.h>
+#if defined(__APPLE__)
+#    include <sys/sysctl.h>
+#    include <sys/types.h>
 #endif
 // ggml-backend interface
@@ -70,8 +72,10 @@ static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_device_get_extra_buff
 }
 static bool lm_ggml_backend_cpu_is_extra_buffer_type(lm_ggml_backend_buffer_type_t buft) {
-    for (auto extra : lm_ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra && extra == buft) return true;
+    for (auto * extra : lm_ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra == buft) {
+            return true;
+        }
     }
     return false;
 }
@@ -330,9 +334,18 @@ static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_d
 }
 static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
+#ifdef _WIN32
+    MEMORYSTATUSEX status;
+    status.dwLength = sizeof(status);
+    GlobalMemoryStatusEx(&status);
+    *total = status.ullTotalPhys;
+    *free = status.ullAvailPhys;
+#else
+    long pages = sysconf(_SC_PHYS_PAGES);
+    long page_size = sysconf(_SC_PAGE_SIZE);
+    *total = pages * page_size;
+    *free = *total;
+#endif
     LM_GGML_UNUSED(dev);
 }

package/cpp/ggml-cpu/ops.cpp CHANGED Viewed

@@ -8,19 +8,6 @@
 #include <float.h>
-#if defined(_MSC_VER)
-// disable "possible loss of data" to avoid hundreds of casts
-// we should just be careful :)
-#pragma warning(disable: 4244 4267)
-// disable POSIX deprecation warnings
-// these functions are never going away, anyway
-#pragma warning(disable: 4996)
-// unreachable code because of multiple instances of code after LM_GGML_ABORT
-#pragma warning(disable: 4702)
-#endif
 // lm_ggml_compute_forward_dup
 static void lm_ggml_compute_forward_dup_same_cont(
@@ -2704,6 +2691,109 @@ static void lm_ggml_compute_forward_gelu(
     }
 }
+// lm_ggml_compute_forward_gelu_erf
+static void lm_ggml_compute_forward_gelu_erf_f32(
+        const lm_ggml_compute_params * params,
+        lm_ggml_tensor * dst) {
+    const lm_ggml_tensor * src0 = dst->src[0];
+    assert(lm_ggml_is_contiguous_1(src0));
+    assert(lm_ggml_is_contiguous_1(dst));
+    assert(lm_ggml_are_same_shape(src0, dst));
+    const int ith = params->ith;
+    const int nth = params->nth;
+    const int nc = src0->ne[0];
+    const int nr = lm_ggml_nrows(src0);
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        lm_ggml_vec_gelu_erf_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            LM_GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+static void lm_ggml_compute_forward_gelu_erf_f16(
+    const lm_ggml_compute_params * params,
+    lm_ggml_tensor * dst) {
+    const lm_ggml_tensor * src0 = dst->src[0];
+    assert(lm_ggml_is_contiguous_1(src0));
+    assert(lm_ggml_is_contiguous_1(dst));
+    assert(lm_ggml_are_same_shape(src0, dst));
+    const int ith = params->ith;
+    const int nth = params->nth;
+    const int nc = src0->ne[0];
+    const int nr = lm_ggml_nrows(src0);
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        lm_ggml_vec_gelu_erf_f16(nc,
+                (lm_ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (lm_ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const lm_ggml_fp16_t x = ((lm_ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = LM_GGML_FP16_TO_FP32(x);
+            LM_GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+static void lm_ggml_compute_forward_gelu_erf(
+        const lm_ggml_compute_params * params,
+        lm_ggml_tensor * dst) {
+    const lm_ggml_tensor * src0 = dst->src[0];
+    switch (src0->type) {
+        case LM_GGML_TYPE_F32:
+            {
+                lm_ggml_compute_forward_gelu_erf_f32(params, dst);
+            } break;
+        case LM_GGML_TYPE_F16:
+            {
+                lm_ggml_compute_forward_gelu_erf_f16(params, dst);
+            } break;
+        default:
+            {
+                LM_GGML_ABORT("fatal error");
+            }
+    }
+}
 // lm_ggml_compute_forward_gelu_quick
 static void lm_ggml_compute_forward_gelu_quick_f32(
@@ -7762,6 +7852,10 @@ void lm_ggml_compute_forward_unary(
             {
                 lm_ggml_compute_forward_gelu(params, dst);
             } break;
+        case LM_GGML_UNARY_OP_GELU_ERF:
+            {
+                lm_ggml_compute_forward_gelu_erf(params, dst);
+            } break;
         case LM_GGML_UNARY_OP_GELU_QUICK:
             {
                 lm_ggml_compute_forward_gelu_quick(params, dst);

package/cpp/ggml-cpu/vec.cpp CHANGED Viewed

@@ -2,12 +2,6 @@
 #include <cassert>
-#if defined(_MSC_VER)
-// disable "possible loss of data" to avoid hundreds of casts
-// we should just be careful :)
-#pragma warning(disable: 4244 4267)
-#endif
 // precomputed gelu table for f16 (128 KB)
 lm_ggml_fp16_t lm_ggml_table_gelu_f16[1 << 16];

package/cpp/ggml-cpu/vec.h CHANGED Viewed

@@ -428,6 +428,7 @@ inline static void lm_ggml_vec_exp_f16 (const int n, lm_ggml_fp16_t * y, const l
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
 static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;
 inline static float lm_ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -440,6 +441,14 @@ inline static void lm_ggml_vec_gelu_f16(const int n, lm_ggml_fp16_t * y, const l
     }
 }
+inline static void lm_ggml_vec_gelu_erf_f16(const int n, lm_ggml_fp16_t * y, const lm_ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float xi = LM_GGML_FP16_TO_FP32(x[i]);
+        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+        y[i] = LM_GGML_FP32_TO_FP16(res);
+    }
+}
 #ifdef LM_GGML_GELU_FP16
 inline static void lm_ggml_vec_gelu_f32(const int n, float * y, const float * x) {
     uint16_t t;
@@ -463,6 +472,13 @@ inline static void lm_ggml_vec_gelu_f32(const int n, float * y, const float * x)
 }
 #endif
+inline static void lm_ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        float xi = x[i];
+        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+    }
+}
 inline static float lm_ggml_gelu_quick_f32(float x) {
     return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
 }

package/cpp/ggml-llama-sim.metallib CHANGED Viewed

Binary file

package/cpp/ggml-llama.metallib CHANGED Viewed

Binary file

package/cpp/ggml-metal-impl.h CHANGED Viewed

@@ -207,6 +207,10 @@ typedef struct {
     float    attn_factor;
     float    beta_fast;
     float    beta_slow;
+    int32_t  sect_0;
+    int32_t  sect_1;
+    int32_t  sect_2;
+    int32_t  sect_3;
 } lm_ggml_metal_kargs_rope;
 typedef struct {
@@ -299,21 +303,42 @@ typedef struct {
 } lm_ggml_metal_kargs_mul_mv_ext;
 typedef struct {
-    int32_t  nei0;
-    int32_t  nei1;
-    uint64_t nbi1;
+    int32_t  ne10;
+    int32_t  ne11;  // n_expert_used (bcast)
+    uint64_t nb11;
+    uint64_t nb12;
+    int32_t  neh11; // n_tokens
+    uint64_t nbh11;
+    int32_t  ne20;  // n_expert_used
+    uint64_t nb21;
+} lm_ggml_metal_kargs_mul_mm_id_map0;
+typedef struct {
+    int32_t  ne20; // n_expert_used
+    int32_t  neh0;
+    int32_t  neh1;
+    uint64_t nbh1;
+    uint64_t nbh2;
+    int32_t  ne0;
+    uint64_t nb1;
+    uint64_t nb2;
+} lm_ggml_metal_kargs_mul_mm_id_map1;
+typedef struct {
     int32_t  ne00;
     int32_t  ne02;
     uint64_t nb01;
     uint64_t nb02;
-    int32_t  ne11;
-    int32_t  ne12;
-    int32_t  ne13;
-    uint64_t nb10;
-    uint64_t nb11;
-    uint64_t nb12;
-    int32_t  ne0;
-    int32_t  ne1;
+    uint64_t nb03;
+    int32_t  neh12;
+    uint64_t nbh10;
+    uint64_t nbh11;
+    uint64_t nbh12;
+    uint64_t nbh13;
+    int32_t  neh0;
+    int32_t  neh1;
+    int16_t  r2;
+    int16_t  r3;
 } lm_ggml_metal_kargs_mul_mm_id;
 typedef struct {