npm - cui-llama.rn - Versions diffs - 1.0.7 → 1.0.9 - Mend

cui-llama.rn 1.0.7 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/cpp/ggml-quants.h CHANGED Viewed

@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
+#if defined(__ARM_FEATURE_SVE)
+extern int lm_ggml_sve_cnt_b;
+#endif
 #ifdef __cplusplus
 }
 #endif

package/cpp/ggml.c CHANGED Viewed

@@ -37,6 +37,9 @@
 #include <unistd.h>
 #endif
+#if defined(__ARM_FEATURE_SVE)
+int lm_ggml_sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef LM_GGML_USE_LLAMAFILE
 #endif
@@ -53,6 +56,9 @@
 // disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
+// unreachable code because of multiple instances of code after LM_GGML_ABORT
+#pragma warning(disable: 4702)
 #endif
 #if defined(_WIN32)
@@ -185,7 +191,7 @@ static void lm_ggml_print_backtrace_symbols(void) {
         fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
     }
 }
-#elif defined(__linux__)
+#elif defined(__linux__) && defined(__GLIBC__)
 #include <execinfo.h>
 static void lm_ggml_print_backtrace_symbols(void) {
     // void * trace[100];
@@ -480,9 +486,16 @@ void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) {
     }
 }
+void lm_ggml_fp32_to_bf16_row_ref(const float * x, lm_ggml_bf16_t * y, int64_t n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = lm_ggml_compute_fp32_to_bf16(x[i]);
+    }
+}
 void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) {
   int i = 0;
 #if defined(__AVX512BF16__)
+  // subnormals are flushed to zero on this platform
   for (; i + 32 <= n; i += 32) {
         _mm512_storeu_si512(
             (__m512i *)(y + i),
@@ -962,7 +975,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = {
         .is_quantized             = false,
         .to_float                 = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
         .from_float               = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
-        .from_float_ref           = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
+        .from_float_ref           = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
         .vec_dot                  = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
         .vec_dot_type             = LM_GGML_TYPE_BF16,
         .nrows                    = 1,
@@ -2302,7 +2315,7 @@ inline static void lm_ggml_vec_abs_f32  (const int n, float * y, const float * x
 inline static void lm_ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
-inline static void lm_ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void lm_ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
@@ -3551,6 +3564,12 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {
     LM_GGML_ASSERT_ALIGNED(ctx->mem_buffer);
+#if defined(__ARM_FEATURE_SVE)
+    if (!lm_ggml_sve_cnt_b) {
+        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
     LM_GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
     lm_ggml_critical_section_end();
@@ -3705,7 +3724,8 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl(
         struct lm_ggml_tensor  * view_src,
         size_t                view_offs) {
-    assert(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
+    LM_GGML_ASSERT(type >= 0 && type < LM_GGML_TYPE_COUNT);
+    LM_GGML_ASSERT(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
     // find the base tensor and absolute offset
     if (view_src != NULL && view_src->view_src != NULL) {
@@ -5358,6 +5378,7 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
     int n_groups,
+    float eps,
     bool inplace) {
     bool is_node = false;
@@ -5368,7 +5389,8 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
     struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
-    result->op_params[0] = n_groups;
+    lm_ggml_set_op_params_i32(result, 0, n_groups);
+    lm_ggml_set_op_params_f32(result, 1, eps);
     result->op = LM_GGML_OP_GROUP_NORM;
     result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
@@ -5380,15 +5402,17 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 struct lm_ggml_tensor * lm_ggml_group_norm(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, false);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, false);
 }
 struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, true);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 // lm_ggml_mul_mat
@@ -12079,10 +12103,11 @@ static void lm_ggml_compute_forward_group_norm_f32(
     LM_GGML_TENSOR_UNARY_OP_LOCALS
-    const float eps = 1e-6f; // TODO: make this a parameter
     // TODO: optimize
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -20650,7 +20675,7 @@ size_t lm_ggml_quantize_chunk(
         case LM_GGML_TYPE_BF16:
             {
                 size_t elemsize = sizeof(lm_ggml_bf16_t);
-                lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n);
+                lm_ggml_fp32_to_bf16_row_ref(src + start, (lm_ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case LM_GGML_TYPE_F32:

package/cpp/ggml.h CHANGED Viewed

@@ -349,6 +349,7 @@ extern "C" {
     LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float);
     LM_GGML_API float       lm_ggml_bf16_to_fp32(lm_ggml_bf16_t);  // consider just doing << 16
     LM_GGML_API void        lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t);
+    LM_GGML_API void        lm_ggml_fp32_to_bf16_row_ref(const float *, lm_ggml_bf16_t *, int64_t);
     LM_GGML_API void        lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t);
     struct lm_ggml_object;
@@ -1139,16 +1140,17 @@ extern "C" {
     // group normalize along ne0*ne1*n_groups
     // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
-            int                   n_groups);
+            int                   n_groups,
+            float                 eps);
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
-            int                   n_groups);
+            int                   n_groups,
+            float                 eps);
     // a - x
     // b - dy
@@ -1455,7 +1457,6 @@ extern "C" {
     // if mode & 2 == 1, GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
-    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
@@ -1472,6 +1473,7 @@ extern "C" {
             int                   mode);
     // custom RoPE
+    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,

package/cpp/llama-impl.h CHANGED Viewed

@@ -24,3 +24,18 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
 #define LLAMA_LOG_INFO(...)  llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+//
+// helpers
+//
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}

package/cpp/llama-vocab.cpp CHANGED Viewed

@@ -16,20 +16,6 @@
 // helpers
 //
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;
@@ -816,6 +802,9 @@ struct llm_tokenizer_ugm {
      * the best tokenization.
     */
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
         // normalize the input first
         std::string normalized;
         normalize(text, &normalized);
@@ -895,7 +884,7 @@ struct llm_tokenizer_ugm {
         }
         // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
     }
 private:
@@ -1444,7 +1433,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }
@@ -1500,6 +1490,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
 int32_t llama_tokenize_impl(
     const struct llama_vocab & vocab,
                   const char * text,

package/cpp/llama-vocab.h CHANGED Viewed

@@ -45,6 +45,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;
     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,