npm - cui-llama.rn - Versions diffs - 1.4.6 → 1.6.0 - Mend

cui-llama.rn 1.4.6 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (366) hide show

package/cpp/unicode.h CHANGED Viewed

@@ -1,66 +1,66 @@
-#pragma once
-#include <cstdint>
-#include <string>
-#include <vector>
-struct unicode_cpt_flags {  // flags describing one codepoint, held as 1-bit fields and convertible to/from a raw uint16_t
-    enum {  // masks for the eight category bits
-        UNDEFINED       = 0x0001,
-        NUMBER          = 0x0002,  // regex: \p{N}
-        LETTER          = 0x0004,  // regex: \p{L}
-        SEPARATOR       = 0x0008,  // regex: \p{Z}
-        ACCENT_MARK     = 0x0010,  // regex: \p{M}
-        PUNCTUATION     = 0x0020,  // regex: \p{P}
-        SYMBOL          = 0x0040,  // regex: \p{S}
-        CONTROL         = 0x0080,  // regex: \p{C}
-        MASK_CATEGORIES = 0x00FF,  // union of the eight category masks above
-    };
-    // codepoint type (one bit per category, declared in the same order as the enum masks)
-    uint16_t is_undefined   : 1;
-    uint16_t is_number      : 1;  // regex: \p{N}
-    uint16_t is_letter      : 1;  // regex: \p{L}
-    uint16_t is_separator   : 1;  // regex: \p{Z}
-    uint16_t is_accent_mark : 1;  // regex: \p{M}
-    uint16_t is_punctuation : 1;  // regex: \p{P}
-    uint16_t is_symbol      : 1;  // regex: \p{S}
-    uint16_t is_control     : 1;  // regex: \p{C}
-    // helper flags (not covered by MASK_CATEGORIES)
-    uint16_t is_whitespace  : 1;  // regex: \s
-    uint16_t is_lowercase   : 1;
-    uint16_t is_uppercase   : 1;
-    uint16_t is_nfd         : 1;
-    // decode from uint16: the raw value is written straight over the bit-field storage. NOTE(review): the enum masks only line up with the is_* fields if the compiler allocates bit-fields from the least significant bit (implementation-defined) - confirm for every target
-    inline unicode_cpt_flags(const uint16_t flags = 0) {
-        *reinterpret_cast<uint16_t*>(this) = flags;
-    }
-    inline uint16_t as_uint() const {  // raw 16-bit view of the object, the inverse of the constructor
-        return *reinterpret_cast<const uint16_t*>(this);
-    }
-    inline uint16_t category_flag() const {  // raw value restricted to the bits in MASK_CATEGORIES
-        return this->as_uint() & MASK_CATEGORIES;
-    }
-};
-size_t unicode_len_utf8(char src);  // declaration only; presumably the byte length of the UTF-8 sequence introduced by src - confirm against the definition
-std::string unicode_cpt_to_utf8  (uint32_t cpt);  // codepoint -> UTF-8 string (per the name; definition not visible here)
-uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);  // offset is a non-const reference, so the callee can update it - presumably advanced past the decoded sequence; confirm
-std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);  // whole-string counterpart of unicode_cpt_from_utf8 (per the name)
-std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);  // returns a new vector; the input is taken by const reference
-unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);  // flag lookup keyed by codepoint
-unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);  // flag lookup keyed by UTF-8 text; NOTE(review): how much of the string is examined is not visible here
-std::string unicode_byte_to_utf8(uint8_t byte);  // byte <-> UTF-8 mapping pair with unicode_utf8_to_byte (presumably inverses - confirm)
-uint8_t     unicode_utf8_to_byte(const std::string & utf8);
-uint32_t unicode_tolower(uint32_t cpt);  // per-codepoint lowercase mapping (per the name)
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);  // splits text using the given regex expressions; NOTE(review): how multiple expressions combine is not visible here
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+struct unicode_cpt_flags {  // flags describing one codepoint, held as 1-bit fields and convertible to/from a raw uint16_t
+    enum {  // masks for the eight category bits
+        UNDEFINED       = 0x0001,
+        NUMBER          = 0x0002,  // regex: \p{N}
+        LETTER          = 0x0004,  // regex: \p{L}
+        SEPARATOR       = 0x0008,  // regex: \p{Z}
+        ACCENT_MARK     = 0x0010,  // regex: \p{M}
+        PUNCTUATION     = 0x0020,  // regex: \p{P}
+        SYMBOL          = 0x0040,  // regex: \p{S}
+        CONTROL         = 0x0080,  // regex: \p{C}
+        MASK_CATEGORIES = 0x00FF,  // union of the eight category masks above
+    };
+    // codepoint type (one bit per category, declared in the same order as the enum masks)
+    uint16_t is_undefined   : 1;
+    uint16_t is_number      : 1;  // regex: \p{N}
+    uint16_t is_letter      : 1;  // regex: \p{L}
+    uint16_t is_separator   : 1;  // regex: \p{Z}
+    uint16_t is_accent_mark : 1;  // regex: \p{M}
+    uint16_t is_punctuation : 1;  // regex: \p{P}
+    uint16_t is_symbol      : 1;  // regex: \p{S}
+    uint16_t is_control     : 1;  // regex: \p{C}
+    // helper flags (not covered by MASK_CATEGORIES)
+    uint16_t is_whitespace  : 1;  // regex: \s
+    uint16_t is_lowercase   : 1;
+    uint16_t is_uppercase   : 1;
+    uint16_t is_nfd         : 1;
+    // decode from uint16: the raw value is written straight over the bit-field storage. NOTE(review): the enum masks only line up with the is_* fields if the compiler allocates bit-fields from the least significant bit (implementation-defined) - confirm for every target
+    inline unicode_cpt_flags(const uint16_t flags = 0) {
+        *reinterpret_cast<uint16_t*>(this) = flags;
+    }
+    inline uint16_t as_uint() const {  // raw 16-bit view of the object, the inverse of the constructor
+        return *reinterpret_cast<const uint16_t*>(this);
+    }
+    inline uint16_t category_flag() const {  // raw value restricted to the bits in MASK_CATEGORIES
+        return this->as_uint() & MASK_CATEGORIES;
+    }
+};
+size_t unicode_len_utf8(char src);  // declaration only; presumably the byte length of the UTF-8 sequence introduced by src - confirm against the definition
+std::string unicode_cpt_to_utf8  (uint32_t cpt);  // codepoint -> UTF-8 string (per the name; definition not visible here)
+uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);  // offset is a non-const reference, so the callee can update it - presumably advanced past the decoded sequence; confirm
+std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);  // whole-string counterpart of unicode_cpt_from_utf8 (per the name)
+std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);  // returns a new vector; the input is taken by const reference
+unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);  // flag lookup keyed by codepoint
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);  // flag lookup keyed by UTF-8 text; NOTE(review): how much of the string is examined is not visible here
+std::string unicode_byte_to_utf8(uint8_t byte);  // byte <-> UTF-8 mapping pair with unicode_utf8_to_byte (presumably inverses - confirm)
+uint8_t     unicode_utf8_to_byte(const std::string & utf8);
+uint32_t unicode_tolower(uint32_t cpt);  // per-codepoint lowercase mapping (per the name)
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);  // splits text using the given regex expressions; NOTE(review): how multiple expressions combine is not visible here

package/cpp/vec.cpp ADDED Viewed

@@ -0,0 +1,258 @@
+#include "vec.h"
+#include <cassert>
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+// precomputed gelu table for f16 (128 KB): 1 << 16 entries, presumably one per possible 16-bit fp16 bit pattern - TODO confirm how it is indexed
+lm_ggml_fp16_t lm_ggml_table_gelu_f16[1 << 16];  // storage only; nothing in the visible part of this file fills it
+// precomputed quick gelu table for f16 (128 KB): same size as the gelu table above
+lm_ggml_fp16_t lm_ggml_table_gelu_quick_f16[1 << 16];  // storage only; nothing in the visible part of this file fills it
+void lm_ggml_vec_dot_f32(int n, float * LM_GGML_RESTRICT s, size_t bs, const float * LM_GGML_RESTRICT x, size_t bx, const float * LM_GGML_RESTRICT y, size_t by, int nrc) {  // writes the dot product of x[0..n) and y[0..n) to *s; bs, bx, by and nrc are otherwise unused
+   assert(nrc == 1);  // only nrc == 1 is accepted by this implementation
+   LM_GGML_UNUSED(nrc);
+   LM_GGML_UNUSED(bx);
+   LM_GGML_UNUSED(by);
+   LM_GGML_UNUSED(bs);
+#if defined(LM_GGML_SIMD)
+    float sumf = 0.0f;  // the SIMD path accumulates in float; the scalar path below uses lm_ggml_float
+    const int np = (n & ~(LM_GGML_F32_STEP - 1));  // n rounded down to a multiple of LM_GGML_F32_STEP (the mask assumes the step is a power of two)
+    LM_GGML_F32_VEC sum[LM_GGML_F32_ARR] = { LM_GGML_F32_VEC_ZERO };  // one vector accumulator per unrolled slot j
+    LM_GGML_F32_VEC ax[LM_GGML_F32_ARR];
+    LM_GGML_F32_VEC ay[LM_GGML_F32_ARR];
+    for (int i = 0; i < np; i += LM_GGML_F32_STEP) {  // each iteration consumes LM_GGML_F32_STEP elements
+        for (int j = 0; j < LM_GGML_F32_ARR; j++) {  // slot j covers the LM_GGML_F32_EPR elements starting at i + j*LM_GGML_F32_EPR
+            ax[j] = LM_GGML_F32_VEC_LOAD(x + i + j*LM_GGML_F32_EPR);
+            ay[j] = LM_GGML_F32_VEC_LOAD(y + i + j*LM_GGML_F32_EPR);
+            sum[j] = LM_GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);  // presumably sum[j] + ax[j]*ay[j]; the macro is defined outside this file
+        }
+    }
+    // fold the per-slot vector accumulators into the scalar sumf
+    LM_GGML_F32_VEC_REDUCE(sumf, sum);
+    // leftovers: elements np..n-1 that did not fill a whole SIMD step
+    for (int i = np; i < n; ++i) {
+        sumf += x[i]*y[i];
+    }
+#else
+    // scalar fallback when LM_GGML_SIMD is not defined
+    lm_ggml_float sumf = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sumf += (lm_ggml_float)(x[i]*y[i]);  // the product is formed in float, then converted for accumulation
+    }
+#endif
+    *s = sumf;  // stored as float regardless of the accumulator type
+}
+void lm_ggml_vec_dot_bf16(int n, float * LM_GGML_RESTRICT s, size_t bs, lm_ggml_bf16_t * LM_GGML_RESTRICT x, size_t bx, lm_ggml_bf16_t * LM_GGML_RESTRICT y, size_t by, int nrc) {  // writes the dot product of the bf16 vectors x[0..n) and y[0..n) to *s; bs, bx, by and nrc are otherwise unused
+    assert(nrc == 1);  // only nrc == 1 is accepted by this implementation
+    LM_GGML_UNUSED(nrc);
+    LM_GGML_UNUSED(bx);
+    LM_GGML_UNUSED(by);
+    LM_GGML_UNUSED(bs);
+    int i = 0;  // shared index: whichever SIMD branch is compiled in advances it, and the scalar loop at the end finishes from there
+    lm_ggml_float sumf = 0;
+#if defined(__AVX512BF16__)
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 64 <= n; i += 64) {  // 64 elements per iteration: offsets i and i + 32, one accumulator each
+        c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
+                             m512bh(_mm512_loadu_si512((y + i))));
+        c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
+                             m512bh(_mm512_loadu_si512((y + i + 32))));
+    }
+    sumf += (lm_ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (lm_ggml_float)_mm512_reduce_add_ps(c2);
+#elif defined(__AVX512F__)
+#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 32 <= n; i += 32) {  // LOAD zero-extends 16 bf16 values to 32-bit lanes and shifts them left by 16, which yields their float bit patterns
+        c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
+    }
+    sumf += (lm_ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (lm_ggml_float)_mm512_reduce_add_ps(c2);
+#undef LOAD
+#elif defined(__AVX2__) || defined(__AVX__)
+#if defined(__AVX2__)
+#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+#else
+#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
+#endif
+    __m256 c1 = _mm256_setzero_ps();  // four independent accumulators, one per 8-element group of a 32-element iteration
+    __m256 c2 = _mm256_setzero_ps();
+    __m256 c3 = _mm256_setzero_ps();
+    __m256 c4 = _mm256_setzero_ps();
+    for (; i + 32 <= n; i += 32) {  // here LOAD widens 8 bf16 values per call (the AVX-only variant builds the 256-bit result from two 128-bit halves)
+        c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
+        c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
+        c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
+    }
+    __m128 g;
+    c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),  // combine the four accumulators into c1
+                       _mm256_add_ps(c2, c4));
+    g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),  // upper 128-bit half + lower half
+                   _mm256_castps256_ps128(c1));
+    g = _mm_add_ps(g, _mm_movehl_ps(g, g));  // lanes 2,3 added onto lanes 0,1
+    g = _mm_add_ss(g, _mm_movehdup_ps(g));  // lane 1 added onto lane 0, which now holds the full sum
+    sumf += (lm_ggml_float)_mm_cvtss_f32(g);
+#undef LOAD
+#endif
+    for (; i < n; ++i) {  // scalar tail; handles every element when none of the SIMD branches above is compiled
+        sumf += (lm_ggml_float)(LM_GGML_BF16_TO_FP32(x[i]) *
+                             LM_GGML_BF16_TO_FP32(y[i]));
+    }
+    *s = sumf;  // converted to float on store
+}
+void lm_ggml_vec_dot_f16(int n, float * LM_GGML_RESTRICT s, size_t bs, lm_ggml_fp16_t * LM_GGML_RESTRICT x, size_t bx, lm_ggml_fp16_t * LM_GGML_RESTRICT y, size_t by, int nrc) {  // writes the dot product of the fp16 vectors x[0..n) and y[0..n) to *s; bs, bx, by and nrc are otherwise unused
+    assert(nrc == 1);  // only nrc == 1 is accepted by this implementation
+    LM_GGML_UNUSED(nrc);
+    LM_GGML_UNUSED(bx);
+    LM_GGML_UNUSED(by);
+    LM_GGML_UNUSED(bs);
+    lm_ggml_float sumf = 0.0;  // shared by the SIMD and scalar paths
+#if defined(LM_GGML_SIMD)
+    const int np = (n & ~(LM_GGML_F16_STEP - 1));  // n rounded down to a multiple of LM_GGML_F16_STEP (the mask assumes the step is a power of two)
+    LM_GGML_F16_VEC sum[LM_GGML_F16_ARR] = { LM_GGML_F16_VEC_ZERO };  // one vector accumulator per unrolled slot j
+    LM_GGML_F16_VEC ax[LM_GGML_F16_ARR];
+    LM_GGML_F16_VEC ay[LM_GGML_F16_ARR];
+    for (int i = 0; i < np; i += LM_GGML_F16_STEP) {  // each iteration consumes LM_GGML_F16_STEP elements
+        for (int j = 0; j < LM_GGML_F16_ARR; j++) {
+            ax[j] = LM_GGML_F16_VEC_LOAD(x + i + j*LM_GGML_F16_EPR, j);  // the load macro also receives the slot index j; its use is defined outside this file
+            ay[j] = LM_GGML_F16_VEC_LOAD(y + i + j*LM_GGML_F16_EPR, j);
+            sum[j] = LM_GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);  // presumably sum[j] + ax[j]*ay[j]; the macro is defined outside this file
+        }
+    }
+    // fold the per-slot vector accumulators into the scalar sumf
+    LM_GGML_F16_VEC_REDUCE(sumf, sum);
+    // leftovers: elements np..n-1 that did not fill a whole SIMD step
+    for (int i = np; i < n; ++i) {
+        sumf += (lm_ggml_float)(LM_GGML_FP16_TO_FP32(x[i])*LM_GGML_FP16_TO_FP32(y[i]));
+    }
+#else
+    for (int i = 0; i < n; ++i) {  // scalar fallback: convert each pair to float, multiply, accumulate
+        sumf += (lm_ggml_float)(LM_GGML_FP16_TO_FP32(x[i])*LM_GGML_FP16_TO_FP32(y[i]));
+    }
+#endif
+    *s = sumf;  // converted to float on store
+}
+void lm_ggml_vec_silu_f32(const int n, float * y, const float * x) {  // applies SiLU element-wise: y[i] is computed from x[i] for i in [0, n)
+    int i = 0;  // shared index: the compiled SIMD branch (if any) advances it, and the scalar loop finishes from there
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {  // 16 floats per iteration, unaligned load/store
+        _mm512_storeu_ps(y + i, lm_ggml_v_silu(_mm512_loadu_ps(x + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {  // 8 floats per iteration
+        _mm256_storeu_ps(y + i, lm_ggml_v_silu(_mm256_loadu_ps(x + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {  // 4 floats per iteration
+        _mm_storeu_ps(y + i, lm_ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {  // 4 floats per iteration
+        vst1q_f32(y + i, lm_ggml_v_silu(vld1q_f32(x + i)));
+    }
+#endif
+    for (; i < n; ++i) {  // scalar tail; handles every element when no SIMD branch is compiled
+        y[i] = lm_ggml_silu_f32(x[i]);
+    }
+}
+lm_ggml_float lm_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {  // stores exp(x[i] - max) in y[i] for i in [0, n) and returns the sum of the stored values; no division by the sum happens here
+    int i = 0;  // shared index: the compiled SIMD branch (if any) advances it, and the scalar loop finishes from there
+    lm_ggml_float sum = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {  // 16 floats per iteration
+        __m512 val = lm_ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                               _mm512_set1_ps(max)));
+        _mm512_storeu_ps(y + i, val);
+        sum += (lm_ggml_float)_mm512_reduce_add_ps(val);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {  // 8 floats per iteration
+        __m256 val = lm_ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                               _mm256_set1_ps(max)));
+        _mm256_storeu_ps(y + i, val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),  // horizontal add: upper 128-bit half + lower half
+                                 _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));  // lanes 2,3 added onto lanes 0,1
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));  // lane 1 added onto lane 0
+        sum += (lm_ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {  // 4 floats per iteration
+        __m128 val = lm_ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+                                            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));  // alternative horizontal add that avoids _mm_movehdup_ps: swap adjacent lanes first
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif
+        sum += (lm_ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {  // 4 floats per iteration
+        float32x4_t val = lm_ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+                                                vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += (lm_ggml_float)vaddvq_f32(val);
+    }
+#endif
+    for (; i < n; ++i) {  // scalar tail; handles every element when no SIMD branch is compiled
+        float val = expf(x[i] - max);
+        sum += (lm_ggml_float)val;
+        y[i] = val;
+    }
+    return sum;
+}
+lm_ggml_float lm_ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {  // stores x[i] - max in y[i] for i in [0, n) and returns log(sum of exp(x[i] - max)); the subtraction of that log from y is not done here
+    // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_sum)
+    int i = 0;
+    lm_ggml_float sum = 0;
+    for (; i < n; ++i) {
+        float val = x[i] - max;  // the (logit_i - max) term
+        y[i] = val;
+        sum += (lm_ggml_float)expf(val);  // accumulates soft_max_sum
+    }
+    return sum = (lm_ggml_float)logf(sum);  // logf takes a float, so sum is converted to float before the log is taken
+}