cui-llama.rn 1.0.6 → 1.0.9

package/cpp/ggml-metal.m CHANGED
@@ -210,7 +210,7 @@ enum lm_ggml_metal_kernel_type {
     LM_GGML_METAL_KERNEL_TYPE_COUNT
 };
 
-struct lm_ggml_metal_context {
+struct lm_ggml_backend_metal_context {
     int n_cb;
 
     id<MTLDevice> device;
@@ -224,6 +224,10 @@ struct lm_ggml_metal_context {
     bool support_simdgroup_mm;
 
     bool should_capture_next_compute;
+
+    // abort lm_ggml_metal_graph_compute if callback returns true
+    lm_ggml_abort_callback abort_callback;
+    void * abort_callback_data;
 };
 
 // MSL code
@@ -289,7 +293,7 @@ static void * lm_ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
-static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
+static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG
@@ -306,7 +310,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
-    struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context));
+    struct lm_ggml_backend_metal_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_context));
     ctx->device = device;
     ctx->n_cb   = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
     ctx->queue  = [ctx->device newCommandQueue];
@@ -668,7 +672,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     return ctx;
 }
 
-static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) {
+static void lm_ggml_metal_free(struct lm_ggml_backend_metal_context * ctx) {
     LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@@ -734,7 +738,7 @@ static id<MTLBuffer> lm_ggml_metal_get_buffer(struct lm_ggml_tensor * t, size_t
     return nil;
 }
 
-static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, const struct lm_ggml_tensor * op) {
+static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context * ctx, const struct lm_ggml_tensor * op) {
     for (size_t i = 0, n = 3; i < n; ++i) {
         if (op->src[i] != NULL && op->src[i]->type == LM_GGML_TYPE_BF16) {
             return false;
@@ -845,7 +849,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx,
 }
 
 static enum lm_ggml_status lm_ggml_metal_graph_compute(
-        struct lm_ggml_metal_context * ctx,
+        struct lm_ggml_backend_metal_context * ctx,
         struct lm_ggml_cgraph * gf) {
 
     @autoreleasepool {
@@ -878,8 +882,11 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
         id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
         command_buffer_builder[cb_idx] = command_buffer;
 
-        // enqueue the command buffers in order to specify their execution order
-        [command_buffer enqueue];
+        // always enqueue the first two command buffers
+        // enqueue all of the command buffers if we don't need to abort
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer enqueue];
+        }
     }
 
     const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@@ -2229,10 +2236,8 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                 LM_GGML_ASSERT(ne00 % 4 == 0);
                 LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
 
-                //float eps;
-                //memcpy(&eps, dst->op_params, sizeof(float));
-
-                const float eps = 1e-6f; // TODO: temporarily hardcoded
+                float eps;
+                memcpy(&eps, dst->op_params + 1, sizeof(float));
 
                 const int32_t n_groups = ((int32_t *) dst->op_params)[0];
 
@@ -2829,7 +2834,9 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
         [encoder endEncoding];
 
-        [command_buffer commit];
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer commit];
+        }
     });
 
     // Wait for completion and check status of each command buffer
@@ -2849,6 +2856,23 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
             return LM_GGML_STATUS_FAILED;
         }
+
+        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+        if (!next_buffer) {
+            continue;
+        }
+
+        bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+        if (next_queued) {
+            continue;
+        }
+
+        if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+            LM_GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+            return LM_GGML_STATUS_ABORTED;
+        }
+
+        [next_buffer commit];
     }
 
     if (should_capture) {
@@ -3152,7 +3176,7 @@ LM_GGML_CALL static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t ba
 }
 
 LM_GGML_CALL static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) {
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     lm_ggml_metal_free(ctx);
     free(backend);
 }
@@ -3164,13 +3188,13 @@ LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_defa
 }
 
 LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
-    struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
 LM_GGML_CALL static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
-    struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_supports_op(metal_ctx, op);
 }
@@ -3215,9 +3239,9 @@ static lm_ggml_guid_t lm_ggml_backend_metal_guid(void) {
 }
 
 lm_ggml_backend_t lm_ggml_backend_metal_init(void) {
-    struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
-
+    struct lm_ggml_backend_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
     if (ctx == NULL) {
+        LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return NULL;
     }
 
@@ -3239,15 +3263,24 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) {
 void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
 }
 
+void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data) {
+    LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
+
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
+
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = user_data;
+}
+
 bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
@@ -3255,7 +3288,7 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family
 void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     ctx->should_capture_next_compute = true;
 }
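Note: the abort hook is the user-facing piece of the Metal changes above. Only the first two command buffers are enqueued/committed eagerly; each later one is committed only after the callback declines to abort. Below is a minimal sketch of how a host application might wire it up, assuming the `bool (*)(void * data)` shape of `lm_ggml_abort_callback` from ggml.h; the flag and helper names are hypothetical, and the ggml/ggml-metal headers from this package are assumed to be on the include path.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static atomic_bool g_cancel_requested; // set from another thread to stop compute

// returning true makes lm_ggml_metal_graph_compute stop committing the
// remaining command buffers and return LM_GGML_STATUS_ABORTED
static bool metal_abort_cb(void * data) {
    (void) data;
    return atomic_load(&g_cancel_requested);
}

static lm_ggml_backend_t make_metal_backend(void) {
    lm_ggml_backend_t backend = lm_ggml_backend_metal_init();
    if (backend != NULL) {
        lm_ggml_backend_metal_set_abort_callback(backend, metal_abort_cb, NULL);
    }
    return backend;
}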
package/cpp/ggml-quants.c CHANGED
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
 
@@ -5303,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
@@ -6449,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
             // compute mask for subtraction
             vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
             m <<= 1;
 
             // load Q8 and take product with Q3
@@ -7720,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void
             vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
             m <<= 1;
 
             vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
             m <<= 1;
 
             vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
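Note: the `_m` → `_mu` renames track the RVV intrinsics policy scheme: the plain masked form leaves inactive lanes unspecified on newer toolchains, while the mask-undisturbed (`_mu`) form takes an explicit pass-through operand for them. Passing the source vector itself (e.g. `q3_0`) as the pass-through preserves what these kernels always assumed: inactive lanes keep their original value. A reduced illustration, assuming a toolchain with v0.12-style RVV intrinsics:

#include <riscv_vector.h>

// active lanes (mask set) become q3 - 4; inactive lanes are copied from the
// pass-through operand, here q3 itself, instead of being left undefined
static vint8m1_t sub4_where_masked(vbool8_t mask, vint8m1_t q3, size_t vl) {
    return __riscv_vsub_vx_i8m1_mu(mask, q3, q3, 0x4, vl);
}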
package/cpp/ggml-quants.h CHANGED
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
+#if defined(__ARM_FEATURE_SVE)
+extern int lm_ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
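Note: exposing `lm_ggml_sve_cnt_b` lets the hot dot-product kernels above compare a cached SVE vector length instead of executing `svcntb()` on every call. The cache is filled once in `lm_ggml_init()` (see the ggml.c hunks below) via the Linux prctl interface; a sketch of the idea, with the helper name invented for illustration:

#if defined(__ARM_FEATURE_SVE)
#include <sys/prctl.h>

extern int lm_ggml_sve_cnt_b; // SVE vector length in bytes, 0 until initialized

// hypothetical helper mirroring what lm_ggml_init() now does
static void init_sve_cnt_b(void) {
    if (!lm_ggml_sve_cnt_b) {
        // PR_SVE_GET_VL returns the vector length in the low bits plus flags
        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
    }
}
#endif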
package/cpp/ggml.c CHANGED
@@ -37,6 +37,9 @@
 #include <unistd.h>
 #endif
 
+#if defined(__ARM_FEATURE_SVE)
+int lm_ggml_sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef LM_GGML_USE_LLAMAFILE
 #endif
@@ -53,6 +56,9 @@
 // disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
+
+// unreachable code because of multiple instances of code after LM_GGML_ABORT
+#pragma warning(disable: 4702)
 #endif
 
 #if defined(_WIN32)
@@ -141,7 +147,51 @@ typedef pthread_t lm_ggml_thread_t;
 
 #include <sys/wait.h>
 
-#if defined(__linux__)
+#if defined(__ANDROID__)
+#include <unwind.h>
+#include <dlfcn.h>
+#include <stdio.h>
+
+struct backtrace_state {
+    void ** current;
+    void ** end;
+};
+
+static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+    struct backtrace_state * state = (struct backtrace_state *)arg;
+    uintptr_t pc = _Unwind_GetIP(context);
+    if (pc) {
+        if (state->current == state->end) {
+            return _URC_END_OF_STACK;
+        } else {
+            *state->current++ = (void*)pc;
+        }
+    }
+    return _URC_NO_REASON;
+}
+
+static void lm_ggml_print_backtrace_symbols(void) {
+    const int max = 100;
+    void* buffer[max];
+
+    struct backtrace_state state = {buffer, buffer + max};
+    _Unwind_Backtrace(unwind_callback, &state);
+
+    int count = state.current - buffer;
+
+    for (int idx = 0; idx < count; ++idx) {
+        const void * addr = buffer[idx];
+        const char * symbol = "";
+
+        Dl_info info;
+        if (dladdr(addr, &info) && info.dli_sname) {
+            symbol = info.dli_sname;
+        }
+
+        fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
+    }
+}
+#elif defined(__linux__) && defined(__GLIBC__)
 #include <execinfo.h>
 static void lm_ggml_print_backtrace_symbols(void) {
     // void * trace[100];
@@ -436,9 +486,16 @@ void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) {
     }
 }
 
+void lm_ggml_fp32_to_bf16_row_ref(const float * x, lm_ggml_bf16_t * y, int64_t n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = lm_ggml_compute_fp32_to_bf16(x[i]);
+    }
+}
+
 void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) {
     int i = 0;
 #if defined(__AVX512BF16__)
+    // subnormals are flushed to zero on this platform
     for (; i + 32 <= n; i += 32) {
         _mm512_storeu_si512(
             (__m512i *)(y + i),
@@ -918,7 +975,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .to_float = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
         .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
-        .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
+        .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
         .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
         .vec_dot_type = LM_GGML_TYPE_BF16,
         .nrows = 1,
@@ -2258,7 +2315,7 @@ inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x
 inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
-inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
@@ -3507,6 +3564,12 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {
 
     LM_GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
+#if defined(__ARM_FEATURE_SVE)
+    if (!lm_ggml_sve_cnt_b) {
+        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
+
     LM_GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     lm_ggml_critical_section_end();
@@ -3661,7 +3724,8 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl(
         struct lm_ggml_tensor * view_src,
         size_t view_offs) {
 
-    assert(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
+    LM_GGML_ASSERT(type >= 0 && type < LM_GGML_TYPE_COUNT);
+    LM_GGML_ASSERT(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
 
     // find the base tensor and absolute offset
     if (view_src != NULL && view_src->view_src != NULL) {
@@ -5314,6 +5378,7 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
     int n_groups,
+    float eps,
     bool inplace) {
 
     bool is_node = false;
@@ -5324,7 +5389,8 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 
     struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
 
-    result->op_params[0] = n_groups;
+    lm_ggml_set_op_params_i32(result, 0, n_groups);
+    lm_ggml_set_op_params_f32(result, 1, eps);
 
     result->op = LM_GGML_OP_GROUP_NORM;
     result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
@@ -5336,15 +5402,17 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 struct lm_ggml_tensor * lm_ggml_group_norm(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, false);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, false);
 }
 
 struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
     struct lm_ggml_context * ctx,
    struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, true);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
 // lm_ggml_mul_mat
@@ -12035,10 +12103,11 @@ static void lm_ggml_compute_forward_group_norm_f32(
 
     LM_GGML_TENSOR_UNARY_OP_LOCALS
 
-    const float eps = 1e-6f; // TODO: make this a parameter
-
     // TODO: optimize
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -20606,7 +20675,7 @@ size_t lm_ggml_quantize_chunk(
         case LM_GGML_TYPE_BF16:
             {
                 size_t elemsize = sizeof(lm_ggml_bf16_t);
-                lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n);
+                lm_ggml_fp32_to_bf16_row_ref(src + start, (lm_ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case LM_GGML_TYPE_F32:
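Note: among the smaller ggml.c fixes, the ELU change is a numerical-accuracy one. For x near zero, expf(x) rounds to a value near 1.0f, and the explicit subtraction expf(x) - 1 cancels almost all significant bits; expm1f computes e^x - 1 directly and keeps full precision. A quick comparison:

#include <math.h>
#include <stdio.h>

int main(void) {
    float x = 1e-8f;
    // expf(1e-8f) rounds to 1.0f, so the old formulation loses the result entirely
    printf("expf(x) - 1 = %g\n", expf(x) - 1.0f); // prints 0
    printf("expm1f(x)   = %g\n", expm1f(x));      // prints ~1e-08
    return 0;
}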
package/cpp/ggml.h CHANGED
@@ -349,6 +349,7 @@ extern "C" {
     LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float);
     LM_GGML_API float lm_ggml_bf16_to_fp32(lm_ggml_bf16_t); // consider just doing << 16
     LM_GGML_API void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t);
+    LM_GGML_API void lm_ggml_fp32_to_bf16_row_ref(const float *, lm_ggml_bf16_t *, int64_t);
     LM_GGML_API void lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t);
 
     struct lm_ggml_object;
@@ -1139,16 +1140,17 @@ extern "C" {
 
     // group normalize along ne0*ne1*n_groups
     // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
-            int n_groups);
+            int n_groups,
+            float eps);
 
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
-            int n_groups);
+            int n_groups,
+            float eps);
 
     // a - x
     // b - dy
@@ -1455,7 +1457,6 @@ extern "C" {
     // if mode & 2 == 1, GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
-    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
@@ -1472,6 +1473,7 @@ extern "C" {
             int mode);
 
     // custom RoPE
+    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
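Note: this is a breaking API change for graph-building code; callers of `lm_ggml_group_norm` must now supply `eps` explicitly, and 1e-6f reproduces the value the kernels previously hardcoded. A usage sketch, assuming an existing `ctx` and input tensor `inp`:

// before: lm_ggml_group_norm(ctx, inp, 32);
// after : eps is an explicit parameter
struct lm_ggml_tensor * out = lm_ggml_group_norm(ctx, inp, /*n_groups =*/ 32, /*eps =*/ 1e-6f);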
package/cpp/llama-impl.h CHANGED
@@ -24,3 +24,18 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
 #define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
package/cpp/llama-vocab.cpp CHANGED
@@ -16,20 +16,6 @@
 // helpers
 //
 
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;
@@ -816,6 +802,9 @@ struct llm_tokenizer_ugm {
      * the best tokenization.
      */
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
         // normalize the input first
         std::string normalized;
         normalize(text, &normalized);
@@ -895,7 +884,7 @@ struct llm_tokenizer_ugm {
         }
 
         // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
     }
 
 private:
@@ -1444,7 +1433,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }
 
@@ -1500,6 +1490,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }
 
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,
         const char * text,
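Note: the practical effect of the new EOM plumbing is on stop-token handling: `llama_token_is_eog_impl` now treats the end-of-message token (used by some chat templates, e.g. Llama 3.1's <|eom_id|>) the same as EOS/EOT, so a typical generation loop stops on it with no extra checks. A sketch against the public llama.h API, with the sampling call left abstract (hypothetical helper):

// llama_token_is_eog() wraps llama_token_is_eog_impl(); after this change it
// also returns true for the EOM token
for (;;) {
    llama_token tok = sample_next_token(); // hypothetical sampler for illustration
    if (llama_token_is_eog(model, tok)) {
        break; // stop on EOS, EOT, or (new) EOM
    }
    // ... accept/emit the token ...
}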
package/cpp/llama-vocab.h CHANGED
@@ -45,6 +45,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;
 
     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
 
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,