npm - @novastera-oss/llamarn - Versions diffs - 0.2.1 → 0.2.2 - Mend

@novastera-oss/llamarn 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (266) hide show

package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal CHANGED Viewed

@@ -856,6 +856,7 @@ kernel void kernel_tanh(
 constant float GELU_COEF_A     = 0.044715f;
 constant float GELU_QUICK_COEF = -1.702f;
 constant float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+constant float SQRT_2_INV      = 0.70710678118654752440084436210484f;
 kernel void kernel_gelu(
     device const float * src0,
@@ -897,6 +898,42 @@ kernel void kernel_gelu_quick_4(
     dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
 }
+// erf(x) approximation used by the GELU-erf kernels below; based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
+// ref: https://www.johndcook.com/blog/python_erf/ (NOTE(review): the reference quotes an error bound for this formula - confirm it is acceptable for half/float use)
+constant float p_erf  = 0.3275911f; // scale applied to |x| when forming t = 1/(1 + p*|x|)
+constant float a1_erf = 0.254829592f; // a1..a5: polynomial coefficients, lowest order (a1) to highest (a5)
+constant float a2_erf = -0.284496736f;
+constant float a3_erf = 1.421413741f;
+constant float a4_erf = -1.453152027f;
+constant float a5_erf = 1.061405429f;
+template<typename T> // instantiated in this file with float and float4 (element-wise for vectors)
+T erf_approx(T x) { // returns an approximation of erf(x); odd symmetry is restored via sign(x)
+    T sign_x = sign(x); // remember the sign so the polynomial only has to handle |x|
+    x = fabs(x);
+    T t = 1.0f / (1.0f + p_erf * x);
+    T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); // Horner evaluation of the degree-5 polynomial in t, times exp(-x^2)
+    return sign_x * y; // sign(0) yields 0 here, so the result for x == 0 is 0
+}
+kernel void kernel_gelu_erf( // element-wise GELU using the erf form: 0.5*x*(1 + erf(x/sqrt(2)))
+    device const float * src0, // input values, one per thread
+    device       float * dst, // output values, written at the same index as the input
+    uint tpig[[thread_position_in_grid]]) { // flat thread index; no bounds check is done here - presumably the host launches exactly one thread per element (confirm)
+    device const float & x = src0[tpig];
+    dst[tpig] = 0.5f*x*(1.0f+erf_approx<float>(x*SQRT_2_INV)); // SQRT_2_INV is the 1/sqrt(2) constant declared earlier in this file
+}
+kernel void kernel_gelu_erf_4( // float4 variant of kernel_gelu_erf: each thread processes four packed values
+    device const float4 * src0, // input viewed as float4 vectors
+    device       float4 * dst, // output viewed as float4 vectors, same index as the input
+    uint tpig[[thread_position_in_grid]]) { // flat thread index in float4 units; no bounds check - presumably the host sizes the grid to the float4 count (confirm)
+    device const float4 & x = src0[tpig];
+    dst[tpig] = 0.5f*x*(1.0f+erf_approx<float4>(x*SQRT_2_INV)); // same formula as the scalar kernel, applied component-wise
+}
 kernel void kernel_silu(
         device const float * src0,
         device       float * dst,
@@ -2713,8 +2750,148 @@ kernel void kernel_rope_neox(
     }
 }
+template<typename T> // T is the tensor element type; this file instantiates float and half
+kernel void kernel_rope_multi( // rotary embedding where each rotated pair takes its position from one of up to four sections (mrope)
+        constant ggml_metal_kargs_rope & args, // rope parameters: dims, byte strides, frequency settings and section sizes
+        device const char * src0, // input tensor bytes, addressed through args.nb00..nb03
+        device const char * src1, // positions, read below as int32
+        device const char * src2, // optional per-pair frequency factors (float); treated as absent when it equals src0
+        device       char * dst, // output tensor bytes, addressed through args.nb0..nb3
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2]; // one threadgroup per (i1, i2, i3) row of the tensor
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
+    float corr_dims[2];
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); // helper defined elsewhere in this file; presumably fills corr_dims for rope_yarn
+    device const int32_t * pos = (device const int32_t *) src1;
+    const float inv_ndims = -1.f/args.n_dims; // exponent scale: theta uses freq_base^(-i0/n_dims)
+    float cos_theta;
+    float sin_theta;
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { // each thread walks even indices i0, striding by twice the threadgroup width
+        if (i0 < args.n_dims) { // rotated region
+            const int ic = i0/2; // pair index
+            // mrope theta calculations: choose which position stream drives this pair
+            // note: the rest is the same as kernel_rope_neox
+            const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3; // NOTE(review): the modulo below assumes this sum is non-zero - confirm the host guarantees it
+            const int sec_w01   = args.sect_0 + args.sect_1;               // end of section 1
+            const int sec_w012  = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
+            const int sector    = ic % sect_dims;
+            float theta_base;
+            if (sector < args.sect_0) {
+                theta_base = (float) pos[i2]; // section 0 reads the first block of ne02 positions
+            } else if (sector < sec_w01) {
+                theta_base = (float) pos[i2 + args.ne02]; // section 1 reads the second block
+            } else if (sector < sec_w012) {
+                theta_base = (float) pos[i2 + args.ne02 * 2]; // section 2 reads the third block
+            } else {
+                theta_base = (float) pos[i2 + args.ne02 * 3]; // section 3 reads the fourth block
+            }
+            // end of mrope
+            const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f; // presumably the host passes src0 as src2 when there are no frequency factors (confirm)
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); // helper defined elsewhere; writes cos_theta and sin_theta
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
+            const float x0 = src[0]; // the rotated pair is (ic, ic + n_dims/2)
+            const float x1 = src[args.n_dims/2];
+            dst_data[0]             = x0*cos_theta - x1*sin_theta;
+            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
+        } else { // beyond n_dims: copy the two elements at i0 and i0+1 unchanged
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+// Rotary embedding for the two-section ("vision") mrope layout.
+// Pairs whose sector falls in [0, sect_0) use the first block of positions, pairs in
+// [sect_0, sect_0 + sect_1) use the second block (offset by ne02). Elements at or beyond
+// 2*n_dims are copied through unchanged.
+template<typename T> // T is the tensor element type; this file instantiates float and half
+kernel void kernel_rope_vision(
+        constant ggml_metal_kargs_rope & args, // rope parameters: dims, byte strides, frequency settings and section sizes
+        device const char * src0, // input tensor bytes, addressed through args.nb00..nb03
+        device const char * src1, // positions, read below as int32
+        device const char * src2, // optional per-pair frequency factors (float); treated as absent when it equals src0
+        device       char * dst, // output tensor bytes, addressed through args.nb0..nb3
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2]; // one threadgroup per (i1, i2, i3) row of the tensor
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
+    float corr_dims[2];
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
+    device const int32_t * pos = (device const int32_t *) src1;
+    const float inv_ndims = -1.f/args.n_dims;
+    float cos_theta;
+    float sin_theta;
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { // each thread walks even indices i0, striding by twice the threadgroup width
+        if (i0 < 2*args.n_dims) { // different from kernel_rope_multi
+            const int ic = i0/2; // pair index
+            // mrope theta calculations (only support 2 dimensions)
+            const int sect_dims = args.sect_0 + args.sect_1;
+            const int sector    = ic % sect_dims;
+            float p;
+            float theta_base;
+            // the first section spans [0, sect_0): the boundary must be sect_0 so that it agrees
+            // with the `sector - sect_0` offset used for the second section below
+            if (sector < args.sect_0) {
+                p = (float) sector;
+                theta_base = (float) pos[i2];
+            } else {
+                p = (float) sector - args.sect_0; // index within the second section, always >= 0 with the boundary above
+                theta_base = (float) pos[i2 + args.ne02];
+            }
+            const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p);
+            // end of mrope
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
+            const float x0 = src[0];
+            const float x1 = src[args.n_dims]; // different from kernel_rope_multi
+            dst_data[0]           = x0*cos_theta - x1*sin_theta;
+            dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi
+        } else { // beyond the rotated region: copy the two elements at i0 and i0+1 unchanged
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
 typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
 typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
+typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t; // function type shared by every kernel_rope_multi instantiation below
+typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t; // function type shared by every kernel_rope_vision instantiation below
 template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
 template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
@@ -2722,6 +2899,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
 template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
 template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
+template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>; // explicit instantiations; host_name is the name the kernel is exported under
+template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
+template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
+template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
 typedef void (im2col_t)(
         device const float * x,
         device        char * dst,
@@ -3109,7 +3292,7 @@ template<
     typename kd4x4_t, // key type in device memory
     short nl_k,
     void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
-    typename vd4x4_t, // key type in device memory
+    typename vd4x4_t, // value type in device memory
     short nl_v,
     void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
     short DK,        // K head size
@@ -3630,7 +3813,7 @@ template<
     typename kd4_t, // key type in device memory
     short nl_k,
     void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
-    typename vd4_t, // key type in device memory
+    typename vd4_t, // value type in device memory
     short nl_v,
     void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
     short DK,       // K head size
@@ -3741,6 +3924,11 @@ kernel void kernel_flash_attn_ext_vec(
                 sm[tiisg] = pm[ic + tiisg];
             }
+            // skip -INF blocks
+            if (simd_max(sm[tiisg]) == -INFINITY) {
+                continue;
+            }
             // Q*K^T
             {
                 // each simdgroup processes 1 query and NE (NW/NL) head elements
@@ -3973,6 +4161,16 @@ kernel void kernel_flash_attn_ext_vec(
 typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;
+template [[host_name("kernel_flash_attn_ext_vec_f16_h64")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4,             1, dequantize_f16_t4,  half4,       1, dequantize_f16_t4,  64, 64, 8>; // head-size-64 variants, one per K/V storage type; the trailing 8 (vs 4 for h96) is presumably the per-simdgroup element count - confirm against the template declaration
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4,           1, dequantize_bf16_t4, bfloat4,     1, dequantize_bf16_t4, 64, 64, 8>;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0,        8, dequantize_q4_0_t4, block_q4_0,  8, dequantize_q4_0_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1,        8, dequantize_q4_1_t4, block_q4_1,  8, dequantize_q4_1_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0,        8, dequantize_q5_0_t4, block_q5_0,  8, dequantize_q5_0_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1,        8, dequantize_q5_1_t4, block_q5_1,  8, dequantize_q5_1_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0,        8, dequantize_q8_0_t4, block_q8_0,  8, dequantize_q8_0_t4, 64, 64, 8>;
 template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4,             1, dequantize_f16_t4,  half4,       1, dequantize_f16_t4,  96, 96, 4>;
 #if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4,           1, dequantize_bf16_t4, bfloat4,     1, dequantize_bf16_t4, 96, 96, 4>;

package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt CHANGED Viewed

@@ -27,12 +27,15 @@ if (MUSAToolkit_FOUND)
     file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
     list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
+    list(APPEND GGML_HEADERS_MUSA "../ggml-musa/mudnn.cuh")
     file(GLOB   GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
     file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
     file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
+    file(GLOB   SRCS "../ggml-musa/*.cu")
+    list(APPEND GGML_SOURCES_MUSA ${SRCS})
     if (GGML_CUDA_FA_ALL_QUANTS)
         file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@@ -62,7 +65,9 @@ if (MUSAToolkit_FOUND)
                             )
     # TODO: do not use CUDA definitions for MUSA
-    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+    if (NOT GGML_BACKEND_DL)
+        target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+    endif()
     add_compile_definitions(GGML_USE_MUSA)
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
@@ -92,9 +97,10 @@ if (MUSAToolkit_FOUND)
     endif()
     if (GGML_STATIC)
+        # TODO: mudnn has not provided static libraries yet
         target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
     else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
     endif()
     if (GGML_CUDA_NO_VMM)

package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu ADDED Viewed

@@ -0,0 +1,112 @@
+#include <mutex>
+#include <mudnn.h>
+#include "mudnn.cuh"
+namespace mudnn = musa::dnn;
+// Returns a human-readable, statically allocated description for a mudnn::Status value (never null)
+const char* mudnnGetErrorString(mudnn::Status err) {
+    switch (err) {
+        case mudnn::Status::SUCCESS:
+            return "Success";
+        case mudnn::Status::INVALID_PARAMETER:
+            return "Invalid parameter";
+        case mudnn::Status::NOT_INITIALIZED:
+            return "Not initialized";
+        case mudnn::Status::ALLOC_FAILED:
+            return "Allocation failed";
+        case mudnn::Status::NOT_SUPPORTED:
+            return "Not supported";
+        case mudnn::Status::INTERNAL_ERROR:
+            return "Internal error";
+        case mudnn::Status::ARCH_MISMATCH:
+            return "Architecture mismatch";
+        case mudnn::Status::EXECUTION_FAILED:
+            return "Execution failed";
+        default: // any status value not listed above
+            return "Unknown mudnn status";
+    }
+}
+// Error checking macro for MUDNN calls: forwards to CUDA_CHECK_GEN (defined outside this file) with mudnn::Status::SUCCESS as the success value and mudnnGetErrorString as the message formatter
+#define MUDNN_CHECK(err) CUDA_CHECK_GEN(err, mudnn::Status::SUCCESS, mudnnGetErrorString)
+namespace {
+    // Cache of mudnn::Handle objects keyed by device id; lookups and insertions are serialized by handle_cache_mutex
+    std::unordered_map<int, std::unique_ptr<mudnn::Handle>> handle_cache; // owns the handles; entries are never erased in this file
+    std::mutex handle_cache_mutex;
+    mudnn::Handle* get_cached_handle(int device_id) { // returns the handle for device_id, creating it on first use
+        std::lock_guard<std::mutex> lock(handle_cache_mutex);
+        auto it = handle_cache.find(device_id);
+        if (it != handle_cache.end()) {
+            return it->second.get();
+        }
+        auto handle = std::make_unique<mudnn::Handle>(device_id);
+        mudnn::Handle* handle_ptr = handle.get(); // grab the raw pointer before ownership moves into the map
+        handle_cache[device_id] = std::move(handle);
+        return handle_ptr; // NOTE(review): the returned handle is used after the lock is released; only the map itself is protected
+    }
+}
+// Extracts dimensions and strides from a ggml_tensor into dims/strides (resized to the tensor's dimension count) and returns that count
+int get_ggml_dims_and_strides(const ggml_tensor* tensor,
+                              std::vector<int64_t>& dims,
+                              std::vector<int64_t>& strides) {
+    const int ndims = ggml_n_dims(tensor);
+    const size_t element_size = ggml_element_size(tensor);
+    dims.resize(ndims);
+    strides.resize(ndims);
+    for (int i = 0; i < ndims; ++i) {
+        dims[i] = tensor->ne[i]; // copied in ggml's own index order (ne[0] first), not reversed
+        strides[i] = tensor->nb[i] / static_cast<int64_t>(element_size); // byte stride converted to a stride in elements
+    }
+    return ndims;
+}
+// Converts ggml_type to mudnn::Tensor::Type; only F32 and F16 are mapped, anything else is reported through MUDNN_CHECK
+mudnn::Tensor::Type ggml_type_to_mudnn_type(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return mudnn::Tensor::Type::FLOAT;
+        case GGML_TYPE_F16:
+            return mudnn::Tensor::Type::HALF;
+        // TODO: Add support for other types
+        default:
+            MUDNN_CHECK(mudnn::Status::NOT_SUPPORTED); // presumably aborts via CUDA_CHECK_GEN - confirm; otherwise control falls through to the fallback below
+    }
+    return mudnn::Tensor::Type::FLOAT; // Default fallback
+}
+// Asynchronous memory copy using mudnn::Unary::IDENTITY: runs an identity op from src to dst on ctx's stream and returns musaSuccess
+musaError_t mudnnMemcpyAsync(ggml_backend_cuda_context& ctx, const ggml_tensor* dst, const ggml_tensor* src) {
+    mudnn::Tensor tensor_dst, tensor_src;
+    MUDNN_CHECK(tensor_dst.SetType(ggml_type_to_mudnn_type(dst->type)));
+    MUDNN_CHECK(tensor_src.SetType(ggml_type_to_mudnn_type(src->type)));
+    std::vector<int64_t> dims, strides;
+    const int ndims = get_ggml_dims_and_strides(src, dims, strides); // layout is taken from src only
+    MUDNN_CHECK(tensor_dst.SetNdInfo(ndims, dims.data(), strides.data())); // NOTE(review): dst is described with src's dims/strides - assumes callers only pass tensors with identical layout; confirm
+    MUDNN_CHECK(tensor_src.SetNdInfo(ndims, dims.data(), strides.data()));
+    MUDNN_CHECK(tensor_dst.SetAddr(dst->data));
+    MUDNN_CHECK(tensor_src.SetAddr(src->data));
+    mudnn::Unary op;
+    MUDNN_CHECK(op.SetMode(mudnn::Unary::Mode::IDENTITY));
+    MUDNN_CHECK(op.SetAlpha(0.0f)); // NOTE(review): alpha/beta semantics for IDENTITY are defined by muDNN - confirm 0.0f is the intended setting
+    MUDNN_CHECK(op.SetBeta(0.0f));
+    mudnn::Handle* handle = get_cached_handle(ctx.device); // one cached handle per device id
+    MUDNN_CHECK(handle->SetStream(ctx.stream())); // NOTE(review): the handle is shared per device, so concurrent callers on the same device would race on the stream setting - confirm callers are serialized
+    MUDNN_CHECK(op.Run(*handle, tensor_dst, tensor_src));
+    return musaSuccess; // failures are reported through MUDNN_CHECK above, not through this return value
+}

package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh ADDED Viewed

@@ -0,0 +1,12 @@
+#pragma once
+#include "../include/ggml.h"
+#include "../ggml-cuda/common.cuh"
+// Copies data from src tensor to dst tensor by running a muDNN identity op on the stream of the provided context (presumably asynchronous with respect to the host - confirm).
+// Returns musaSuccess once the op has been issued; muDNN failures are handled by MUDNN_CHECK inside the implementation rather than reported through the return value.
+musaError_t mudnnMemcpyAsync(
+    ggml_backend_cuda_context &ctx,
+    const ggml_tensor *dst,
+    const ggml_tensor *src
+);

package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt CHANGED Viewed

@@ -55,14 +55,17 @@ endfunction()
 set(GGML_OPENCL_KERNELS
     add
+    argsort
     clamp
     cpy
     cvt
     diag_mask_inf
+    div
     gelu
     gemv_noshuffle_general
     gemv_noshuffle
     get_rows
+    group_norm
     im2col_f32
     im2col_f16
     mul_mat_Ab_Bi_8x4
@@ -83,11 +86,14 @@ set(GGML_OPENCL_KERNELS
     rms_norm
     rope
     scale
+    sigmoid
     silu
     softmax_4_f32
     softmax_4_f16
     softmax_f32
     softmax_f16
+    sub
+    sum_rows
     transpose
 )