npm - @novastera-oss/llamarn - Versions diffs - 0.2.7 → 0.3.0 - Mend

@novastera-oss/llamarn 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (319) hide show

package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp CHANGED Viewed

@@ -65,8 +65,9 @@
 #include <aclnnop/aclnn_eq_tensor.h>
 #include <aclnnop/aclnn_gt_scalar.h>
 #include <aclnnop/aclnn_pow.h>
-#include <aclnnop/aclnn_grouped_matmul_v2.h>
+#include <aclnnop/aclnn_grouped_matmul_v3.h>
 #include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
+#include <aclnnop/aclnn_zero.h>
 #include <float.h>
 #include <cmath>
@@ -804,10 +805,11 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
         nb[i] = nb[i - 1] * ne[i - 1];
     }
-    ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
     aclTensor* zero =
         ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
     return zero;
+    GGML_UNUSED(n_bytes);
 }
 /**
@@ -2654,6 +2656,67 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
         memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
     }
+#ifdef ASCEND_310P
+    ggml_tensor src0_row = *src0;
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+    if (src0->type == GGML_TYPE_F16) {
+        src0_row.type = GGML_TYPE_F32;
+    }
+    // src0_row [D, M, 1, 1] weight without permute
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[0] = ori_src0_nb[0];
+    src0_row.nb[1] = ori_src0_nb[1];
+    src0_row.nb[2] = ori_src0_nb[1];
+    src0_row.nb[3] = ori_src0_nb[1];
+    // src1_row [D, 1, 1, 1] -> input
+    src1_row.ne[1] = 1;
+    src1_row.ne[2] = 1;
+    src1_row.ne[3] = 1;
+    src1_row.nb[2] = nb11;
+    src1_row.nb[3] = nb11;
+    // dst_row [M, 1, 1, 1] -> out
+    dst_row.ne[1] = 1;
+    dst_row.ne[2] = 1;
+    dst_row.ne[3] = 1;
+    dst_row.nb[2] = nb1;
+    dst_row.nb[3] = nb1;
+    //create weight for one row
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+            int64_t i1 = id;
+            int64_t i2 = i12;
+            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
+            src0_row.data = src0_tmp_ptr;
+            src1_row.data = src1_tmp_ptr;
+            dst_row.data = dst_tmp_ptr;
+            dst_row.src[0] = &src0_row;
+            dst_row.src[1] = &src1_row;
+            ggml_cann_mul_mat(ctx, &dst_row);
+        }
+    }
+    return;
+#endif
     std::vector<aclTensor*> src0_tensor_vec;
     std::vector<aclTensor*> src1_tensor_vec;
     std::vector<aclTensor*> dst_tensor_vec;
@@ -2701,9 +2764,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
     }
     size_t GROUP_SIZE = 128;
-    // GroupedMatmulV2 required tensor_list.size < 128
+    // GroupedMatmulV3 required tensor_list.size < 128
     for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
-        // split and call GroupedMatmulV2
+        // split and call GroupedMatmulV3
         size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
         std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
         std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
@@ -2713,7 +2776,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
         aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
         aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
-        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
             nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
         ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);

package/cpp/llama.cpp/ggml/src/ggml-cann/common.h CHANGED Viewed

@@ -359,7 +359,7 @@ struct ggml_backend_cann_context {
         ggml_cann_set_device(device);
         description = aclrtGetSocName();
-        bool async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
+        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
             device, async_mode ? "ON" : "OFF");
     }

package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp CHANGED Viewed

@@ -2086,6 +2086,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return false;
             }
         } break;
+        case GGML_OP_SET_ROWS:
+            {
+                // TODO: add support
+                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
+                return false;
+            } break;
         case GGML_OP_CPY: {
             ggml_tensor *src = op->src[0];
             if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
@@ -2182,12 +2188,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_CLAMP:
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
@@ -2205,6 +2209,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_COUNT_EQUAL:
             return true;
+        case GGML_OP_SCALE:
+            float bias;
+            memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
+            return bias == 0.0f; // TODO: support bias != 0.0f
+        case GGML_OP_SOFT_MAX:
+            // TODO: support broadcast
+            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
+            return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
         case GGML_OP_FLASH_ATTN_EXT:{
             // derived from [ggml-cuda.cu]
             if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
@@ -2227,6 +2239,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                 // DeepSeek MLA
                 return false;
             }
+            // TODO: support broadcast
+            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
             if (op->src[0]->ne[3] != 1) {
                 return false;
             }

package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt CHANGED Viewed

@@ -5,7 +5,7 @@ function(ggml_add_cpu_backend_features cpu_name arch)
     # build, using set_source_files_properties() to set the arch flags is not possible
     set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
     add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
-    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
     set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -388,6 +388,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             else()
                 list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
             endif()
+        elseif(GGML_CPU_ALL_VARIANTS)
+            # Begin with the lowest baseline
+            set(ARCH_DEFINITIONS "")
+            # When a feature is selected, bump the MCPU to the first
+            # version that supported it
+            foreach(PVER RANGE 7 11)
+                if(DEFINED GGML_INTERNAL_POWER${PVER})
+                    set(POWERPC_MCPU "power${PVER}")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
+                endif()
+            endforeach()
+            if (GGML_INTERNAL_VSX)
+                list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
+                list(APPEND ARCH_FLAGS -mvsx)
+            endif()
+            if (DEFINED POWERPC_MCPU)
+                list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
+            endif()
+            ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
         else()
             if (GGML_CPU_POWERPC_CPUTYPE)
                 list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
@@ -427,6 +448,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
+            set(GGML_NNPA OFF)
             message(STATUS "z15 target")
             list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
@@ -443,7 +465,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endif()
         if (GGML_VXE)
+            message(STATUS "VX/VXE/VXE2 enabled")
             list(APPEND ARCH_FLAGS -mvx -mzvector)
+            list(APPEND ARCH_DEFINITIONS GGML_VXE)
+        endif()
+        if (GGML_NNPA)
+            message(STATUS "NNPA enabled")
+            list(APPEND ARCH_DEFINITIONS GGML_NNPA)
         endif()
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
@@ -465,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.6.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.9.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "75b4ad68f25ab673dcc01065e5a0b05f")
+        set(KLEIDIAI_ARCHIVE_MD5  "2a8e1bb55d201557553545536489a017")
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)
@@ -560,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (EMSCRIPTEN)
         set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
     endif()
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+        # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
+        target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
+    endif()
 endfunction()

package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp CHANGED Viewed

@@ -8,6 +8,7 @@
 #include "mmq.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "quants.h"
 #include "ggml-quants.h"
 #include <algorithm>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
         // Quantize these floats
         const float iscale = 127.f / amax;
-        y[i].d = GGML_FP32_TO_FP16(1 / iscale);
+        y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
         const float id = ( amax != 0.0f ) ? iscale : 0.f;
         const __m512 vscale = _mm512_set1_ps(id);
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
         const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
         for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
             const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
             __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
         const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
         for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
-            const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
+            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
             const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
             __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
         const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
         for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
             const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
             __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
                     va[k] = _mm512_set1_epi32(a_ptr[k]);
                     vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
                 }
-                vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+                vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
             }
             // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
                 for (int k = 0; k < 8; ++k) {
                     va[k] = _mm512_set1_epi32(a_ptr[k]);
                 }
-                vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
-                vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
+                vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+                vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
             }
             // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
                     va[k] = _mm512_set1_epi32(a_ptr[k]);
                     va[k] = _mm512_add_epi8(va[k], off);
                 }
-                vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+                vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
             }
             // load b