whispercpp 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (963)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -43
  3. data/ext/extconf.rb +2 -2
  4. data/ext/ruby_whisper.c +14 -2
  5. data/ext/ruby_whisper.h +39 -0
  6. data/ext/ruby_whisper_context.c +22 -22
  7. data/ext/ruby_whisper_model.c +12 -12
  8. data/ext/ruby_whisper_params.c +79 -25
  9. data/ext/ruby_whisper_segment.c +84 -19
  10. data/ext/ruby_whisper_token.c +351 -0
  11. data/ext/ruby_whisper_transcribe.cpp +1 -1
  12. data/ext/ruby_whisper_vad_context.c +75 -0
  13. data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
  14. data/ext/ruby_whisper_vad_segment.c +139 -0
  15. data/ext/ruby_whisper_vad_segments.c +106 -0
  16. data/ext/sources/CMakeLists.txt +4 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  19. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  20. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  21. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  22. data/ext/sources/examples/CMakeLists.txt +1 -0
  23. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  24. data/ext/sources/examples/addon.node/index.js +7 -5
  25. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  26. data/ext/sources/examples/bench/bench.cpp +26 -16
  27. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  28. data/ext/sources/examples/cli/cli.cpp +122 -111
  29. data/ext/sources/examples/command/command.cpp +26 -24
  30. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  31. data/ext/sources/examples/common-ggml.cpp +2 -0
  32. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  33. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  34. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  35. data/ext/sources/examples/server/server.cpp +34 -24
  36. data/ext/sources/examples/server.py +6 -1
  37. data/ext/sources/examples/stream/stream.cpp +4 -2
  38. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  39. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  40. data/ext/sources/examples/talk-llama/CMakeLists.txt +7 -3
  41. data/ext/sources/examples/talk-llama/llama-adapter.cpp +113 -7
  42. data/ext/sources/examples/talk-llama/llama-adapter.h +13 -1
  43. data/ext/sources/examples/talk-llama/llama-arch.cpp +2136 -1491
  44. data/ext/sources/examples/talk-llama/llama-arch.h +125 -3
  45. data/ext/sources/examples/talk-llama/llama-batch.cpp +174 -100
  46. data/ext/sources/examples/talk-llama/llama-batch.h +46 -20
  47. data/ext/sources/examples/talk-llama/llama-chat.cpp +199 -8
  48. data/ext/sources/examples/talk-llama/llama-chat.h +11 -0
  49. data/ext/sources/examples/talk-llama/llama-context.cpp +1213 -413
  50. data/ext/sources/examples/talk-llama/llama-context.h +99 -36
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -4
  52. data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
  53. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  54. data/ext/sources/examples/talk-llama/llama-graph.cpp +883 -294
  55. data/ext/sources/examples/talk-llama/llama-graph.h +361 -161
  56. data/ext/sources/examples/talk-llama/llama-hparams.cpp +144 -6
  57. data/ext/sources/examples/talk-llama/llama-hparams.h +100 -23
  58. data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
  59. data/ext/sources/examples/talk-llama/llama-impl.h +3 -1
  60. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +328 -0
  61. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  62. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2100 -0
  63. data/ext/sources/examples/talk-llama/llama-kv-cache.h +373 -27
  64. data/ext/sources/examples/talk-llama/llama-kv-cells.h +124 -30
  65. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +63 -41
  66. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  67. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +77 -35
  68. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +15 -16
  69. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  70. data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
  71. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  72. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +93 -9
  73. data/ext/sources/examples/talk-llama/llama-model-loader.h +9 -2
  74. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
  75. data/ext/sources/examples/talk-llama/llama-model.cpp +3369 -10145
  76. data/ext/sources/examples/talk-llama/llama-model.h +104 -12
  77. data/ext/sources/examples/talk-llama/llama-quant.cpp +53 -30
  78. data/ext/sources/examples/talk-llama/llama-sampling.cpp +1520 -324
  79. data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
  80. data/ext/sources/examples/talk-llama/llama-vocab.cpp +562 -39
  81. data/ext/sources/examples/talk-llama/llama-vocab.h +50 -0
  82. data/ext/sources/examples/talk-llama/llama.cpp +794 -12
  83. data/ext/sources/examples/talk-llama/llama.h +246 -190
  84. data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
  85. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  86. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  87. data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
  88. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  89. data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
  90. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
  91. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
  92. data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
  93. data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
  94. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  95. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  96. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  97. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  98. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  99. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  100. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  101. data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
  102. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  103. data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
  104. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
  105. data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
  106. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  107. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
  108. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  109. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  110. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  111. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
  112. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  113. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  114. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  115. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  116. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  117. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  118. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  119. data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
  120. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  121. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  122. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
  123. data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
  124. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
  125. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  126. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
  127. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  128. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
  129. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  130. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  131. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  132. data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
  133. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  134. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  135. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  136. data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
  137. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  138. data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
  139. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
  140. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
  141. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
  142. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  143. data/ext/sources/examples/talk-llama/models/models.h +569 -0
  144. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
  145. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  146. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
  147. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  148. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  149. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  150. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  151. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  153. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  154. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  155. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  156. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  157. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  158. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  159. data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
  160. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  161. data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
  162. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  163. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  164. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  165. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  166. data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
  167. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
  168. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
  169. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
  170. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
  171. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  172. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  173. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
  174. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  175. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  176. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
  177. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  178. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  179. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  180. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  181. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  182. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  183. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  184. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  185. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  186. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  187. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  188. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  189. data/ext/sources/examples/talk-llama/unicode.cpp +309 -16
  190. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  191. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  192. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  193. data/ext/sources/examples/whisper.wasm/index-tmpl.html +18 -17
  194. data/ext/sources/ggml/CMakeLists.txt +135 -79
  195. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  196. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  197. data/ext/sources/ggml/include/ggml-backend.h +21 -2
  198. data/ext/sources/ggml/include/ggml-cpu.h +2 -1
  199. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  200. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  201. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  202. data/ext/sources/ggml/include/ggml-rpc.h +8 -11
  203. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  204. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  205. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  206. data/ext/sources/ggml/include/ggml.h +406 -23
  207. data/ext/sources/ggml/src/CMakeLists.txt +99 -13
  208. data/ext/sources/ggml/src/ggml-alloc.c +368 -161
  209. data/ext/sources/ggml/src/ggml-backend-impl.h +5 -5
  210. data/ext/sources/ggml/src/ggml-backend-reg.cpp +55 -14
  211. data/ext/sources/ggml/src/ggml-backend.cpp +290 -57
  212. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
  213. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -13
  214. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  215. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +59 -45
  216. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
  217. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2586 -1917
  218. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +348 -309
  219. data/ext/sources/ggml/src/ggml-cann/common.h +350 -133
  220. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +894 -625
  221. data/ext/sources/ggml/src/ggml-common.h +17 -0
  222. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +167 -75
  223. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  224. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  225. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +560 -622
  226. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1002 -270
  227. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +107 -587
  228. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  229. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  230. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +373 -486
  231. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  232. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  233. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  234. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  235. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  236. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4682 -1660
  237. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +82 -4
  238. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  239. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +18 -9
  240. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +263 -111
  241. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +39 -28
  242. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +683 -82
  243. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +38 -43
  244. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +435 -119
  245. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  246. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1234 -1182
  247. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  248. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +2167 -1480
  249. data/ext/sources/ggml/src/ggml-cpu/ops.h +10 -12
  250. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  251. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  252. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1132 -81
  253. data/ext/sources/ggml/src/ggml-cpu/repack.h +36 -0
  254. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +120 -93
  255. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  256. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  257. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  258. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  259. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  260. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  261. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  262. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  263. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +294 -27
  264. data/ext/sources/ggml/src/ggml-cpu/vec.h +606 -48
  265. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +92 -17
  266. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  267. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  268. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  269. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
  270. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  271. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  272. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  273. data/ext/sources/ggml/src/ggml-cuda/common.cuh +588 -128
  274. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  275. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  276. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  278. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +25 -0
  279. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  280. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +335 -485
  281. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  282. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  283. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  284. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  285. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  286. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  287. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +519 -378
  289. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +750 -637
  290. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  291. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1244 -0
  292. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  293. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +98 -61
  294. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  295. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +230 -197
  296. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  297. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  298. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  299. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  300. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1557 -294
  301. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  302. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mean.cu +57 -2
  304. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +915 -69
  305. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +171 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +835 -0
  307. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  308. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  309. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +109 -67
  310. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1601 -733
  311. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +802 -0
  312. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +12 -0
  313. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +286 -149
  314. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  315. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  316. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  317. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  318. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/pad.cu +86 -32
  320. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  321. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  322. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +163 -10
  323. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  324. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  325. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  326. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/rope.cu +207 -98
  328. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  329. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  330. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +330 -0
  331. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  332. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  333. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  334. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  335. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +325 -61
  337. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  338. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  339. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -12
  340. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +291 -104
  341. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  342. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
  407. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  408. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +351 -0
  409. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +21 -0
  410. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  411. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  413. data/ext/sources/ggml/src/ggml-cuda/unary.cu +189 -5
  414. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +44 -0
  415. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +248 -6
  416. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  417. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +8 -0
  418. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +70 -37
  419. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +10 -3
  420. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
  421. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
  422. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
  423. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
  424. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  425. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  426. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
  427. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  428. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  429. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
  430. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
  431. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
  432. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
  433. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  434. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
  435. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
  436. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  437. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
  438. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
  439. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
  440. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
  441. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  442. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
  443. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  444. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  445. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
  446. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  447. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  448. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
  449. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
  450. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  451. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +16 -13
  452. data/ext/sources/ggml/src/ggml-impl.h +186 -15
  453. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  454. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  455. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  456. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  457. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +609 -0
  458. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1743 -0
  459. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +273 -0
  460. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1686 -0
  461. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +356 -61
  462. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +4161 -0
  463. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  464. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +724 -0
  465. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +4495 -1876
  466. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +21 -9
  467. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +29 -0
  468. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4005 -427
  469. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  470. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  471. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  472. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  473. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +147 -0
  474. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  475. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
  476. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  477. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  478. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  479. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  480. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  481. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  482. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  483. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  484. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  485. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  486. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  487. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  488. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
  489. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  490. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  491. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  492. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  493. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  494. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  495. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  496. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  497. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  498. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  499. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  500. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  501. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  502. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  503. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  504. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  505. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +94 -0
  506. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  507. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  508. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  509. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  510. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  511. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  512. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  513. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
  514. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  515. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  516. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  517. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +33 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  521. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  522. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  523. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  524. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +497 -195
  525. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
  526. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
  527. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  530. data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
  531. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +50 -30
  532. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  533. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +200 -99
  534. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  535. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  536. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -309
  537. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  538. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  539. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  540. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +77 -34
  541. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +397 -314
  542. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
  543. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  544. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  545. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +643 -413
  546. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  547. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  548. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  549. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +223 -132
  550. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +230 -55
  551. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  552. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  553. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  554. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  555. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  556. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
  557. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  558. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  559. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  560. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  561. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  562. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  563. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +65 -59
  564. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  565. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  566. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  567. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  568. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +330 -165
  569. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  570. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  571. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  572. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  573. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +60 -6
  574. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  575. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  576. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +7398 -2635
  577. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  578. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  579. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +43 -3
  580. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  581. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  582. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  583. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +15 -6
  584. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +56 -39
  585. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  586. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  587. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  588. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  589. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  590. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  591. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  592. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  593. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  594. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +5 -5
  595. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +67 -13
  596. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  597. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  598. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  599. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +158 -16
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +38 -3
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +4 -4
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +2 -2
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +4 -4
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +4 -4
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +2 -2
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +103 -36
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +220 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +139 -45
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +113 -38
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +75 -14
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +19 -17
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +4 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +33 -17
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +227 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +20 -14
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -556
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +230 -51
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +566 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +90 -223
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +195 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +41 -5
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +59 -9
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +104 -14
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -52
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -35
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -35
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +5 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +3 -3
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +30 -8
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +16 -6
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  740. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +435 -24
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +148 -6
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  743. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +619 -177
  744. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  745. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  746. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
  747. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3087 -0
  748. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  749. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  750. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  751. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  752. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  753. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +147 -0
  754. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
  755. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  756. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  757. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  758. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  759. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  760. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  761. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  762. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  763. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  764. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  765. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  766. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
  767. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  768. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  769. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
  770. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  771. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  772. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  773. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  774. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  775. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  776. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  777. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  778. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
  779. data/ext/sources/ggml/src/ggml.c +901 -129
  780. data/ext/sources/ggml/src/gguf.cpp +8 -1
  781. data/ext/sources/include/whisper.h +1 -0
  782. data/ext/sources/src/CMakeLists.txt +3 -1
  783. data/ext/sources/src/whisper.cpp +124 -81
  784. data/ext/sources/tests/CMakeLists.txt +8 -1
  785. data/ext/sources/tests/test-vad-full.cpp +7 -5
  786. data/ext/sources/tests/test-vad.cpp +3 -3
  787. data/extsources.rb +1 -0
  788. data/lib/whisper/model/uri.rb +17 -18
  789. data/sig/whisper.rbs +126 -2
  790. data/test/test_params.rb +24 -8
  791. data/test/test_segment.rb +0 -1
  792. data/test/test_token.rb +70 -0
  793. data/test/test_vad.rb +1 -1
  794. data/test/test_vad_context.rb +50 -0
  795. data/test/test_vad_segment.rb +19 -0
  796. data/test/test_vad_segments.rb +16 -0
  797. data/test/test_whisper.rb +8 -1
  798. data/whispercpp.gemspec +1 -1
  799. metadata +439 -179
  800. data/ext/sources/build-xcframework.sh +0 -547
  801. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  802. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  803. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  804. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  805. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  806. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  807. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  808. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  809. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  810. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  811. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  812. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  813. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  814. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  815. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  816. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  817. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  818. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  819. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  820. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  821. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  822. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  823. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  824. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  825. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  826. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  827. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -506
  828. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +0 -11
  829. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  830. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  831. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  832. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  833. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  834. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  835. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  836. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  837. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  838. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  839. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  840. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  841. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  842. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  843. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  844. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  845. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  846. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  847. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  848. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  849. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  850. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  851. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  852. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  853. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  854. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  855. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  856. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  857. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  858. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  859. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  860. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  861. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  862. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  863. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  864. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  865. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  866. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  867. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  868. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  869. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  870. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  871. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  872. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  873. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  874. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  875. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  876. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  877. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  878. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  879. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  880. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  881. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  882. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  883. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  884. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  885. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  886. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  887. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  888. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  889. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  890. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  891. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  892. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  893. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  894. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  895. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  896. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  897. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  898. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  899. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  900. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  901. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  902. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  903. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  904. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  905. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  906. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  907. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  908. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  909. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  910. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  911. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  912. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  913. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  914. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  915. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  916. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  917. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  918. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  919. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  920. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  921. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  922. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  923. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  924. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  925. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  926. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  927. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  928. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  929. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  930. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  931. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  932. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  933. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  934. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  935. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  936. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  937. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  938. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  939. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  940. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  941. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  942. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  943. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  944. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  945. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  946. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  947. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  948. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  949. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  950. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  951. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  952. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  953. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  954. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
  955. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +0 -162
  956. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -118
  957. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -99
  958. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -58
  959. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  960. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  961. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  962. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  963. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
@@ -22,23 +22,24 @@
22
22
 
23
23
  #include "ggml-cann.h"
24
24
 
25
+ #include "ggml-backend-impl.h"
26
+ #include "ggml-cann/aclnn_ops.h"
27
+ #include "ggml-cann/common.h"
28
+ #include "ggml-impl.h"
29
+ #include "ggml.h"
30
+
25
31
  #include <acl/acl.h>
32
+ #include <aclnnop/aclnn_trans_matmul_weight.h>
26
33
  #include <stdarg.h>
27
34
 
35
+ #include <chrono>
28
36
  #include <cmath>
29
37
  #include <cstdio>
30
38
  #include <cstring>
31
39
  #include <mutex>
40
+ #include <optional>
32
41
  #include <queue>
33
- #include <chrono>
34
42
  #include <unordered_set>
35
- #include <optional>
36
-
37
- #include "ggml-impl.h"
38
- #include "ggml-backend-impl.h"
39
- #include "ggml-cann/aclnn_ops.h"
40
- #include "ggml-cann/common.h"
41
- #include "ggml.h"
42
43
 
43
44
  #define GGML_COMMON_DECL_C
44
45
 
@@ -55,33 +56,41 @@
55
56
  * @param line The line number where the error occurred.
56
57
  * @param msg The error message.
57
58
  */
58
- [[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
59
- const char* file, int line, const char* msg) {
59
+ [[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
60
60
  int32_t id = -1;
61
61
  aclrtGetDevice(&id);
62
62
 
63
63
  GGML_LOG_ERROR("CANN error: %s\n", msg);
64
- GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
65
- file, line);
64
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
66
65
  GGML_LOG_ERROR(" %s\n", stmt);
67
66
  // abort with GGML_ASSERT to get a stack trace
68
67
  GGML_ABORT("CANN error");
69
68
  }
70
69
 
70
+ // Thread-local variable to record the current device of this thread.
71
+ thread_local int g_current_cann_device = -1;
72
+
71
73
  /**
72
- * @brief Sets the device to be used by CANN.
74
+ * @brief Set the CANN device to be used.
73
75
  *
74
- * @param device The device ID to set.
76
+ * @param device The target device ID to set.
75
77
  */
76
78
  void ggml_cann_set_device(const int32_t device) {
77
- // TODO: uncomment these lines after empty context has fixed.
78
- // int current_device;
79
- // ACL_CHECK(aclrtGetDevice(&current_device));
79
+ // int current_device = -1;
80
+ // Note: In some CANN versions, if no device has been set yet,
81
+ // aclrtGetDevice(&current_device) may return 0 by default.
82
+ // aclrtGetDevice(&current_device);
83
+
84
+ // If the current device is already the target one, no need to switch.
85
+ if (device == g_current_cann_device) {
86
+ return;
87
+ }
80
88
 
81
- // if (device == current_device) {
82
- // return;
83
- // }
89
+ // Switch to the new device.
84
90
  ACL_CHECK(aclrtSetDevice(device));
91
+
92
+ // Update the global device record.
93
+ g_current_cann_device = device;
85
94
  }
86
95
 
87
96
  /**
@@ -96,12 +105,14 @@ int32_t ggml_cann_get_device() {
96
105
  }
97
106
 
98
107
  /**
99
- * @brief Get the value of the specified environment variable (name).
108
+ * @brief Get the value of the specified environment variable (name) as lowercase.
100
109
  * if not empty, return a std::string object
101
110
  */
102
- std::optional<std::string> get_env(const std::string& name) {
103
- const char* val = std::getenv(name.c_str());
104
- if (!val) return std::nullopt;
111
+ std::optional<std::string> get_env_as_lowercase(const std::string & name) {
112
+ const char * val = std::getenv(name.c_str());
113
+ if (!val) {
114
+ return std::nullopt;
115
+ }
105
116
  std::string res = std::string(val);
106
117
  std::transform(res.begin(), res.end(), res.begin(), ::tolower);
107
118
  return res;
@@ -110,11 +121,29 @@ std::optional<std::string> get_env(const std::string& name) {
110
121
  /**
111
122
  * @brief Verify whether the environment variable is a valid value.
112
123
  */
113
- bool parse_bool(const std::string& value) {
114
- std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
124
+ bool parse_bool(const std::string & value) {
125
+ static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
115
126
  return valid_values.find(value) != valid_values.end();
116
127
  }
117
128
 
129
+ /**
130
+ * @brief Parse a string as an integer, returning 0 if invalid.
131
+ *
132
+ * This function attempts to convert the input string `value` to an `int`.
133
+ * If the string is not a valid integer or is out of the `int` range,
134
+ * it returns 0.
135
+ *
136
+ * @param value The string to parse.
137
+ * @return The parsed integer, or 0 if conversion fails.
138
+ */
139
+ int parse_integer(const std::string & value) {
140
+ try {
141
+ return std::stoi(value);
142
+ } catch (...) {
143
+ return 0;
144
+ }
145
+ }
146
+
118
147
  /**
119
148
  * @brief Initialize the CANN device information.
120
149
  *
@@ -126,11 +155,10 @@ bool parse_bool(const std::string& value) {
126
155
  static ggml_cann_device_info ggml_cann_init() {
127
156
  ggml_cann_device_info info = {};
128
157
 
129
- aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
158
+ aclError err = aclrtGetDeviceCount((uint32_t *) &info.device_count);
130
159
 
131
160
  if (err != ACL_SUCCESS) {
132
- GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
133
- __func__, aclGetRecentErrMsg());
161
+ GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg());
134
162
  return info;
135
163
  }
136
164
 
@@ -138,16 +166,15 @@ static ggml_cann_device_info ggml_cann_init() {
138
166
 
139
167
  for (int id = 0; id < info.device_count; ++id) {
140
168
  aclrtPhysicalMemProp prop = {};
141
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
142
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
143
- prop.memAttr = ACL_HBM_MEM_HUGE;
144
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
145
- prop.location.id = id;
146
- prop.reserve = 0;
147
- err = aclrtMemGetAllocationGranularity(
148
- &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
149
- &info.devices[id].vmm_granularity);
150
- info.devices[id].vmm = err == ACL_SUCCESS;
169
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
170
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
171
+ prop.memAttr = ACL_HBM_MEM_HUGE;
172
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
173
+ prop.location.id = id;
174
+ prop.reserve = 0;
175
+ err = aclrtMemGetAllocationGranularity(&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
176
+ &info.devices[id].vmm_granularity);
177
+ info.devices[id].vmm = err == ACL_SUCCESS;
151
178
 
152
179
  size_t free, total;
153
180
  ggml_backend_cann_get_device_memory(id, &free, &total);
@@ -167,7 +194,7 @@ static ggml_cann_device_info ggml_cann_init() {
167
194
  *
168
195
  * @return A reference to the structure containing the device information.
169
196
  */
170
- const ggml_cann_device_info& ggml_cann_info() {
197
+ const ggml_cann_device_info & ggml_cann_info() {
171
198
  static ggml_cann_device_info info = ggml_cann_init();
172
199
  return info;
173
200
  }
@@ -187,7 +214,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
187
214
  /**
188
215
  * @brief The minimum free margin for a buffer.
189
216
  */
190
- static const size_t min_free_margin = 1ull << 20; // 1MB
217
+ static const size_t min_free_margin = 1ull << 20; // 1MB
191
218
 
192
219
  /**
193
220
  * @brief The alignment for buffer allocation.
@@ -208,22 +235,18 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
208
235
  * @brief Structure representing a CANN buffer.
209
236
  */
210
237
  struct ggml_cann_buffer {
211
- void* ptr = nullptr; ///< Pointer to the buffer.
212
- size_t size = 0; ///< Size of the buffer.
213
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
238
+ void * ptr = nullptr; ///< Pointer to the buffer.
239
+ size_t size = 0; ///< Size of the buffer.
240
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
214
241
 
215
- bool operator>(const ggml_cann_buffer& other) const {
216
- return size > other.size;
217
- }
242
+ bool operator>(const ggml_cann_buffer & other) const { return size > other.size; }
218
243
  };
219
244
 
220
245
  /**
221
246
  * @brief Array of CANN buffers in the pool.
222
247
  */
223
- std::unordered_map<void*, size_t> buffer_pool;
224
- std::priority_queue<ggml_cann_buffer,
225
- std::vector<ggml_cann_buffer>,
226
- std::greater<>> free_buffers ;
248
+ std::unordered_map<void *, size_t> buffer_pool;
249
+ std::priority_queue<ggml_cann_buffer, std::vector<ggml_cann_buffer>, std::greater<>> free_buffers;
227
250
 
228
251
  /**
229
252
  * @brief Total size of all buffers in the pool.
@@ -236,7 +259,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
236
259
  * @param device The device ID to associate with this buffer pool.
237
260
  */
238
261
  explicit ggml_cann_pool_buf_prio(int device) : device(device) {
239
- disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
262
+ disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
240
263
  }
241
264
 
242
265
  /**
@@ -244,7 +267,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
244
267
  */
245
268
  ~ggml_cann_pool_buf_prio() {
246
269
  ggml_cann_set_device(device);
247
- for (auto& [b_ptr, b_size] : buffer_pool) {
270
+ for (auto & [b_ptr, b_size] : buffer_pool) {
248
271
  aclrtFree(b_ptr);
249
272
  pool_size -= b_size;
250
273
  }
@@ -260,14 +283,14 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
260
283
  * the allocated buffer.
261
284
  * @return A pointer to the allocated buffer.
262
285
  */
263
- void* alloc(size_t size, size_t* actual_size) override {
286
+ void * alloc(size_t size, size_t * actual_size) override {
264
287
  size = GGML_PAD(size, alignment);
265
288
  if (size == 0) {
266
289
  size = alignment;
267
290
  }
268
291
 
269
- void* ptr = nullptr;
270
- auto now = std::chrono::steady_clock::now();
292
+ void * ptr = nullptr;
293
+ auto now = std::chrono::steady_clock::now();
271
294
 
272
295
  std::vector<ggml_cann_buffer> free_buffers_rest;
273
296
  free_buffers_rest.reserve(free_buffers.size());
@@ -280,24 +303,22 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
280
303
  const size_t margin = b.size - size;
281
304
  if (margin <= max_reuse_margin) {
282
305
  *actual_size = b.size;
283
- ptr = b.ptr;
306
+ ptr = b.ptr;
284
307
  #ifdef DEBUG_CANN_MALLOC
285
308
  GGML_LOG_INFO(
286
309
  "cann pool[%d]: reused %p, "
287
310
  "pool_size = %5u MB, "
288
311
  "size = %5u MB, "
289
312
  "margin = %5u MB\n",
290
- device, b.ptr,
291
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
292
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
293
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
313
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
314
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
315
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
294
316
  #endif
295
317
  break;
296
318
  }
297
319
  }
298
320
 
299
- bool should_clean = !disable_clean &&
300
- b.size > min_free_margin &&
321
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
301
322
  std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
302
323
  if (should_clean) {
303
324
  // free the buffer if the size is needed to be freed
@@ -309,20 +330,20 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
309
330
  "cann pool[%d]: clean %p, "
310
331
  "pool_size = %5u MB, "
311
332
  "size = %5u MB\n",
312
- device, b.ptr,
313
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
314
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
333
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
334
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
315
335
  #endif
316
336
  continue;
317
337
  }
318
338
  free_buffers_rest.push_back(b);
319
339
  }
320
- for (ggml_cann_buffer &b : free_buffers_rest) {
340
+ for (ggml_cann_buffer & b : free_buffers_rest) {
321
341
  free_buffers.push(std::move(b));
322
342
  }
323
343
 
324
344
  #ifdef DEBUG_CANN_MALLOC
325
- GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
345
+ GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
346
+ (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
326
347
  #endif
327
348
  if (ptr != nullptr) {
328
349
  return ptr;
@@ -338,8 +359,8 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
338
359
  "cann pool[%d]: allocate %p, "
339
360
  "pool_size = %5u MB, "
340
361
  "size = %5u MB\n",
341
- device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
342
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
362
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
363
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576));
343
364
  #endif
344
365
  buffer_pool.emplace(ptr, size);
345
366
  return ptr;
@@ -351,7 +372,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
351
372
  * @param ptr Pointer to the buffer to free.
352
373
  * @param size Size of the buffer to free.
353
374
  */
354
- void free(void* ptr, size_t size) override {
375
+ void free(void * ptr, size_t size) override {
355
376
  GGML_UNUSED(size);
356
377
  auto it = buffer_pool.find(ptr);
357
378
  if (it == buffer_pool.end()) {
@@ -359,13 +380,12 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
359
380
  }
360
381
 
361
382
  auto now = std::chrono::steady_clock::now();
362
- free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
383
+ free_buffers.emplace(ggml_cann_buffer{ ptr, it->second, now });
363
384
  #ifdef DEBUG_CANN_MALLOC
364
385
  GGML_LOG_INFO(
365
386
  "cann pool[%d]: return %p, "
366
387
  "pool_size = %5u MB\n",
367
- device, ptr,
368
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
388
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
369
389
  #endif
370
390
  }
371
391
  };
@@ -384,7 +404,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
384
404
  /**
385
405
  * @brief The minimum free margin for a buffer.
386
406
  */
387
- static const size_t min_free_margin = 1ull << 20; // 1MB
407
+ static const size_t min_free_margin = 1ull << 20; // 1MB
388
408
 
389
409
  /**
390
410
  * @brief The alignment for buffer allocation.
@@ -410,10 +430,10 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
410
430
  * @brief Structure representing a CANN buffer.
411
431
  */
412
432
  struct ggml_cann_buffer {
413
- void* ptr = nullptr; ///< Pointer to the buffer memory.
414
- size_t size = 0; ///< Size of the buffer.
415
- bool used = false; ///< Whether the buffer is currently in use.
416
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
433
+ void * ptr = nullptr; ///< Pointer to the buffer memory.
434
+ size_t size = 0; ///< Size of the buffer.
435
+ bool used = false; ///< Whether the buffer is currently in use.
436
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
417
437
  };
418
438
 
419
439
  /**
@@ -432,7 +452,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
432
452
  * @param device The device ID to associate with this buffer pool.
433
453
  */
434
454
  explicit ggml_cann_pool_buf(int device) : device(device) {
435
- disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
455
+ disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
436
456
  }
437
457
 
438
458
  /**
@@ -441,7 +461,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
441
461
  ~ggml_cann_pool_buf() {
442
462
  ggml_cann_set_device(device);
443
463
  for (int i = 0; i < MAX_BUFFERS; ++i) {
444
- ggml_cann_buffer& b = buffer_pool[i];
464
+ ggml_cann_buffer & b = buffer_pool[i];
445
465
  if (b.ptr != nullptr) {
446
466
  aclrtFree(b.ptr);
447
467
  pool_size -= b.size;
@@ -458,18 +478,18 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
458
478
  * the allocated buffer.
459
479
  * @return A pointer to the allocated buffer.
460
480
  */
461
- void* alloc(size_t size, size_t* actual_size) override {
481
+ void * alloc(size_t size, size_t * actual_size) override {
462
482
  size = GGML_PAD(size, alignment);
463
483
  if (size == 0) {
464
484
  size = alignment;
465
485
  }
466
486
 
467
- void* ptr = nullptr;
468
- auto now = std::chrono::steady_clock::now();
487
+ void * ptr = nullptr;
488
+ auto now = std::chrono::steady_clock::now();
469
489
 
470
490
  int i = 0;
471
491
  for (; i < MAX_BUFFERS; ++i) {
472
- ggml_cann_buffer& b = buffer_pool[i];
492
+ ggml_cann_buffer & b = buffer_pool[i];
473
493
  if (b.ptr == nullptr) {
474
494
  break;
475
495
  }
@@ -481,25 +501,23 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
481
501
  const size_t margin = b.size - size;
482
502
  if (margin <= max_reuse_margin) {
483
503
  *actual_size = b.size;
484
- b.used = true;
485
- ptr = b.ptr;
504
+ b.used = true;
505
+ ptr = b.ptr;
486
506
  #ifdef DEBUG_CANN_MALLOC
487
507
  GGML_LOG_INFO(
488
508
  "cann pool[%d]: reused %p, "
489
509
  "pool_size = %5u MB, "
490
510
  "size = %5u MB, "
491
511
  "margin = %5u MB\n",
492
- device, b.ptr,
493
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
494
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
495
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
512
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
513
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
514
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
496
515
  #endif
497
516
  break;
498
517
  }
499
518
  }
500
519
 
501
- bool should_clean = !disable_clean &&
502
- b.size > min_free_margin &&
520
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
503
521
  std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
504
522
  if (should_clean) {
505
523
  // free the buffer if the size is needed to be freed
@@ -510,9 +528,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
510
528
  "cann pool[%d]: clean %p, "
511
529
  "pool_size = %5u MB, "
512
530
  "size = %5u MB\n",
513
- device, b.ptr,
514
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
515
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
531
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
532
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
516
533
  #endif
517
534
  b.ptr = nullptr;
518
535
  }
@@ -523,13 +540,13 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
523
540
 
524
541
  if (i < MAX_BUFFERS) {
525
542
  // allocate a new buffer if no buffer can be reused
526
- ggml_cann_buffer& b = buffer_pool[i];
543
+ ggml_cann_buffer & b = buffer_pool[i];
527
544
  ggml_cann_set_device(device);
528
545
  ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
529
546
  pool_size += size;
530
547
  *actual_size = size;
531
- b.size = size;
532
- b.used = true;
548
+ b.size = size;
549
+ b.used = true;
533
550
  if (i >= MAX_BUFFERS - 8) {
534
551
  GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
535
552
  }
@@ -538,9 +555,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
538
555
  "cann pool[%d]: allocate %p, "
539
556
  "pool_size = %5u MB, "
540
557
  "size = %5u MB\n",
541
- device, b.ptr,
542
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
543
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
558
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
559
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
544
560
  #endif
545
561
  return b.ptr;
546
562
  }
@@ -554,21 +570,20 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
554
570
  * @param ptr Pointer to the buffer to free.
555
571
  * @param size Size of the buffer to free.
556
572
  */
557
- void free(void* ptr, size_t size) override {
573
+ void free(void * ptr, size_t size) override {
558
574
  GGML_UNUSED(size);
559
575
  for (int i = 0; i < MAX_BUFFERS; ++i) {
560
- ggml_cann_buffer& b = buffer_pool[i];
576
+ ggml_cann_buffer & b = buffer_pool[i];
561
577
  if (b.ptr != ptr) {
562
578
  continue;
563
579
  }
564
- b.used = false;
580
+ b.used = false;
565
581
  b.last_used = std::chrono::steady_clock::now();
566
582
  #ifdef DEBUG_CANN_MALLOC
567
583
  GGML_LOG_INFO(
568
584
  "cann pool[%d]: return %p, "
569
585
  "pool_size = %5u MB\n",
570
- device, b.ptr,
571
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
586
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
572
587
  #endif
573
588
  return;
574
589
  }
@@ -596,7 +611,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
596
611
  /**
597
612
  * @brief Pointer to the start of the virtual memory pool.
598
613
  */
599
- void* pool_addr = 0;
614
+ void * pool_addr = 0;
600
615
 
601
616
  /**
602
617
  * @brief Amount of virtual memory used in the pool.
@@ -621,7 +636,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
621
636
  /**
622
637
  * @brief Offsets for the mapped memory regions.
623
638
  */
624
- std::vector<void*> map_offsets;
639
+ std::vector<void *> map_offsets;
625
640
 
626
641
  /**
627
642
  * @brief Constructor to initialize the buffer pool with virtual memory for
@@ -629,11 +644,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
629
644
  *
630
645
  * @param device The device ID to associate with this buffer pool.
631
646
  */
632
- explicit ggml_cann_pool_vmm(int device)
633
- : device(device) {
634
- auto dev = ggml_cann_info().devices[device];
647
+ explicit ggml_cann_pool_vmm(int device) : device(device) {
648
+ auto dev = ggml_cann_info().devices[device];
635
649
  granularity = dev.vmm_granularity;
636
- max_size = dev.total_vram;
650
+ max_size = dev.total_vram;
637
651
  }
638
652
 
639
653
  /**
@@ -641,10 +655,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
641
655
  */
642
656
  ~ggml_cann_pool_vmm() {
643
657
  if (pool_addr != 0) {
644
- for (auto& offset : map_offsets) {
658
+ for (auto & offset : map_offsets) {
645
659
  ACL_CHECK(aclrtUnmapMem(offset));
646
660
  }
647
- for (auto& handle : handles) {
661
+ for (auto & handle : handles) {
648
662
  ACL_CHECK(aclrtFreePhysical(handle));
649
663
  }
650
664
  ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
@@ -659,11 +673,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
659
673
  * the allocated buffer.
660
674
  * @return A pointer to the allocated buffer.
661
675
  */
662
- void* alloc(size_t size, size_t* actual_size) override {
676
+ void * alloc(size_t size, size_t * actual_size) override {
663
677
  // round up the allocation size to the alignment to ensure that all
664
678
  // allocations are aligned for all data types
665
679
  const size_t alignment = 128;
666
- size = GGML_PAD(size, alignment);
680
+ size = GGML_PAD(size, alignment);
667
681
  if (size == 0) {
668
682
  size = alignment;
669
683
  }
@@ -673,53 +687,51 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
673
687
  if (size > avail) {
674
688
  // round up to the next multiple of the granularity
675
689
  size_t reserve_size = size - avail;
676
- reserve_size = GGML_PAD(reserve_size, granularity);
690
+ reserve_size = GGML_PAD(reserve_size, granularity);
677
691
 
678
692
  GGML_ASSERT(pool_size + reserve_size <= max_size);
679
693
 
680
694
  // allocate more physical memory
681
695
  aclrtPhysicalMemProp prop = {};
682
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
683
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
684
- prop.memAttr = ACL_HBM_MEM_HUGE;
685
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
686
- prop.location.id = device;
687
- prop.reserve = 0;
696
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
697
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
698
+ prop.memAttr = ACL_HBM_MEM_HUGE;
699
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
700
+ prop.location.id = device;
701
+ prop.reserve = 0;
688
702
  aclrtDrvMemHandle handle;
689
703
  ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
690
704
 
691
705
  // reserve virtual address space (if not already reserved)
692
706
  if (pool_addr == 0) {
693
- ACL_CHECK(aclrtReserveMemAddress(
694
- &pool_addr, max_size, 0, NULL, 1));
707
+ ACL_CHECK(aclrtReserveMemAddress(&pool_addr, max_size, 0, NULL, 1));
695
708
  }
696
709
 
697
710
  // map at the end of the pool
698
- ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0,
699
- handle, 0));
711
+ ACL_CHECK(aclrtMapMem((char *) pool_addr + pool_size, reserve_size, 0, handle, 0));
700
712
 
701
713
  handles.push_back(handle);
702
- map_offsets.push_back((char*)pool_addr + pool_size);
714
+ map_offsets.push_back((char *) pool_addr + pool_size);
703
715
 
704
716
  // add to the pool
705
717
  pool_size += reserve_size;
706
718
 
707
719
  #ifdef DEBUG_CANN_MALLOC
708
- GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
709
- device, (unsigned long long) (pool_size/1024/1024),
710
- (unsigned long long) (reserve_size/1024/1024));
720
+ GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", device,
721
+ (unsigned long long) (pool_size / 1024 / 1024),
722
+ (unsigned long long) (reserve_size / 1024 / 1024));
711
723
  #endif
712
724
  }
713
725
 
714
726
  GGML_ASSERT(pool_addr != 0);
715
727
 
716
- void* ptr = (void*)((char*)pool_addr + pool_used);
728
+ void * ptr = (void *) ((char *) pool_addr + pool_used);
717
729
  *actual_size = size;
718
730
  pool_used += size;
719
731
 
720
732
  #ifdef DEBUG_CANN_MALLOC
721
- GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
722
- (unsigned long long)size, (unsigned long long)ptr);
733
+ GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size,
734
+ (unsigned long long) ptr);
723
735
  #endif
724
736
  return ptr;
725
737
  }
@@ -730,16 +742,16 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
730
742
  * @param ptr Pointer to the buffer to free.
731
743
  * @param size Size of the buffer to free.
732
744
  */
733
- void free(void* ptr, size_t size) override {
745
+ void free(void * ptr, size_t size) override {
734
746
  #ifdef DEBUG_CANN_MALLOC
735
- GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
736
- (unsigned long long)size, (unsigned long long)ptr);
747
+ GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size,
748
+ (unsigned long long) ptr);
737
749
  #endif
738
750
 
739
751
  pool_used -= size;
740
752
 
741
753
  // all deallocations must be in reverse order of the allocations
742
- GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used));
754
+ GGML_ASSERT(ptr == (void *) ((char *) pool_addr + pool_used));
743
755
  }
744
756
  };
745
757
 
@@ -751,9 +763,8 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
751
763
  * @param device The device ID for which to create the pool.
752
764
  * @return A unique pointer to the created CANN pool.
753
765
  */
754
- std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
755
- int device) {
756
- std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
766
+ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
767
+ std::string mem_pool_type = get_env_as_lowercase("GGML_CANN_MEM_POOL").value_or("");
757
768
 
758
769
  if (mem_pool_type == "prio") {
759
770
  GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
@@ -777,9 +788,8 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
777
788
  * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
778
789
  */
779
790
  struct ggml_backend_cann_buffer_context {
780
- int32_t device; ///< The device ID associated with this buffer context.
781
- void* dev_ptr =
782
- nullptr; ///< Pointer to the device memory allocated for the buffer.
791
+ int32_t device; ///< The device ID associated with this buffer context.
792
+ void * dev_ptr = nullptr; ///< Pointer to the device memory allocated for the buffer.
783
793
 
784
794
  /**
785
795
  * @brief Constructor to initialize the CANN buffer context.
@@ -787,9 +797,7 @@ struct ggml_backend_cann_buffer_context {
787
797
  * @param device The device ID associated with this buffer context.
788
798
  * @param dev_ptr Pointer to the device memory allocated for the buffer.
789
799
  */
790
- ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
791
- : device(device),
792
- dev_ptr(dev_ptr) {}
800
+ ggml_backend_cann_buffer_context(int32_t device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
793
801
 
794
802
  /**
795
803
  * @brief Destructor to free the device memory allocated for the buffer.
@@ -807,8 +815,8 @@ struct ggml_backend_cann_buffer_context {
807
815
  * @return true if the buffer is a CANN buffer, false otherwise.
808
816
  */
809
817
  static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
810
- static bool ggml_backend_buffer_is_cann(
811
- ggml_backend_buffer_t buffer) {
818
+
819
+ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
812
820
  return ggml_backend_buft_is_cann(buffer->buft);
813
821
  }
814
822
 
@@ -820,10 +828,8 @@ static bool ggml_backend_buffer_is_cann(
820
828
  *
821
829
  * @param buffer The CANN buffer to free.
822
830
  */
823
- static void ggml_backend_cann_buffer_free_buffer(
824
- ggml_backend_buffer_t buffer) {
825
- ggml_backend_cann_buffer_context* ctx =
826
- (ggml_backend_cann_buffer_context*)buffer->context;
831
+ static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
832
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
827
833
  delete ctx;
828
834
  }
829
835
 
@@ -836,10 +842,8 @@ static void ggml_backend_cann_buffer_free_buffer(
836
842
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
837
843
  * @return A pointer to the base of the device memory allocated for the buffer.
838
844
  */
839
- static void* ggml_backend_cann_buffer_get_base(
840
- ggml_backend_buffer_t buffer) {
841
- ggml_backend_cann_buffer_context* ctx =
842
- (ggml_backend_cann_buffer_context*)buffer->context;
845
+ static void * ggml_backend_cann_buffer_get_base(ggml_backend_buffer_t buffer) {
846
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
843
847
  return ctx->dev_ptr;
844
848
  }
845
849
 
@@ -856,21 +860,17 @@ static void* ggml_backend_cann_buffer_get_base(
856
860
  * @param dst Pointer to the destination buffer where transformed data will be
857
861
  * stored.
858
862
  */
859
- static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
860
- const void* src,
861
- void* dst) {
862
-
863
- int64_t n_elems = ggml_nelements(tensor);
864
- int64_t groups = n_elems / QK4_0;
865
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
863
+ static void ggml_backend_cann_transform_q4_0(ggml_tensor * tensor, const void * src, void * dst) {
864
+ int64_t n_elems = ggml_nelements(tensor);
865
+ int64_t groups = n_elems / QK4_0;
866
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
866
867
 
867
- uint8_t* quant_offset = (uint8_t*)dst;
868
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
868
+ uint8_t * quant_offset = (uint8_t *) dst;
869
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
869
870
 
870
871
  for (int i = 0; i < groups; i++) {
871
- const block_q4_0* group =
872
- (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0));
873
- *scale_offset = group->d;
872
+ const block_q4_0 * group = (const block_q4_0 *) ((const char *) src + i * sizeof(block_q4_0));
873
+ *scale_offset = group->d;
874
874
  scale_offset++;
875
875
 
876
876
  // 0-15
@@ -889,8 +889,7 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
889
889
  }
890
890
 
891
891
  // put (uint4b_t -8) into int4b_t
892
- for (quant_offset = (uint8_t*)dst;
893
- quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
892
+ for (quant_offset = (uint8_t *) dst; quant_offset < (uint8_t *) dst + quant_bytes; quant_offset++) {
894
893
  (*quant_offset) ^= 0x88;
895
894
  }
896
895
  }
@@ -908,29 +907,27 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
908
907
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
909
908
  * will be stored.
910
909
  */
911
- static void ggml_backend_cann_transform_back_q4_0(
912
- const ggml_tensor* tensor, void* src, void* dst) {
910
+ static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor * tensor, void * src, void * dst) {
911
+ int64_t n_elems = ggml_nelements(tensor);
912
+ int64_t groups = n_elems / QK4_0;
913
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
913
914
 
914
- int64_t n_elems = ggml_nelements(tensor);
915
- int64_t groups = n_elems / QK4_0;
916
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
915
+ uint8_t * quant_offset = (uint8_t *) src;
916
+ uint16_t * scale_offset = (uint16_t *) ((char *) src + quant_bytes);
917
917
 
918
- uint8_t* quant_offset = (uint8_t*)src;
919
- uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
920
-
921
- for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) {
918
+ for (; quant_offset < (uint8_t *) src + quant_bytes; quant_offset++) {
922
919
  (*quant_offset) ^= 0x88;
923
920
  }
924
- quant_offset = (uint8_t*)src;
921
+ quant_offset = (uint8_t *) src;
925
922
 
926
923
  for (int i = 0; i < groups; i++) {
927
- block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0));
928
- group->d = *scale_offset;
924
+ block_q4_0 * group = (block_q4_0 *) ((char *) dst + i * sizeof(block_q4_0));
925
+ group->d = *scale_offset;
929
926
  scale_offset++;
930
927
 
931
928
  // 0-15
932
929
  for (int j = 0; j < QK4_0 / 2; j += 2) {
933
- group->qs[j] = ((*quant_offset) & 0x0F);
930
+ group->qs[j] = ((*quant_offset) & 0x0F);
934
931
  group->qs[j + 1] = ((*quant_offset) >> 4);
935
932
  quant_offset++;
936
933
  }
@@ -957,20 +954,17 @@ static void ggml_backend_cann_transform_back_q4_0(
957
954
  * @param dst Pointer to the destination buffer where transformed data will be
958
955
  * stored.
959
956
  */
960
- static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
961
- const void* src,
962
- void* dst) {
963
- int64_t n_elems = ggml_nelements(tensor);
964
- int64_t groups = n_elems / QK8_0;
965
- size_t quant_bytes = n_elems * sizeof(uint8_t);
957
+ static void ggml_backend_cann_transform_q8_0(ggml_tensor * tensor, const void * src, void * dst) {
958
+ int64_t n_elems = ggml_nelements(tensor);
959
+ int64_t groups = n_elems / QK8_0;
960
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
966
961
 
967
- uint8_t* quant_offset = (uint8_t*)dst;
968
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
962
+ uint8_t * quant_offset = (uint8_t *) dst;
963
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
969
964
 
970
965
  for (int i = 0; i < groups; i++) {
971
- const block_q8_0* group =
972
- (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0));
973
- *scale_offset = group->d;
966
+ const block_q8_0 * group = (const block_q8_0 *) ((const char *) src + i * sizeof(block_q8_0));
967
+ *scale_offset = group->d;
974
968
  scale_offset++;
975
969
  size_t group_quant_size = QK8_0 * sizeof(uint8_t);
976
970
  memcpy(quant_offset, group->qs, group_quant_size);
@@ -991,19 +985,17 @@ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
991
985
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
992
986
  * will be stored.
993
987
  */
994
- static void ggml_backend_cann_transform_back_q8_0(
995
- const ggml_tensor* tensor, const void* src, void* dst) {
996
- int64_t n_elems = ggml_nelements(tensor);
997
- int64_t groups = n_elems / QK8_0;
998
- size_t quant_bytes = n_elems * sizeof(uint8_t);
988
+ static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor * tensor, const void * src, void * dst) {
989
+ int64_t n_elems = ggml_nelements(tensor);
990
+ int64_t groups = n_elems / QK8_0;
991
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
999
992
 
1000
- const uint8_t* quant_offset = (const uint8_t*)src;
1001
- const uint16_t* scale_offset =
1002
- (const uint16_t*)((const char*)src + quant_bytes);
993
+ const uint8_t * quant_offset = (const uint8_t *) src;
994
+ const uint16_t * scale_offset = (const uint16_t *) ((const char *) src + quant_bytes);
1003
995
 
1004
996
  for (int i = 0; i < groups; i++) {
1005
- block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
1006
- group->d = *scale_offset;
997
+ block_q8_0 * group = (block_q8_0 *) ((char *) dst + i * sizeof(block_q8_0));
998
+ group->d = *scale_offset;
1007
999
  scale_offset++;
1008
1000
  size_t group_quant_size = QK8_0 * sizeof(uint8_t);
1009
1001
  memcpy(group->qs, quant_offset, group_quant_size);
@@ -1023,8 +1015,7 @@ static void ggml_backend_cann_transform_back_q8_0(
1023
1015
  * @param dst Pointer to the destination buffer where transformed data will be
1024
1016
  * stored.
1025
1017
  */
1026
- static void ggml_backend_cann_transform(ggml_tensor* tensor,
1027
- const void* src, void* dst) {
1018
+ static void ggml_backend_cann_transform(ggml_tensor * tensor, const void * src, void * dst) {
1028
1019
  switch (tensor->type) {
1029
1020
  case GGML_TYPE_Q4_0:
1030
1021
  ggml_backend_cann_transform_q4_0(tensor, src, dst);
@@ -1049,8 +1040,7 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor,
1049
1040
  * @param dst Pointer to the destination buffer where transformed tensor data
1050
1041
  * will be stored.
1051
1042
  */
1052
- static void ggml_backend_cann_transform_back(
1053
- const ggml_tensor* tensor, void* src, void* dst) {
1043
+ static void ggml_backend_cann_transform_back(const ggml_tensor * tensor, void * src, void * dst) {
1054
1044
  switch (tensor->type) {
1055
1045
  case GGML_TYPE_Q4_0:
1056
1046
  ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
@@ -1091,8 +1081,7 @@ static bool need_transform(ggml_type type) {
1091
1081
  * @param buffer The CANN buffer from which to initialize the tensor.
1092
1082
  * @param tensor Pointer to the tensor to be initialized.
1093
1083
  */
1094
- static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1095
- ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
1084
+ static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
1096
1085
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
1097
1086
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
1098
1087
  return GGML_STATUS_SUCCESS;
@@ -1103,18 +1092,105 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1103
1092
  if (ggml_is_quantized(tensor->type)) {
1104
1093
  // Initialize padding to 0 to avoid possible NaN values
1105
1094
  size_t original_size = ggml_nbytes(tensor);
1106
- size_t padded_size =
1107
- ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
1095
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
1108
1096
 
1109
1097
  if (padded_size > original_size && tensor->view_src == nullptr) {
1110
1098
  size_t memset_size = padded_size - original_size;
1111
- ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
1112
- memset_size, 0, memset_size));
1099
+ ACL_CHECK(aclrtMemset((char *) tensor->data + original_size, memset_size, 0, memset_size));
1113
1100
  }
1114
1101
  }
1115
1102
  return GGML_STATUS_SUCCESS;
1116
1103
  }
1117
1104
 
1105
+ /**
1106
+ * @brief Workspace for caching NZ buffers per device.
1107
+ *
1108
+ * This struct manages a device buffer used in NZ computations. It supports
1109
+ * allocation, reallocation, and clearing of cached memory. The struct is
1110
+ * designed to be used with a global array, one per device.
1111
+ */
1112
+ struct ggml_cann_nz_workspace {
1113
+ void * ptr; // Pointer to allocated device buffer
1114
+ size_t allocated; // Size of currently allocated buffer in bytes
1115
+
1116
+ /**
1117
+ * @brief Constructor. Initializes the workspace with no allocated memory.
1118
+ */
1119
+ ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
1120
+
1121
+ /**
1122
+ * @brief Free cached memory and reset the workspace.
1123
+ *
1124
+ * If a buffer has been allocated, this function releases it using
1125
+ * aclrtFree and resets internal state.
1126
+ */
1127
+ void clear() {
1128
+ if (ptr) {
1129
+ ACL_CHECK(aclrtFree(ptr));
1130
+ ptr = nullptr;
1131
+ allocated = 0;
1132
+ }
1133
+ }
1134
+
1135
+ /**
1136
+ * @brief Allocate or reallocate the workspace buffer.
1137
+ *
1138
+ * If the requested size is larger than the currently allocated size,
1139
+ * the old buffer will be freed and a new buffer of the requested size
1140
+ * will be allocated on the device.
1141
+ *
1142
+ * @param new_size Size in bytes to allocate for the workspace.
1143
+ */
1144
+ void realloc(size_t new_size) {
1145
+ if (new_size > allocated) {
1146
+ clear();
1147
+ ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
1148
+ allocated = new_size;
1149
+ }
1150
+ }
1151
+
1152
+ /**
1153
+ * @brief Get the device buffer pointer.
1154
+ *
1155
+ * @return Pointer to the allocated buffer, or nullptr if not allocated.
1156
+ */
1157
+ void * get() const { return ptr; }
1158
+ };
1159
+
1160
+ /**
1161
+ * @brief Global array of NZ workspaces, one per device.
1162
+ */
1163
+ static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
1164
+
1165
+ /**
1166
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
1167
+ *
1168
+ * This function creates a transposed tensor descriptor and performs the
1169
+ * TransMatmulWeight operation. Converting tensor formats can significantly
1170
+ * improve performance on certain hardware.
1171
+ *
1172
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
1173
+ * @param offset Byte offset within the tensor data buffer where weights start.
1174
+ * @param device device id.
1175
+ *
1176
+ * @note The workspace buffer used in this function is managed globally and reused
1177
+ * across calls. This reduces overhead from repeated memory allocation and deallocation.
1178
+ */
1179
+ static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
1180
+ acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
1181
+ uint64_t workspaceSize = 0;
1182
+ aclOpExecutor * executor;
1183
+
1184
+ // TransMatmulWeight
1185
+ ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed.get(), &workspaceSize, &executor));
1186
+ // Avoid frequent malloc/free of the workspace.
1187
+ g_nz_workspaces[device].realloc(workspaceSize);
1188
+
1189
+ void * g_nz_workspace = g_nz_workspaces[device].get();
1190
+
1191
+ ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
1192
+ }
1193
+
1118
1194
  // TODO: need handle tensor which has paddings.
1119
1195
  /**
1120
1196
  * @brief Set tensor data in a CANN buffer.
@@ -1128,27 +1204,32 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1128
1204
  * @param offset Offset in the source data from where to start copying.
1129
1205
  * @param size Size of the data to be copied, in bytes.
1130
1206
  */
1131
- static void ggml_backend_cann_buffer_set_tensor(
1132
- ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
1133
- size_t offset, size_t size) {
1134
- ggml_backend_cann_buffer_context *ctx =
1135
- (ggml_backend_cann_buffer_context *)buffer->context;
1207
+ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
1208
+ ggml_tensor * tensor,
1209
+ const void * data,
1210
+ size_t offset,
1211
+ size_t size) {
1212
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1136
1213
 
1137
1214
  ggml_cann_set_device(ctx->device);
1138
1215
  // TODO: refer to cann(#6017), it use thread's default stream.
1139
1216
  // For acl, synchronous functions use this default stream.
1140
1217
  // Why aclrtSynchronizeDevice?
1141
1218
 
1219
+ // Only check env once.
1220
+ static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
1142
1221
  if (!need_transform(tensor->type)) {
1143
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
1144
- ACL_MEMCPY_HOST_TO_DEVICE));
1222
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
1223
+ if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
1224
+ GGML_ASSERT(tensor->ne[2] == 1);
1225
+ GGML_ASSERT(tensor->ne[3] == 1);
1226
+ weight_format_to_nz(tensor, offset, ctx->device);
1227
+ }
1145
1228
  } else {
1146
- void *transform_buffer = malloc(size);
1229
+ void * transform_buffer = malloc(size);
1147
1230
  ggml_backend_cann_transform(tensor, data, transform_buffer);
1148
1231
 
1149
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
1150
- transform_buffer, size,
1151
- ACL_MEMCPY_HOST_TO_DEVICE));
1232
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
1152
1233
  free(transform_buffer);
1153
1234
  }
1154
1235
  }
@@ -1166,22 +1247,20 @@ static void ggml_backend_cann_buffer_set_tensor(
1166
1247
  * @param offset Offset in the destination buffer where to start copying.
1167
1248
  * @param size Size of the data to be copied, in bytes.
1168
1249
  */
1169
- static void ggml_backend_cann_buffer_get_tensor(
1170
- ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
1171
- size_t offset, size_t size) {
1172
- ggml_backend_cann_buffer_context* ctx =
1173
- (ggml_backend_cann_buffer_context*)buffer->context;
1250
+ static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
1251
+ const ggml_tensor * tensor,
1252
+ void * data,
1253
+ size_t offset,
1254
+ size_t size) {
1255
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1174
1256
 
1175
1257
  ggml_cann_set_device(ctx->device);
1176
1258
 
1177
1259
  if (!need_transform(tensor->type)) {
1178
- ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
1179
- ACL_MEMCPY_DEVICE_TO_HOST));
1260
+ ACL_CHECK(aclrtMemcpy(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
1180
1261
  } else {
1181
- void* transform_buffer = malloc(size);
1182
- ACL_CHECK(aclrtMemcpy(transform_buffer, size,
1183
- (char*)tensor->data + offset, size,
1184
- ACL_MEMCPY_DEVICE_TO_HOST));
1262
+ void * transform_buffer = malloc(size);
1263
+ ACL_CHECK(aclrtMemcpy(transform_buffer, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
1185
1264
  ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1186
1265
  free(transform_buffer);
1187
1266
  }
@@ -1200,31 +1279,31 @@ static void ggml_backend_cann_buffer_get_tensor(
1200
1279
  * @param dst Pointer to the destination tensor where the data will be copied.
1201
1280
  * @return true if the copy operation succeeded, false otherwise.
1202
1281
  */
1203
- static bool ggml_backend_cann_buffer_cpy_tensor(
1204
- ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
1282
+ static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
1283
+ const ggml_tensor * src,
1284
+ ggml_tensor * dst) {
1205
1285
  if (ggml_backend_buffer_is_cann(src->buffer)) {
1206
- ggml_backend_cann_buffer_context* src_ctx =
1207
- (ggml_backend_cann_buffer_context*)src->buffer->context;
1208
- ggml_backend_cann_buffer_context* dst_ctx =
1209
- (ggml_backend_cann_buffer_context*)buffer->context;
1286
+ ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
1287
+ ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1210
1288
 
1211
1289
  size_t memcpy_size = ggml_nbytes(src);
1212
1290
  // Same device.
1213
1291
  if (src_ctx->device == dst_ctx->device) {
1214
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1215
- (const char*)src->data, memcpy_size,
1292
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
1216
1293
  ACL_MEMCPY_DEVICE_TO_DEVICE));
1217
1294
  return true;
1218
1295
  } else {
1296
+ #ifdef ASCEND_310P
1297
+ // TODO: Support 310p P2P copy
1298
+ return false;
1299
+ #endif
1219
1300
  // Different device but can access by peer.
1220
1301
  int32_t canAccessPeer = 0;
1221
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
1222
- dst_ctx->device));
1302
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, dst_ctx->device));
1223
1303
  if (canAccessPeer) {
1224
1304
  ggml_cann_set_device(src_ctx->device);
1225
1305
  ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
1226
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1227
- (const char*)src->data, memcpy_size,
1306
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
1228
1307
  ACL_MEMCPY_DEVICE_TO_DEVICE));
1229
1308
  return true;
1230
1309
  }
@@ -1242,10 +1321,8 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
1242
1321
  * @param buffer The CANN buffer to be cleared.
1243
1322
  * @param value The value to which each byte in the buffer will be set.
1244
1323
  */
1245
- static void ggml_backend_cann_buffer_clear(
1246
- ggml_backend_buffer_t buffer, uint8_t value) {
1247
- ggml_backend_cann_buffer_context* ctx =
1248
- (ggml_backend_cann_buffer_context*)buffer->context;
1324
+ static void ggml_backend_cann_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1325
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1249
1326
 
1250
1327
  ggml_cann_set_device(ctx->device);
1251
1328
  ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
@@ -1275,9 +1352,8 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
1275
1352
  * buffer type.
1276
1353
  */
1277
1354
  struct ggml_backend_cann_buffer_type_context {
1278
- int32_t
1279
- device; /**< Device identifier associated with the buffer context. */
1280
- std::string name; /**< Name associated with the buffer context. */
1355
+ int32_t device; /**< Device identifier associated with the buffer context. */
1356
+ std::string name; /**< Name associated with the buffer context. */
1281
1357
  };
1282
1358
 
1283
1359
  /**
@@ -1289,10 +1365,8 @@ struct ggml_backend_cann_buffer_type_context {
1289
1365
  * @param buft Pointer to the buffer type context.
1290
1366
  * @return Const pointer to the C-style string containing the name.
1291
1367
  */
1292
- static const char* ggml_backend_cann_buffer_type_name(
1293
- ggml_backend_buffer_type_t buft) {
1294
- ggml_backend_cann_buffer_type_context* buft_ctx =
1295
- (ggml_backend_cann_buffer_type_context*)buft->context;
1368
+ static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
1369
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
1296
1370
 
1297
1371
  return buft_ctx->name.c_str();
1298
1372
  }
@@ -1307,34 +1381,27 @@ static const char* ggml_backend_cann_buffer_type_name(
1307
1381
  * @param size Size in bytes of the buffer to allocate.
1308
1382
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
1309
1383
  */
1310
- static ggml_backend_buffer_t
1311
- ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1312
- size_t size) {
1313
- ggml_backend_cann_buffer_type_context* buft_ctx =
1314
- (ggml_backend_cann_buffer_type_context*)buft->context;
1384
+ static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1385
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
1315
1386
 
1316
1387
  ggml_cann_set_device(buft_ctx->device);
1317
1388
 
1318
1389
  const size_t alignment = 128;
1319
- size = GGML_PAD(size, alignment);
1390
+ size = GGML_PAD(size, alignment);
1320
1391
  if (size == 0) {
1321
1392
  size = alignment;
1322
1393
  }
1323
- void* dev_ptr;
1394
+ void * dev_ptr;
1324
1395
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1325
1396
  if (err != ACL_SUCCESS) {
1326
- GGML_LOG_ERROR(
1327
- "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
1328
- __func__, size / 1024.0 / 1024.0, buft_ctx->device,
1329
- aclGetRecentErrMsg());
1397
+ GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__,
1398
+ size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg());
1330
1399
  return nullptr;
1331
1400
  }
1332
1401
 
1333
- ggml_backend_cann_buffer_context* ctx =
1334
- new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
1402
+ ggml_backend_cann_buffer_context * ctx = new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
1335
1403
 
1336
- return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
1337
- ctx, size);
1404
+ return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size);
1338
1405
  }
1339
1406
 
1340
1407
  /**
@@ -1349,8 +1416,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1349
1416
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
1350
1417
  * buffers).
1351
1418
  */
1352
- static size_t ggml_backend_cann_buffer_type_get_alignment(
1353
- ggml_backend_buffer_type_t buft) {
1419
+ static size_t ggml_backend_cann_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1354
1420
  return 128;
1355
1421
 
1356
1422
  GGML_UNUSED(buft);
@@ -1370,10 +1436,13 @@ static size_t ggml_backend_cann_buffer_type_get_alignment(
1370
1436
  * @return The total allocation size in bytes required for the tensor in the
1371
1437
  * CANN buffer.
1372
1438
  */
1373
- static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1374
- ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
1375
- size_t size = ggml_nbytes(tensor);
1376
- int64_t ne0 = tensor->ne[0];
1439
+ static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
1440
+ const ggml_tensor * tensor) {
1441
+ size_t size = ggml_nbytes(tensor);
1442
+ int64_t ne0 = tensor->ne[0];
1443
+
1444
+ // Only check env once.
1445
+ static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
1377
1446
 
1378
1447
  // last line must bigger than 32, because every single op deal at
1379
1448
  // least 32 bytes.
@@ -1381,14 +1450,21 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1381
1450
  // int64_t line_size = ne0 * ggml_element_size(tensor);
1382
1451
  // int64_t line_size_align_32 = (line_size + 31) & ~31;
1383
1452
  // size += (line_size_align_32 - line_size);
1384
-
1385
- // TODO: not support quantized yet.
1386
- // TODO: consider un-continue tensor.
1387
1453
  if (ggml_is_quantized(tensor->type)) {
1388
1454
  if (ne0 % MATRIX_ROW_PADDING != 0) {
1389
- size += ggml_row_size(
1390
- tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1455
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1391
1456
  }
1457
+ } else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
1458
+ // NZ format weight are not support quantized yet.
1459
+ // If ND tensor transform to NZ, size may changed.
1460
+ int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
1461
+ GGML_ASSERT(tensor->ne[2] == 1);
1462
+ GGML_ASSERT(tensor->ne[3] == 1);
1463
+ const aclIntArray * acl_shape = aclCreateIntArray(shape, 2);
1464
+ size_t new_size;
1465
+ ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, ggml_cann_type_mapping(tensor->type), &new_size));
1466
+ ACL_CHECK(aclDestroyIntArray(acl_shape));
1467
+ size = std::max(size, new_size);
1392
1468
  }
1393
1469
 
1394
1470
  return size;
@@ -1427,17 +1503,15 @@ static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface
1427
1503
  * @return A pointer to the buffer type interface for the specified device, or
1428
1504
  * nullptr if the device index is out of range.
1429
1505
  */
1430
- ggml_backend_buffer_type_t
1431
- ggml_backend_cann_buffer_type(int32_t device) {
1432
- static std::mutex mutex;
1506
+ ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) {
1507
+ static std::mutex mutex;
1433
1508
  std::lock_guard<std::mutex> lock(mutex);
1434
1509
 
1435
1510
  if (device >= ggml_backend_cann_get_device_count()) {
1436
1511
  return nullptr;
1437
1512
  }
1438
1513
 
1439
- static ggml_backend_buffer_type
1440
- ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
1514
+ static ggml_backend_buffer_type ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
1441
1515
 
1442
1516
  static bool ggml_backend_cann_buffer_type_initialized = false;
1443
1517
 
@@ -1447,8 +1521,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
1447
1521
  /* .iface = */ ggml_backend_cann_buffer_type_interface,
1448
1522
  /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
1449
1523
  /* .context = */
1450
- new ggml_backend_cann_buffer_type_context{
1451
- i, "CANN" + std::to_string(i)},
1524
+ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i) },
1452
1525
  };
1453
1526
  }
1454
1527
  ggml_backend_cann_buffer_type_initialized = true;
@@ -1512,16 +1585,16 @@ static void * ggml_cann_host_malloc(size_t size) {
1512
1585
  }
1513
1586
 
1514
1587
  const size_t alignment = 128;
1515
- size = GGML_PAD(size, alignment);
1588
+ size = GGML_PAD(size, alignment);
1516
1589
  if (size == 0) {
1517
1590
  size = alignment;
1518
1591
  }
1519
1592
 
1520
- void * hostPtr = nullptr;
1521
- aclError err = aclrtMallocHost((void **) &hostPtr, size);
1593
+ void * hostPtr = nullptr;
1594
+ aclError err = aclrtMallocHost((void **) &hostPtr, size);
1522
1595
  if (err != ACL_SUCCESS) {
1523
- GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1524
- size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1596
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0,
1597
+ aclGetRecentErrMsg());
1525
1598
  return nullptr;
1526
1599
  }
1527
1600
  return hostPtr;
@@ -1534,7 +1607,8 @@ static void * ggml_cann_host_malloc(size_t size) {
1534
1607
  * @param size Size in bytes of the host buffer to allocate.
1535
1608
  * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
1536
1609
  */
1537
- static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1610
+ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1611
+ size_t size) {
1538
1612
  void * hostPtr = ggml_cann_host_malloc(size);
1539
1613
 
1540
1614
  if (hostPtr == nullptr) {
@@ -1543,8 +1617,8 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
1543
1617
  }
1544
1618
 
1545
1619
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
1546
- buffer->buft = buft;
1547
- buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1620
+ buffer->buft = buft;
1621
+ buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1548
1622
 
1549
1623
  return buffer;
1550
1624
  }
@@ -1558,14 +1632,15 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
1558
1632
  ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1559
1633
  static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
1560
1634
  /* .iface = */ {
1561
- /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1562
- /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1563
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1564
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1635
+ /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1636
+ /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1637
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1638
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1565
1639
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1566
- /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1567
- },
1568
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1640
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1641
+ },
1642
+ /* .device = */
1643
+ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1569
1644
  /* .context = */ nullptr,
1570
1645
  };
1571
1646
 
@@ -1585,8 +1660,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1585
1660
  * stored.
1586
1661
  * @return true if the computation was successful; false otherwise.
1587
1662
  */
1588
- static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1589
- struct ggml_tensor* dst) {
1663
+ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) {
1590
1664
  switch (dst->op) {
1591
1665
  case GGML_OP_REPEAT:
1592
1666
  ggml_cann_repeat(ctx, dst);
@@ -1594,6 +1668,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1594
1668
  case GGML_OP_GET_ROWS:
1595
1669
  ggml_cann_get_rows(ctx, dst);
1596
1670
  break;
1671
+ case GGML_OP_SET_ROWS:
1672
+ ggml_cann_set_rows(ctx, dst);
1673
+ break;
1597
1674
  case GGML_OP_DUP:
1598
1675
  ggml_cann_dup(ctx, dst);
1599
1676
  break;
@@ -1616,48 +1693,50 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1616
1693
  case GGML_OP_UNARY:
1617
1694
  switch (ggml_get_unary_op(dst)) {
1618
1695
  case GGML_UNARY_OP_ABS:
1619
- GGML_CANN_CALL_UNARY_OP(Abs);
1696
+ GGML_CANN_CALL_OP_UNARY(Abs);
1620
1697
  break;
1621
1698
  case GGML_UNARY_OP_NEG:
1622
- GGML_CANN_CALL_UNARY_OP(Neg);
1699
+ GGML_CANN_CALL_OP_UNARY(Neg);
1623
1700
  break;
1624
1701
  case GGML_UNARY_OP_GELU:
1625
- GGML_CANN_CALL_UNARY_OP(Gelu);
1702
+ case GGML_UNARY_OP_GELU_ERF:
1703
+ // aclnnGelu internally uses the erf-based approximation.
1704
+ GGML_CANN_CALL_OP_UNARY(Gelu);
1626
1705
  break;
1627
1706
  case GGML_UNARY_OP_SILU:
1628
- GGML_CANN_CALL_UNARY_OP(Silu);
1707
+ GGML_CANN_CALL_OP_UNARY(Silu);
1708
+ break;
1709
+ case GGML_UNARY_OP_GELU_QUICK:
1710
+ {
1711
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1712
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1713
+ };
1714
+ ggml_cann_op_unary(lambda, ctx, dst);
1715
+ }
1629
1716
  break;
1630
- case GGML_UNARY_OP_GELU_QUICK: {
1631
- auto lambda = [](ggml_backend_cann_context& ctx,
1632
- aclTensor* acl_src,
1633
- aclTensor* acl_dst) {
1634
- GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1635
- };
1636
- ggml_cann_unary_op(lambda, ctx, dst);
1637
- } break;
1638
1717
  case GGML_UNARY_OP_TANH:
1639
- GGML_CANN_CALL_UNARY_OP(Tanh);
1718
+ GGML_CANN_CALL_OP_UNARY(Tanh);
1640
1719
  break;
1641
1720
  case GGML_UNARY_OP_RELU:
1642
- GGML_CANN_CALL_UNARY_OP(Relu);
1721
+ GGML_CANN_CALL_OP_UNARY(Relu);
1643
1722
  break;
1644
1723
  case GGML_UNARY_OP_SIGMOID:
1645
- GGML_CANN_CALL_UNARY_OP(Sigmoid);
1724
+ GGML_CANN_CALL_OP_UNARY(Sigmoid);
1646
1725
  break;
1647
1726
  case GGML_UNARY_OP_HARDSIGMOID:
1648
- GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
1727
+ GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
1649
1728
  break;
1650
1729
  case GGML_UNARY_OP_HARDSWISH:
1651
- GGML_CANN_CALL_UNARY_OP(Hardswish);
1730
+ GGML_CANN_CALL_OP_UNARY(Hardswish);
1652
1731
  break;
1653
1732
  case GGML_UNARY_OP_EXP:
1654
- GGML_CANN_CALL_UNARY_OP(Exp);
1733
+ GGML_CANN_CALL_OP_UNARY(Exp);
1655
1734
  break;
1656
1735
  case GGML_UNARY_OP_ELU:
1657
1736
  ggml_cann_elu(ctx, dst);
1658
1737
  break;
1659
1738
  case GGML_UNARY_OP_SGN:
1660
- GGML_CANN_CALL_UNARY_OP(Sign);
1739
+ GGML_CANN_CALL_OP_UNARY(Sign);
1661
1740
  break;
1662
1741
  case GGML_UNARY_OP_STEP:
1663
1742
  ggml_cann_step(ctx, dst);
@@ -1666,12 +1745,43 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1666
1745
  return false;
1667
1746
  }
1668
1747
  break;
1748
+ case GGML_OP_GLU:
1749
+ switch (ggml_get_glu_op(dst)) {
1750
+ case GGML_GLU_OP_REGLU:
1751
+ GGML_CANN_CALL_OP_UNARY_GATED(Relu);
1752
+ break;
1753
+ case GGML_GLU_OP_GEGLU:
1754
+ case GGML_GLU_OP_GEGLU_ERF:
1755
+ // aclnnGelu internally uses the erf-based approximation.
1756
+ GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
1757
+ break;
1758
+ case GGML_GLU_OP_SWIGLU:
1759
+ GGML_CANN_CALL_OP_UNARY_GATED(Silu);
1760
+ break;
1761
+ case GGML_GLU_OP_GEGLU_QUICK:
1762
+ {
1763
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1764
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1765
+ };
1766
+ ggml_cann_op_unary_gated(lambda, ctx, dst);
1767
+ }
1768
+ break;
1769
+ default:
1770
+ return false;
1771
+ }
1772
+ break;
1669
1773
  case GGML_OP_NORM:
1670
1774
  ggml_cann_norm(ctx, dst);
1671
1775
  break;
1672
1776
  case GGML_OP_GROUP_NORM:
1673
1777
  ggml_cann_group_norm(ctx, dst);
1674
1778
  break;
1779
+ case GGML_OP_L2_NORM:
1780
+ ggml_cann_l2_norm(ctx, dst);
1781
+ break;
1782
+ case GGML_OP_CROSS_ENTROPY_LOSS:
1783
+ ggml_cann_cross_entropy_loss(ctx, dst);
1784
+ break;
1675
1785
  case GGML_OP_CONCAT:
1676
1786
  ggml_cann_concat(ctx, dst);
1677
1787
  break;
@@ -1708,7 +1818,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1708
1818
  ggml_cann_binary_op<aclnn_mul>(ctx, dst);
1709
1819
  break;
1710
1820
  case GGML_OP_SQRT:
1711
- GGML_CANN_CALL_UNARY_OP(Sqrt);
1821
+ GGML_CANN_CALL_OP_UNARY(Sqrt);
1712
1822
  break;
1713
1823
  case GGML_OP_CLAMP:
1714
1824
  ggml_cann_clamp(ctx, dst);
@@ -1753,16 +1863,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1753
1863
  ggml_cann_argmax(ctx, dst);
1754
1864
  break;
1755
1865
  case GGML_OP_COS:
1756
- ggml_cann_unary_op<aclnn_cos>(ctx, dst);
1866
+ ggml_cann_op_unary<aclnn_cos>(ctx, dst);
1757
1867
  break;
1758
1868
  case GGML_OP_SIN:
1759
- ggml_cann_unary_op<aclnn_sin>(ctx, dst);
1869
+ ggml_cann_op_unary<aclnn_sin>(ctx, dst);
1760
1870
  break;
1761
1871
  case GGML_OP_CONV_TRANSPOSE_1D:
1762
1872
  ggml_cann_conv_transpose_1d(ctx, dst);
1763
1873
  break;
1764
1874
  case GGML_OP_LOG:
1765
- GGML_CANN_CALL_UNARY_OP(Log);
1875
+ GGML_CANN_CALL_OP_UNARY(Log);
1766
1876
  break;
1767
1877
  case GGML_OP_MEAN:
1768
1878
  ggml_cann_mean(ctx, dst);
@@ -1776,6 +1886,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1776
1886
  case GGML_OP_FLASH_ATTN_EXT:
1777
1887
  ggml_cann_flash_attn_ext(ctx, dst);
1778
1888
  break;
1889
+ case GGML_OP_OUT_PROD:
1890
+ ggml_cann_out_prod(ctx, dst);
1891
+ break;
1892
+ case GGML_OP_SSM_CONV:
1893
+ ggml_cann_ssm_conv(ctx, dst);
1894
+ break;
1779
1895
  default:
1780
1896
  return false;
1781
1897
  }
@@ -1793,9 +1909,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1793
1909
  * @param backend Pointer to the CANN backend structure.
1794
1910
  * @return A pointer to a constant string representing the backend name.
1795
1911
  */
1796
- static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1797
- ggml_backend_cann_context* cann_ctx =
1798
- (ggml_backend_cann_context*)backend->context;
1912
+ static const char * ggml_backend_cann_name(ggml_backend_t backend) {
1913
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1799
1914
 
1800
1915
  return cann_ctx->name.c_str();
1801
1916
  }
@@ -1809,8 +1924,7 @@ static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1809
1924
  * @param backend Pointer to the CANN backend structure to be freed.
1810
1925
  */
1811
1926
  static void ggml_backend_cann_free(ggml_backend_t backend) {
1812
- ggml_backend_cann_context* cann_ctx =
1813
- (ggml_backend_cann_context*)backend->context;
1927
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1814
1928
  ACL_CHECK(aclrtSynchronizeDevice());
1815
1929
  ACL_CHECK(aclrtResetDevice(cann_ctx->device));
1816
1930
 
@@ -1818,7 +1932,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1818
1932
  delete backend;
1819
1933
  }
1820
1934
 
1821
-
1822
1935
  /**
1823
1936
  * @brief Sets tensor data asynchronously in the CANN backend.
1824
1937
  *
@@ -1831,21 +1944,18 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1831
1944
  * @param size Size of the data to copy in bytes.
1832
1945
  */
1833
1946
  static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1834
- ggml_tensor *tensor,
1835
- const void *data,
1836
- size_t offset,
1837
- size_t size) {
1838
- ggml_backend_cann_context *cann_ctx =
1839
- (ggml_backend_cann_context *)backend->context;
1840
- ggml_backend_buffer_t buf =
1841
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1842
-
1843
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1844
- "unsupported buffer type");
1947
+ ggml_tensor * tensor,
1948
+ const void * data,
1949
+ size_t offset,
1950
+ size_t size) {
1951
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1952
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1953
+
1954
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
1845
1955
  GGML_ASSERT(!ggml_is_quantized(tensor->type));
1846
1956
 
1847
- ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
1848
- ACL_MEMCPY_HOST_TO_DEVICE);
1957
+ ACL_CHECK(aclrtMemcpyAsync((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE,
1958
+ cann_ctx->stream()));
1849
1959
  }
1850
1960
 
1851
1961
  /**
@@ -1859,21 +1969,19 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1859
1969
  * @param offset Offset in bytes within the host data.
1860
1970
  * @param size Size of the data to copy in bytes.
1861
1971
  */
1862
- static void ggml_backend_cann_get_tensor_async(
1863
- ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1864
- size_t offset, size_t size) {
1865
- ggml_backend_cann_context *cann_ctx =
1866
- (ggml_backend_cann_context *)backend->context;
1867
- ggml_backend_buffer_t buf =
1868
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1869
-
1870
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1871
- "unsupported buffer type");
1972
+ static void ggml_backend_cann_get_tensor_async(ggml_backend_t backend,
1973
+ const ggml_tensor * tensor,
1974
+ void * data,
1975
+ size_t offset,
1976
+ size_t size) {
1977
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1978
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1979
+
1980
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
1872
1981
  GGML_ASSERT(!ggml_is_quantized(tensor->type));
1873
1982
 
1874
- ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
1875
- ACL_MEMCPY_DEVICE_TO_HOST);
1876
-
1983
+ ACL_CHECK(aclrtMemcpyAsync(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST,
1984
+ cann_ctx->stream()));
1877
1985
  }
1878
1986
 
1879
1987
  /**
@@ -1889,62 +1997,67 @@ static void ggml_backend_cann_get_tensor_async(
1889
1997
  * @param dst Pointer to the destination tensor to copy data to.
1890
1998
  * @return true if the copy operation succeeds, false otherwise.
1891
1999
  */
1892
- static bool ggml_backend_cann_cpy_tensor_async(
1893
- ggml_backend_t backend_src, ggml_backend_t backend_dst,
1894
- const ggml_tensor* src, ggml_tensor* dst) {
1895
- GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
1896
- ggml_backend_is_cann(backend_dst));
2000
+ static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
2001
+ ggml_backend_t backend_dst,
2002
+ const ggml_tensor * src,
2003
+ ggml_tensor * dst) {
2004
+ GGML_ASSERT(ggml_backend_is_cann(backend_src) || ggml_backend_is_cann(backend_dst));
2005
+
2006
+ GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
1897
2007
 
1898
- if (!ggml_backend_buffer_is_cann(src->buffer) ||
1899
- !ggml_backend_buffer_is_cann(dst->buffer)) {
2008
+ if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) {
1900
2009
  return false;
1901
2010
  }
1902
2011
 
1903
- ggml_backend_buffer_t buf_src =
1904
- src->view_src ? src->view_src->buffer : src->buffer;
1905
- ggml_backend_buffer_t buf_dst =
1906
- dst->view_src ? dst->view_src->buffer : dst->buffer;
2012
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2013
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
1907
2014
 
1908
- ggml_backend_cann_context* cann_ctx_src =
1909
- (ggml_backend_cann_context*)backend_src->context;
1910
- ggml_backend_cann_context* cann_ctx_dst =
1911
- (ggml_backend_cann_context*)backend_dst->context;
2015
+ ggml_backend_cann_context * cann_ctx_src = (ggml_backend_cann_context *) backend_src->context;
2016
+ ggml_backend_cann_context * cann_ctx_dst = (ggml_backend_cann_context *) backend_dst->context;
1912
2017
 
1913
2018
  size_t copy_size = ggml_nbytes(dst);
2019
+ if (copy_size == 0) {
2020
+ return true;
2021
+ }
1914
2022
  if (backend_src != backend_dst) {
1915
- ggml_backend_cann_buffer_context* buf_ctx_src =
1916
- (ggml_backend_cann_buffer_context*)buf_src->context;
1917
- ggml_backend_cann_buffer_context* buf_ctx_dst =
1918
- (ggml_backend_cann_buffer_context*)buf_dst->context;
2023
+ #ifdef ASCEND_310P
2024
+ // TODO: Support 310p P2P copy
2025
+ return false;
2026
+ #endif
2027
+ ggml_backend_cann_buffer_context * buf_ctx_src = (ggml_backend_cann_buffer_context *) buf_src->context;
2028
+ ggml_backend_cann_buffer_context * buf_ctx_dst = (ggml_backend_cann_buffer_context *) buf_dst->context;
1919
2029
 
1920
2030
  GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
1921
2031
  GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
1922
2032
 
1923
2033
  int32_t canAccessPeer = 0;
1924
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device,
1925
- cann_ctx_dst->device));
2034
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, cann_ctx_dst->device));
1926
2035
  if (!canAccessPeer) {
1927
2036
  return false;
1928
2037
  }
1929
2038
 
1930
2039
  // need open both directions for memcpyasync between devices.
1931
- ggml_cann_set_device(cann_ctx_dst->device);
1932
2040
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
1933
2041
  ggml_cann_set_device(cann_ctx_src->device);
1934
2042
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
1935
2043
 
1936
2044
  // wait for task_queue empty to keep task order.
1937
- cann_ctx_src->task_queue.wait();
1938
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1939
- ACL_MEMCPY_DEVICE_TO_DEVICE,
2045
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
1940
2046
  cann_ctx_src->stream()));
1941
-
1942
- //TODO: workaround for Event didn`t work here.
1943
- aclrtSynchronizeStream(cann_ctx_src->stream());
2047
+ // record event on src stream after the copy
2048
+ // TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
2049
+ // if (!cann_ctx_src->copy_event) {
2050
+ // ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
2051
+ // }
2052
+ // ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
2053
+
2054
+ // // wait on dst stream for the copy to complete
2055
+ // ggml_cann_set_device(cann_ctx_dst->device);
2056
+ // ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
2057
+ ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
1944
2058
  } else {
1945
2059
  // src and dst are on the same backend
1946
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
1947
- ACL_MEMCPY_DEVICE_TO_DEVICE,
2060
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
1948
2061
  cann_ctx_dst->stream()));
1949
2062
  }
1950
2063
 
@@ -1960,13 +2073,110 @@ static bool ggml_backend_cann_cpy_tensor_async(
1960
2073
  * @param backend Pointer to the CANN backend structure to synchronize.
1961
2074
  */
1962
2075
  static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1963
- ggml_backend_cann_context* cann_ctx =
1964
- (ggml_backend_cann_context*)backend->context;
1965
- cann_ctx->task_queue.wait();
2076
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1966
2077
  ggml_cann_set_device(cann_ctx->device);
1967
2078
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1968
2079
  }
1969
2080
 
2081
+ /**
2082
+ * @brief Check if CANN backend can fuse the specified operation sequence
2083
+ *
2084
+ * This function determines whether an operation sequence starting from the specified node
2085
+ * can be fused into an optimized operation in the CANN backend. Operation fusion can reduce
2086
+ * memory access overhead and improve computational efficiency.
2087
+ *
2088
+ * @param cgraph Pointer to the computation graph
2089
+ * @param node_idx Index of the starting node in the computation graph
2090
+ * @param ops Sequence of operation types to check for fusion
2091
+ * @return true if the operations can be fused
2092
+ * @return false if the operations cannot be fused
2093
+ */
2094
+ static bool ggml_cann_can_fuse(const struct ggml_cgraph * cgraph,
2095
+ int node_idx,
2096
+ std::initializer_list<enum ggml_op> ops) {
2097
+ if (!ggml_can_fuse(cgraph, node_idx, ops)) {
2098
+ return false;
2099
+ }
2100
+
2101
+ // CANN backend supports fusing ADD + RMS_NORM operations
2102
+ if ((ops.size() == 2) && ops.begin()[0] == GGML_OP_ADD && ops.begin()[1] == GGML_OP_RMS_NORM) {
2103
+ ggml_tensor * add_node = cgraph->nodes[node_idx];
2104
+ // TODO: support broadcast for ADD + RMS_NORM
2105
+ if (add_node->src[0]->ne[0] != add_node->src[1]->ne[0] || add_node->src[0]->ne[1] != add_node->src[1]->ne[1] ||
2106
+ add_node->src[0]->ne[2] != add_node->src[1]->ne[2] || add_node->src[0]->ne[3] != add_node->src[1]->ne[3]) {
2107
+ return false;
2108
+ }
2109
+ return true;
2110
+ }
2111
+
2112
+ return false;
2113
+ }
2114
+
2115
+ /**
2116
+ * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
2117
+ *
2118
+ * If CANN graph execution is enabled and graph capture is required, this function begins
2119
+ * graph capture, runs the graph, ends capture, and stores the captured graph.
2120
+ *
2121
+ * Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
2122
+ *
2123
+ * @param cann_ctx The CANN backend context.
2124
+ * @param cgraph The ggml computation graph.
2125
+ * @param use_cann_graph Whether to use CANN graph execution.
2126
+ * @param cann_graph_capture_required Whether graph capture is needed due to graph changes.
2127
+ */
2128
+ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
2129
+ ggml_cgraph * cgraph,
2130
+ bool use_cann_graph,
2131
+ bool cann_graph_capture_required) {
2132
+ #ifdef USE_ACL_GRAPH
2133
+ if (use_cann_graph && cann_graph_capture_required) { // Begin CANN graph capture
2134
+ ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
2135
+ }
2136
+ #endif // USE_ACL_GRAPH
2137
+ // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
2138
+ // With the use of CANN graphs, the execution will be performed by the graph launch.
2139
+ static bool opt_fusion = parse_bool(get_env_as_lowercase("GGML_CANN_OPERATOR_FUSION").value_or(""));
2140
+
2141
+ if (!use_cann_graph || cann_graph_capture_required) {
2142
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2143
+ ggml_tensor * node = cgraph->nodes[i];
2144
+ if (opt_fusion) {
2145
+ if (ggml_cann_can_fuse(cgraph, i, { GGML_OP_ADD, GGML_OP_RMS_NORM })) {
2146
+ ggml_cann_op_add_rms_norm_fused(*cann_ctx, node, cgraph->nodes[i + 1]);
2147
+ i++;
2148
+ continue;
2149
+ }
2150
+ }
2151
+
2152
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
2153
+ node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2154
+ continue;
2155
+ }
2156
+
2157
+ bool ok = ggml_cann_compute_forward(*cann_ctx, node);
2158
+ if (!ok) {
2159
+ GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
2160
+ }
2161
+ GGML_ASSERT(ok);
2162
+ }
2163
+ }
2164
+
2165
+ #ifdef USE_ACL_GRAPH
2166
+ if (use_cann_graph) {
2167
+ GGML_ASSERT(!cann_ctx->graph_lru_cache.cache_list.empty());
2168
+ ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
2169
+
2170
+ if (cann_graph_capture_required) { // End CANN graph capture
2171
+ ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
2172
+ }
2173
+
2174
+ // Execute CANN graph
2175
+ ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
2176
+ }
2177
+ #endif // USE_ACL_GRAPH
2178
+ }
2179
+
1970
2180
  /**
1971
2181
  * @brief Computes a computational graph using a CANN backend.
1972
2182
  *
@@ -1979,28 +2189,50 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1979
2189
  * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
1980
2190
  * completes successfully, otherwise an appropriate error status.
1981
2191
  */
1982
- static enum ggml_status ggml_backend_cann_graph_compute(
1983
- ggml_backend_t backend, ggml_cgraph* cgraph) {
1984
- ggml_backend_cann_context* cann_ctx =
1985
- (ggml_backend_cann_context*)backend->context;
1986
-
2192
+ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2193
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1987
2194
  ggml_cann_set_device(cann_ctx->device);
1988
-
1989
- for (int i = 0; i < cgraph->n_nodes; i++) {
1990
- ggml_tensor* node = cgraph->nodes[i];
1991
-
1992
- if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
1993
- continue;
2195
+ g_nz_workspaces[cann_ctx->device].clear();
2196
+
2197
+ // calculate rope cache for fist layer in current device.
2198
+ cann_ctx->rope_cache.cached = false;
2199
+
2200
+ bool graph_capture_required = false;
2201
+ #ifdef USE_ACL_GRAPH
2202
+ bool use_cann_graph = true;
2203
+
2204
+ static bool prefill_use_graph = parse_bool(get_env_as_lowercase("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
2205
+ if (!prefill_use_graph) {
2206
+ // Do not use acl_graph for prefill.
2207
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2208
+ ggml_tensor * node = cgraph->nodes[i];
2209
+ // TODO: Optimize here. Currently, we can only
2210
+ // get seq_len by FA's input.
2211
+ if (node->op == GGML_OP_FLASH_ATTN_EXT) {
2212
+ // Q -> src[0], shape: [B, S, N, D]
2213
+ use_cann_graph = (node->src[0]->ne[1] == 1);
2214
+ break;
2215
+ }
1994
2216
  }
2217
+ }
1995
2218
 
1996
- bool ok = ggml_cann_compute_forward(*cann_ctx, node);
2219
+ if (!cann_ctx->acl_graph_mode) {
2220
+ use_cann_graph = false;
2221
+ }
1997
2222
 
1998
- if (!ok) {
1999
- GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
2000
- node->name, ggml_op_name(node->op));
2223
+ if (use_cann_graph) {
2224
+ // If no matching graph is found, the graph needs to be recaptured.
2225
+ graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
2226
+ if (graph_capture_required) {
2227
+ // If no matching graph is found, add a new ACL graph.
2228
+ ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
2229
+ cann_ctx->graph_lru_cache.push(new_graph);
2001
2230
  }
2002
- GGML_ASSERT(ok);
2003
2231
  }
2232
+ #else
2233
+ bool use_cann_graph = false;
2234
+ #endif // USE_ACL_GRAPH
2235
+ evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, graph_capture_required);
2004
2236
 
2005
2237
  return GGML_STATUS_SUCCESS;
2006
2238
  }
@@ -2017,8 +2249,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
2017
2249
  * @return bool Returns true if the operation is supported by the backend,
2018
2250
  * otherwise false.
2019
2251
  */
2020
- static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2021
- const ggml_tensor* op) {
2252
+ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2022
2253
  switch (op->op) {
2023
2254
  case GGML_OP_UNARY:
2024
2255
  switch (ggml_get_unary_op(op)) {
@@ -2036,28 +2267,41 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2036
2267
  case GGML_UNARY_OP_ELU:
2037
2268
  case GGML_UNARY_OP_SGN:
2038
2269
  case GGML_UNARY_OP_STEP:
2270
+ case GGML_UNARY_OP_GELU_ERF:
2039
2271
  return true;
2040
2272
  default:
2041
2273
  return false;
2042
2274
  }
2043
- case GGML_OP_MUL_MAT: {
2044
- switch (op->src[0]->type) {
2045
- case GGML_TYPE_F16:
2046
- case GGML_TYPE_F32:
2275
+ case GGML_OP_GLU:
2276
+ switch (ggml_get_glu_op(op)) {
2277
+ case GGML_GLU_OP_REGLU:
2278
+ case GGML_GLU_OP_GEGLU:
2279
+ case GGML_GLU_OP_SWIGLU:
2280
+ case GGML_GLU_OP_GEGLU_ERF:
2281
+ case GGML_GLU_OP_GEGLU_QUICK:
2047
2282
  return true;
2048
- case GGML_TYPE_Q8_0:
2049
- case GGML_TYPE_Q4_0:
2050
- #ifdef ASCEND_310P
2051
- // Q4 && Q8 per group is not suppor on 310p device
2052
- return false;
2053
- #endif
2054
- // only support contiguous for quantized types.
2055
- return ggml_is_contiguous(op->src[0]) &&
2056
- ggml_is_contiguous(op->src[1]);
2057
2283
  default:
2058
2284
  return false;
2059
2285
  }
2060
- }
2286
+ break;
2287
+ case GGML_OP_MUL_MAT:
2288
+ {
2289
+ switch (op->src[0]->type) {
2290
+ case GGML_TYPE_F16:
2291
+ case GGML_TYPE_F32:
2292
+ return true;
2293
+ case GGML_TYPE_Q8_0:
2294
+ case GGML_TYPE_Q4_0:
2295
+ #ifdef ASCEND_310P
2296
+ // Q4 && Q8 per group is not support on 310p device
2297
+ return false;
2298
+ #endif
2299
+ // only support contiguous for quantized types.
2300
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2301
+ default:
2302
+ return false;
2303
+ }
2304
+ }
2061
2305
  case GGML_OP_MUL_MAT_ID:
2062
2306
  switch (op->src[0]->type) {
2063
2307
  case GGML_TYPE_F16:
@@ -2066,106 +2310,112 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2066
2310
  case GGML_TYPE_Q8_0:
2067
2311
  case GGML_TYPE_Q4_0:
2068
2312
  #ifdef ASCEND_310P
2069
- // Q4 && Q8 per group is not suppor on 310p device
2313
+ // Q4 && Q8 per group is not support on 310p device
2070
2314
  return false;
2071
2315
  #endif
2072
2316
  // only support contiguous for quantized types.
2073
- return ggml_is_contiguous(op->src[0]) &&
2074
- ggml_is_contiguous(op->src[1]);
2317
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2075
2318
  default:
2076
2319
  return false;
2077
2320
  }
2078
2321
  // embedding
2079
- case GGML_OP_GET_ROWS: {
2080
- switch (op->src[0]->type) {
2081
- case GGML_TYPE_F32:
2082
- case GGML_TYPE_F16:
2083
- case GGML_TYPE_Q8_0:
2084
- return true;
2085
- default:
2086
- return false;
2087
- }
2088
- } break;
2089
- case GGML_OP_CPY: {
2090
- ggml_tensor *src = op->src[0];
2091
- if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2092
- (src->type != GGML_TYPE_F32 &&
2093
- src->type != GGML_TYPE_F16)) {
2094
- // only support F32 and F16.
2095
- return false;
2322
+ case GGML_OP_GET_ROWS:
2323
+ {
2324
+ switch (op->src[0]->type) {
2325
+ case GGML_TYPE_F32:
2326
+ case GGML_TYPE_F16:
2327
+ case GGML_TYPE_Q8_0:
2328
+ return true;
2329
+ default:
2330
+ return false;
2331
+ }
2096
2332
  }
2097
-
2098
- if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
2099
- // unsupport dst is not contiguous.
2100
- return false;
2333
+ break;
2334
+ case GGML_OP_SET_ROWS:
2335
+ {
2336
+ switch (op->type) {
2337
+ case GGML_TYPE_F32:
2338
+ case GGML_TYPE_F16:
2339
+ return true;
2340
+ default:
2341
+ return false;
2342
+ }
2101
2343
  }
2102
-
2103
- return true;
2104
- } break;
2105
- case GGML_OP_CONT: {
2106
- // TODO: support GGML_TYPE_BF16
2107
- switch (op->src[0]->type) {
2108
- case GGML_TYPE_F32:
2109
- case GGML_TYPE_F16:
2110
- return true;
2111
- default:
2344
+ break;
2345
+ case GGML_OP_CPY:
2346
+ {
2347
+ ggml_tensor * src = op->src[0];
2348
+ if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2349
+ (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
2350
+ // only support F32 and F16.
2112
2351
  return false;
2352
+ }
2353
+ return true;
2113
2354
  }
2114
- }
2115
- case GGML_OP_ROPE: {
2116
- // TODO: with ops-test v == 1
2117
- float ext_factor = 0.0f;
2118
- memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
2119
- // TODO: n_dims <= ne0
2120
- if (op->src[0]->ne[0] != op->op_params[1]) {
2121
- return false;
2122
- }
2123
- // TODO: ext_factor != 0
2124
- if (ext_factor != 0) {
2125
- return false;
2126
- }
2127
-
2128
- const int mode = ((const int32_t *) op->op_params)[2];
2129
- if (mode & GGML_ROPE_TYPE_MROPE) {
2130
- return false;
2131
- }
2132
- if (mode & GGML_ROPE_TYPE_VISION) {
2133
- return false;
2134
- }
2135
-
2136
- if(!ggml_is_contiguous(op->src[0])){
2137
- return false;
2355
+ break;
2356
+ case GGML_OP_CONT:
2357
+ {
2358
+ // TODO: support GGML_TYPE_BF16
2359
+ switch (op->src[0]->type) {
2360
+ case GGML_TYPE_F32:
2361
+ case GGML_TYPE_F16:
2362
+ return true;
2363
+ default:
2364
+ return false;
2365
+ }
2138
2366
  }
2139
- return true;
2140
- }
2141
- case GGML_OP_UPSCALE: {
2142
- // aclnnUpsampleNearest2dGetWorkspaceSize not support
2143
- // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
2144
- if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
2145
- return false;
2367
+ case GGML_OP_ROPE:
2368
+ {
2369
+ if (op->src[0]->ne[0] > 896) {
2370
+ return false;
2371
+ }
2372
+ #ifdef ASCEND_310P
2373
+ // TODO: Support rope_dim < ne00(dim)
2374
+ if (op->src[0]->ne[0] != op->op_params[1]) {
2375
+ return false;
2376
+ }
2377
+ if (!ggml_is_contiguous(op->src[0])) {
2378
+ return false;
2379
+ }
2380
+ #endif
2381
+ return true;
2146
2382
  }
2147
- if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2148
- return false;
2383
+ case GGML_OP_UPSCALE:
2384
+ {
2385
+ // aclnnUpsampleNearest2dGetWorkspaceSize not support
2386
+ // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
2387
+ if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
2388
+ return false;
2389
+ }
2390
+ if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2391
+ return false;
2392
+ }
2393
+ if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) {
2394
+ return false;
2395
+ }
2396
+ return true;
2149
2397
  }
2150
- return true;
2151
- }
2152
- case GGML_OP_POOL_2D: {
2153
- const int32_t * opts = (const int32_t *) op->op_params;
2398
+ case GGML_OP_POOL_2D:
2399
+ {
2400
+ const int32_t * opts = (const int32_t *) op->op_params;
2154
2401
  #ifdef ASCEND_310P
2155
- enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2156
- if(opt == GGML_OP_POOL_MAX){
2157
- return false;
2158
- }
2402
+ enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2403
+ if (opt == GGML_OP_POOL_MAX) {
2404
+ return false;
2405
+ }
2159
2406
  #endif
2160
- const int k0 = opts[1];
2161
- const int k1 = opts[2];
2162
- const int p0 = opts[5];
2163
- const int p1 = opts[6];
2164
- // value of paddingH should be at most half of kernelH
2165
- // value of paddingW should be at most half of kernelW
2166
- return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2167
- }
2407
+ const int k0 = opts[1];
2408
+ const int k1 = opts[2];
2409
+ const int p0 = opts[5];
2410
+ const int p1 = opts[6];
2411
+ // value of paddingH should be at most half of kernelH
2412
+ // value of paddingW should be at most half of kernelW
2413
+ return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2414
+ }
2168
2415
  case GGML_OP_SUM:
2416
+ return ggml_is_contiguous_rows(op->src[0]);
2417
+ case GGML_OP_L2_NORM:
2418
+ case GGML_OP_CROSS_ENTROPY_LOSS:
2169
2419
  case GGML_OP_DUP:
2170
2420
  case GGML_OP_IM2COL:
2171
2421
  case GGML_OP_CONCAT:
@@ -2182,61 +2432,93 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2182
2432
  case GGML_OP_MUL:
2183
2433
  case GGML_OP_DIV:
2184
2434
  case GGML_OP_RMS_NORM:
2185
- case GGML_OP_SCALE:
2186
2435
  case GGML_OP_SQR:
2187
2436
  case GGML_OP_SQRT:
2188
2437
  case GGML_OP_CLAMP:
2189
2438
  case GGML_OP_DIAG_MASK_INF:
2190
- case GGML_OP_SOFT_MAX:
2191
2439
  case GGML_OP_SUM_ROWS:
2192
2440
  case GGML_OP_ARGSORT:
2193
2441
  case GGML_OP_ACC:
2194
2442
  case GGML_OP_GROUP_NORM:
2443
+ return true;
2195
2444
  case GGML_OP_PAD:
2445
+ // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
2446
+ return ggml_get_op_params_i32(op, 8) == 0;
2196
2447
  case GGML_OP_ARANGE:
2197
2448
  case GGML_OP_TIMESTEP_EMBEDDING:
2198
2449
  case GGML_OP_LEAKY_RELU:
2199
2450
  case GGML_OP_ARGMAX:
2200
2451
  case GGML_OP_COS:
2201
2452
  case GGML_OP_SIN:
2202
- case GGML_OP_CONV_TRANSPOSE_1D:
2203
2453
  case GGML_OP_LOG:
2204
2454
  case GGML_OP_MEAN:
2205
2455
  case GGML_OP_PAD_REFLECT_1D:
2206
2456
  case GGML_OP_COUNT_EQUAL:
2207
2457
  return true;
2208
- case GGML_OP_FLASH_ATTN_EXT:{
2209
- // derived from [ggml-cuda.cu]
2210
- if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
2211
- return false;
2212
- }
2213
- if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
2214
- return false;
2215
- }
2216
- if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
2217
- return false;
2218
- }
2219
- if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2220
- // different head sizes of K and V are not supported yet
2221
- return false;
2222
- }
2223
- if (op->src[0]->ne[0] == 192) {
2224
- return false;
2225
- }
2226
- if (op->src[0]->ne[0] == 576) {
2227
- // DeepSeek MLA
2458
+ case GGML_OP_OUT_PROD:
2459
+ {
2460
+ #ifdef ASCEND_310P
2461
+ // Ger is not supported on 310p device
2228
2462
  return false;
2463
+ #endif
2464
+ switch (op->src[0]->type) {
2465
+ case GGML_TYPE_F16:
2466
+ case GGML_TYPE_F32:
2467
+ return true;
2468
+ default:
2469
+ return false;
2470
+ }
2229
2471
  }
2230
- if (op->src[0]->ne[3] != 1) {
2472
+ case GGML_OP_CONV_TRANSPOSE_1D:
2473
+ return true;
2474
+ case GGML_OP_SCALE:
2475
+ float bias;
2476
+ memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
2477
+ return bias == 0.0f; // TODO: support bias != 0.0f
2478
+ case GGML_OP_SOFT_MAX:
2479
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
2480
+ if (op->src[2]) {
2231
2481
  return false;
2232
2482
  }
2233
- float logitSoftcap = 0.0f;
2234
- memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
2235
- if(logitSoftcap != 0.0f) {
2483
+ return true;
2484
+ case GGML_OP_FLASH_ATTN_EXT:
2485
+ {
2486
+ #ifdef ASCEND_310P
2487
+ // FA not support on 310p device
2236
2488
  return false;
2489
+ #endif
2490
+ // derived from [ggml-cuda.cu]
2491
+ if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
2492
+ return false;
2493
+ }
2494
+ if (op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 &&
2495
+ op->src[1]->type != GGML_TYPE_BF16) {
2496
+ return false;
2497
+ }
2498
+ if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
2499
+ return false;
2500
+ }
2501
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
2502
+ if (op->src[4]) {
2503
+ return false;
2504
+ }
2505
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2506
+ // different head sizes of K and V are not supported yet
2507
+ return false;
2508
+ }
2509
+ if (op->src[0]->ne[0] % 16 != 0) {
2510
+ // TODO: padding to support
2511
+ return false;
2512
+ }
2513
+ float logitSoftcap = 0.0f;
2514
+ memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
2515
+ if (logitSoftcap != 0.0f) {
2516
+ return false;
2517
+ }
2518
+ return true;
2237
2519
  }
2520
+ case GGML_OP_SSM_CONV:
2238
2521
  return true;
2239
- }
2240
2522
  default:
2241
2523
  return false;
2242
2524
  }
@@ -2259,28 +2541,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
2259
2541
  return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
2260
2542
  }
2261
2543
 
2262
- /**
2263
- * @brief Determines if a tensor operation should be offloaded to the CANN
2264
- * backend.
2265
- *
2266
- * This function checks if a given tensor operation should be offloaded to the
2267
- * CANN backend based on the operation type and the size of the tensor. It
2268
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
2269
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
2270
- *
2271
- * @param backend Pointer to the CANN backend.
2272
- * @param op Pointer to the tensor operation to check.
2273
- * @return bool Returns true if the operation should be offloaded, otherwise
2274
- * false.
2275
- */
2276
- static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
2277
- const ggml_tensor* op) {
2278
- const int min_batch_size = 32;
2279
- GGML_UNUSED(dev);
2280
-
2281
- return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
2282
- }
2283
-
2284
2544
  /**
2285
2545
  * @brief Records an event on the CANN backend stream.
2286
2546
  *
@@ -2290,9 +2550,8 @@ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
2290
2550
  * @param event Pointer to the event structure to be recorded.
2291
2551
  */
2292
2552
  static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
2293
- ggml_backend_cann_context* cann_ctx =
2294
- (ggml_backend_cann_context*)backend->context;
2295
- ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
2553
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2554
+ ACL_CHECK(aclrtRecordEvent((aclrtEvent) event->context, cann_ctx->stream()));
2296
2555
  }
2297
2556
 
2298
2557
  /**
@@ -2305,13 +2564,10 @@ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_
2305
2564
  * @param event Pointer to the event structure that the backend needs to wait
2306
2565
  * for.
2307
2566
  */
2308
- static void ggml_backend_cann_event_wait(ggml_backend_t backend,
2309
- ggml_backend_event_t event) {
2310
- ggml_backend_cann_context* cann_ctx =
2311
- (ggml_backend_cann_context*)backend->context;
2567
+ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
2568
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2312
2569
  if (ggml_backend_is_cann(backend)) {
2313
- ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
2314
- (aclrtEvent)event->context));
2570
+ ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent) event->context));
2315
2571
  } else {
2316
2572
  GGML_ABORT("fatal error");
2317
2573
  }
@@ -2338,6 +2594,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
2338
2594
  /* .graph_compute = */ ggml_backend_cann_graph_compute,
2339
2595
  /* .event_record = */ ggml_backend_cann_event_record,
2340
2596
  /* .event_wait = */ ggml_backend_cann_event_wait,
2597
+ /* .graph_optimize = */ NULL,
2341
2598
  };
2342
2599
 
2343
2600
  /**
@@ -2349,30 +2606,31 @@ static const ggml_backend_i ggml_backend_cann_interface = {
2349
2606
  * @return A pointer to the static GUID.
2350
2607
  */
2351
2608
  static ggml_guid_t ggml_backend_cann_guid() {
2352
- static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
2353
- 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64};
2609
+ static ggml_guid guid = { 0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
2610
+ 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64 };
2354
2611
  return &guid;
2355
2612
  }
2356
2613
 
2357
2614
  // backend device
2358
2615
  struct ggml_backend_cann_device_context {
2359
- int device;
2616
+ int device;
2360
2617
  std::string name;
2361
2618
  std::string description;
2619
+ int op_offload_min_batch_size;
2362
2620
  };
2363
2621
 
2364
2622
  static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
2365
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2623
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2366
2624
  return ctx->name.c_str();
2367
2625
  }
2368
2626
 
2369
- static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
2370
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2627
+ static const char * ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
2628
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2371
2629
  return ctx->description.c_str();
2372
2630
  }
2373
2631
 
2374
2632
  static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2375
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2633
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2376
2634
  ggml_backend_cann_get_device_memory(ctx->device, free, total);
2377
2635
  }
2378
2636
 
@@ -2399,7 +2657,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
2399
2657
 
2400
2658
  static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
2401
2659
  GGML_UNUSED(params);
2402
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2660
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2403
2661
  return ggml_backend_cann_init(ctx->device);
2404
2662
  }
2405
2663
 
@@ -2416,19 +2674,17 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
2416
2674
  * @return bool Returns true if the CANN backend supports the buffer type,
2417
2675
  * otherwise false.
2418
2676
  */
2419
- static bool ggml_backend_cann_supports_buft(
2420
- ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2677
+ static bool ggml_backend_cann_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2421
2678
  if (ggml_backend_buft_is_cann(buft)) {
2422
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2423
- ggml_backend_cann_buffer_type_context * buft_ctx =
2424
- (ggml_backend_cann_buffer_type_context *)buft->context;
2679
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
2680
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
2425
2681
  return buft_ctx->device == dev_ctx->device;
2426
2682
  }
2427
2683
  return false;
2428
2684
  }
2429
2685
 
2430
2686
  static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
2431
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2687
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2432
2688
  return ggml_backend_cann_buffer_type(ctx->device);
2433
2689
  }
2434
2690
 
@@ -2437,6 +2693,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
2437
2693
  return ggml_backend_cann_host_buffer_type();
2438
2694
  }
2439
2695
 
2696
+ /**
2697
+ * @brief Determines if a tensor operation should be offloaded to the CANN
2698
+ * backend.
2699
+ *
2700
+ * This function checks if a given tensor operation should be offloaded to the
2701
+ * CANN backend based on the operation type and the size of the tensor. It
2702
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
2703
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
2704
+ *
2705
+ * @param backend Pointer to the CANN backend.
2706
+ * @param op Pointer to the tensor operation to check.
2707
+ * @return bool Returns true if the operation should be offloaded, otherwise
2708
+ * false.
2709
+ */
2710
+ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2711
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2712
+
2713
+ return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
2714
+ }
2715
+
2440
2716
  /**
2441
2717
  * @brief Creates a new event for the CANN backend device.
2442
2718
  *
@@ -2447,9 +2723,8 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
2447
2723
  * @param backend Pointer to the CANN backend.
2448
2724
  * @return ggml_backend_event_t Returns a pointer to the new event structure.
2449
2725
  */
2450
- static ggml_backend_event_t ggml_backend_cann_device_event_new(
2451
- ggml_backend_dev_t dev) {
2452
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2726
+ static ggml_backend_event_t ggml_backend_cann_device_event_new(ggml_backend_dev_t dev) {
2727
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
2453
2728
 
2454
2729
  ggml_cann_set_device(dev_ctx->device);
2455
2730
 
@@ -2471,7 +2746,7 @@ static ggml_backend_event_t ggml_backend_cann_device_event_new(
2471
2746
  * @param event Pointer to the event structure to be freed.
2472
2747
  */
2473
2748
  static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2474
- ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
2749
+ ACL_CHECK(aclrtDestroyEvent((aclrtEvent) event->context));
2475
2750
 
2476
2751
  delete event;
2477
2752
  GGML_UNUSED(dev);
@@ -2485,7 +2760,7 @@ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_bac
2485
2760
  * @param event Pointer to the event structure to be synchronized.
2486
2761
  */
2487
2762
  static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2488
- ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
2763
+ ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent) event->context));
2489
2764
 
2490
2765
  GGML_UNUSED(dev);
2491
2766
  }
@@ -2496,10 +2771,10 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2496
2771
  /* .get_memory = */ ggml_backend_cann_device_get_memory,
2497
2772
  /* .get_type = */ ggml_backend_cann_device_get_type,
2498
2773
  /* .get_props = */ ggml_backend_cann_device_get_props,
2499
- /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2774
+ /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2500
2775
  /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
2501
2776
  /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
2502
- /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2777
+ /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2503
2778
  /* .supports_op = */ ggml_backend_cann_supports_op,
2504
2779
  /* .supports_buft = */ ggml_backend_cann_supports_buft,
2505
2780
  /* .offload_op = */ ggml_backend_cann_offload_op,
@@ -2508,7 +2783,6 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2508
2783
  /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
2509
2784
  };
2510
2785
 
2511
-
2512
2786
  // backend reg
2513
2787
  struct ggml_backend_cann_reg_context {
2514
2788
  std::vector<ggml_backend_dev_t> devices;
@@ -2520,12 +2794,12 @@ static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
2520
2794
  }
2521
2795
 
2522
2796
  static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
2523
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2797
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
2524
2798
  return ctx->devices.size();
2525
2799
  }
2526
2800
 
2527
2801
  static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
2528
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2802
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
2529
2803
  GGML_ASSERT(index < ctx->devices.size());
2530
2804
  return ctx->devices[index];
2531
2805
  }
@@ -2547,34 +2821,32 @@ static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2547
2821
  // backend registry, called only once for cann backend
2548
2822
  ggml_backend_reg_t ggml_backend_cann_reg() {
2549
2823
  static ggml_backend_reg reg;
2550
- static bool initialized = false;
2824
+ static bool initialized = false;
2551
2825
 
2552
2826
  {
2553
- static std::mutex mutex;
2827
+ static std::mutex mutex;
2554
2828
  std::lock_guard<std::mutex> lock(mutex);
2555
2829
  if (!initialized) {
2556
2830
  aclInit(nullptr);
2557
2831
  ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
2832
+ const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
2558
2833
 
2559
2834
  for (int i = 0; i < ggml_cann_info().device_count; i++) {
2560
- ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
2561
- dev_ctx->description = aclrtGetSocName();
2562
- dev_ctx->device = i;
2563
- dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2835
+ ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
2836
+ dev_ctx->description = aclrtGetSocName();
2837
+ dev_ctx->device = i;
2838
+ dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2839
+ dev_ctx->op_offload_min_batch_size = min_batch_size;
2564
2840
  ggml_cann_set_device(i);
2565
- ggml_backend_dev_t dev = new ggml_backend_device {
2566
- /* .iface = */ ggml_backend_cann_device_interface,
2567
- /* .reg = */ &reg,
2568
- /* .context = */ dev_ctx
2569
- };
2841
+ ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
2842
+ /* .reg = */ &reg,
2843
+ /* .context = */ dev_ctx };
2570
2844
  ctx->devices.push_back(dev);
2571
2845
  }
2572
2846
 
2573
- reg = ggml_backend_reg {
2574
- /* .api_version = */ GGML_BACKEND_API_VERSION,
2575
- /* .iface = */ ggml_backend_cann_reg_interface,
2576
- /* .context = */ ctx
2577
- };
2847
+ reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
2848
+ /* .iface = */ ggml_backend_cann_reg_interface,
2849
+ /* .context = */ ctx };
2578
2850
  }
2579
2851
 
2580
2852
  initialized = true;
@@ -2590,39 +2862,36 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
2590
2862
  return nullptr;
2591
2863
  }
2592
2864
 
2593
- ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
2865
+ ggml_backend_cann_context * ctx = new ggml_backend_cann_context(device);
2594
2866
  if (ctx == nullptr) {
2595
2867
  GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
2596
2868
  return nullptr;
2597
2869
  }
2598
2870
  ggml_cann_set_device(ctx->device);
2599
2871
  ggml_backend_t cann_backend =
2600
- new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
2601
- /* .interface = */ ggml_backend_cann_interface,
2602
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
2603
- /* .context = */ ctx};
2872
+ new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(),
2873
+ /* .interface = */ ggml_backend_cann_interface,
2874
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
2875
+ /* .context = */ ctx };
2604
2876
 
2605
2877
  return cann_backend;
2606
2878
  }
2607
2879
 
2608
2880
  bool ggml_backend_is_cann(ggml_backend_t backend) {
2609
- return backend != NULL &&
2610
- ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2881
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2611
2882
  }
2612
2883
 
2613
2884
  int32_t ggml_backend_cann_get_device_count() {
2614
2885
  return ggml_cann_info().device_count;
2615
2886
  }
2616
2887
 
2617
- void ggml_backend_cann_get_device_description(
2618
- int32_t device, char* description, size_t description_size) {
2888
+ void ggml_backend_cann_get_device_description(int32_t device, char * description, size_t description_size) {
2619
2889
  ggml_cann_set_device(device);
2620
- const char* soc_name = aclrtGetSocName();
2890
+ const char * soc_name = aclrtGetSocName();
2621
2891
  snprintf(description, description_size, "%s", soc_name);
2622
2892
  }
2623
2893
 
2624
- void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2625
- size_t* total) {
2894
+ void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total) {
2626
2895
  ggml_cann_set_device(device);
2627
2896
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
2628
2897
  }