whispercpp 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (891)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +158 -44
  4. data/ext/extconf.rb +3 -2
  5. data/ext/ruby_whisper.c +34 -6
  6. data/ext/ruby_whisper.h +67 -0
  7. data/ext/ruby_whisper_context.c +236 -144
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +12 -13
  10. data/ext/ruby_whisper_params.c +47 -24
  11. data/ext/ruby_whisper_segment.c +84 -20
  12. data/ext/ruby_whisper_token.c +371 -0
  13. data/ext/ruby_whisper_transcribe.cpp +5 -2
  14. data/ext/ruby_whisper_vad_context.c +122 -0
  15. data/ext/ruby_whisper_vad_context_detect.cpp +51 -0
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +138 -0
  18. data/ext/ruby_whisper_vad_segments.c +105 -0
  19. data/ext/sources/CMakeLists.txt +4 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  22. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  23. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  24. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  25. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  26. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  27. data/ext/sources/examples/bench/bench.cpp +23 -18
  28. data/ext/sources/examples/cli/cli.cpp +129 -112
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  31. data/ext/sources/examples/miniaudio.h +4507 -2131
  32. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  33. data/ext/sources/examples/server/server.cpp +28 -15
  34. data/ext/sources/examples/talk-llama/CMakeLists.txt +8 -3
  35. data/ext/sources/examples/talk-llama/llama-adapter.cpp +5 -2
  36. data/ext/sources/examples/talk-llama/llama-adapter.h +7 -0
  37. data/ext/sources/examples/talk-llama/llama-arch.cpp +2378 -1988
  38. data/ext/sources/examples/talk-llama/llama-arch.h +109 -2
  39. data/ext/sources/examples/talk-llama/llama-batch.cpp +78 -34
  40. data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
  41. data/ext/sources/examples/talk-llama/llama-chat.cpp +100 -4
  42. data/ext/sources/examples/talk-llama/llama-chat.h +5 -0
  43. data/ext/sources/examples/talk-llama/llama-context.cpp +1088 -403
  44. data/ext/sources/examples/talk-llama/llama-context.h +70 -23
  45. data/ext/sources/examples/talk-llama/llama-cparams.h +6 -0
  46. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  47. data/ext/sources/examples/talk-llama/llama-grammar.cpp +295 -60
  48. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  49. data/ext/sources/examples/talk-llama/llama-graph.cpp +925 -155
  50. data/ext/sources/examples/talk-llama/llama-graph.h +234 -23
  51. data/ext/sources/examples/talk-llama/llama-hparams.cpp +79 -38
  52. data/ext/sources/examples/talk-llama/llama-hparams.h +118 -18
  53. data/ext/sources/examples/talk-llama/llama-impl.cpp +11 -7
  54. data/ext/sources/examples/talk-llama/llama-impl.h +14 -2
  55. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +8 -4
  56. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +405 -140
  57. data/ext/sources/examples/talk-llama/llama-kv-cache.h +24 -10
  58. data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
  59. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  60. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  61. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
  62. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +42 -31
  63. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
  64. data/ext/sources/examples/talk-llama/llama-mmap.cpp +197 -45
  65. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  66. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +606 -116
  67. data/ext/sources/examples/talk-llama/llama-model-loader.h +41 -5
  68. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +61 -44
  69. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  70. data/ext/sources/examples/talk-llama/llama-model.cpp +2756 -13643
  71. data/ext/sources/examples/talk-llama/llama-model.h +112 -18
  72. data/ext/sources/examples/talk-llama/llama-quant.cpp +582 -365
  73. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +1409 -199
  74. data/ext/sources/examples/talk-llama/llama-sampler.h +42 -0
  75. data/ext/sources/examples/talk-llama/llama-vocab.cpp +248 -82
  76. data/ext/sources/examples/talk-llama/llama-vocab.h +50 -40
  77. data/ext/sources/examples/talk-llama/llama.cpp +802 -21
  78. data/ext/sources/examples/talk-llama/llama.h +210 -39
  79. data/ext/sources/examples/talk-llama/models/afmoe.cpp +190 -0
  80. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  81. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  82. data/ext/sources/examples/talk-llama/models/arctic.cpp +137 -0
  83. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  84. data/ext/sources/examples/talk-llama/models/baichuan.cpp +123 -0
  85. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +143 -0
  86. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +133 -0
  87. data/ext/sources/examples/talk-llama/models/bert.cpp +184 -0
  88. data/ext/sources/examples/talk-llama/models/bitnet.cpp +145 -0
  89. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  90. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  91. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  92. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  93. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  94. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  95. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  96. data/ext/sources/examples/talk-llama/models/dbrx.cpp +122 -0
  97. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  98. data/ext/sources/examples/talk-llama/models/deepseek.cpp +142 -0
  99. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +262 -0
  100. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  101. data/ext/sources/examples/talk-llama/models/dots1.cpp +132 -0
  102. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  103. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +148 -0
  104. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  105. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  106. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  107. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  108. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  109. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +111 -0
  110. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  111. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  112. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  113. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  114. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  115. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  116. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  117. data/ext/sources/examples/talk-llama/models/glm4.cpp +157 -0
  118. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  119. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  120. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +195 -0
  121. data/ext/sources/examples/talk-llama/models/granite.cpp +210 -0
  122. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  123. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +139 -0
  124. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  125. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +153 -0
  126. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  127. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  128. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  129. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  130. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  131. data/ext/sources/examples/talk-llama/models/lfm2.cpp +196 -0
  132. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  133. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  134. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  135. data/ext/sources/examples/talk-llama/models/llama.cpp +175 -0
  136. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  137. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +289 -0
  138. data/ext/sources/examples/talk-llama/models/mamba.cpp +54 -0
  139. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +129 -0
  140. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +200 -0
  141. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +123 -0
  142. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  143. data/ext/sources/examples/talk-llama/models/models.h +704 -0
  144. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +109 -0
  145. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  146. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +162 -0
  147. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  148. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  149. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  150. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  151. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  153. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  154. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  155. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  156. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  157. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  158. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  159. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  160. data/ext/sources/examples/talk-llama/models/plamo2.cpp +320 -0
  161. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  162. data/ext/sources/examples/talk-llama/models/plm.cpp +169 -0
  163. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  164. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  165. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  166. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  167. data/ext/sources/examples/talk-llama/models/qwen3.cpp +120 -0
  168. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  169. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  170. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +131 -0
  171. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +525 -0
  172. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +140 -0
  173. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +132 -0
  174. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  175. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  176. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +164 -0
  177. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  178. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  179. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +137 -0
  180. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  181. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  182. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  183. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  184. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  185. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  186. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  187. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  188. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  189. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  190. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  191. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  192. data/ext/sources/examples/talk-llama/unicode.cpp +121 -79
  193. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  194. data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
  195. data/ext/sources/ggml/CMakeLists.txt +90 -56
  196. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  197. data/ext/sources/ggml/include/ggml-backend.h +5 -2
  198. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  199. data/ext/sources/ggml/include/ggml-cpu.h +6 -0
  200. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  201. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  202. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  203. data/ext/sources/ggml/include/ggml-rpc.h +14 -12
  204. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  205. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  206. data/ext/sources/ggml/include/ggml.h +246 -21
  207. data/ext/sources/ggml/src/CMakeLists.txt +85 -11
  208. data/ext/sources/ggml/src/ggml-alloc.c +128 -50
  209. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  210. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  211. data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
  212. data/ext/sources/ggml/src/ggml-backend-reg.cpp +54 -88
  213. data/ext/sources/ggml/src/ggml-backend.cpp +76 -23
  214. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +18 -4
  215. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +11 -11
  216. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +58 -46
  217. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +139 -48
  218. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2427 -1785
  219. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -362
  220. data/ext/sources/ggml/src/ggml-cann/common.h +285 -211
  221. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +663 -831
  222. data/ext/sources/ggml/src/ggml-common.h +11 -0
  223. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +170 -95
  224. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -18
  225. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  226. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  227. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  228. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +513 -27
  229. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +4192 -992
  230. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  231. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  232. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1761 -49
  233. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  234. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  235. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  236. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  237. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +124 -24
  238. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +157 -28
  239. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  240. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  241. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -3
  242. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +251 -80
  243. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +19 -0
  244. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +587 -119
  245. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
  246. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1093 -194
  247. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1284 -203
  248. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  249. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1519 -527
  250. data/ext/sources/ggml/src/ggml-cpu/ops.h +6 -4
  251. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  252. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  253. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3632 -781
  254. data/ext/sources/ggml/src/ggml-cpu/repack.h +129 -4
  255. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  256. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +152 -46
  257. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
  258. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +152 -1
  259. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  260. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +140 -0
  261. data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
  262. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
  263. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  264. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +132 -6
  265. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  266. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +33 -31
  267. data/ext/sources/ggml/src/ggml-cuda/common.cuh +474 -85
  268. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  269. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  270. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  271. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +342 -246
  272. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  273. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  274. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  275. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  276. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +98 -74
  278. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +973 -665
  279. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
  280. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1255 -0
  281. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +33 -40
  282. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +40 -18
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +206 -45
  285. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  287. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  288. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  289. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1688 -302
  290. data/ext/sources/ggml/src/ggml-cuda/mean.cu +12 -10
  291. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +908 -48
  292. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +88 -20
  293. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +502 -90
  294. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  295. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  296. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
  297. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +532 -193
  298. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +460 -104
  299. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +5 -2
  300. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +360 -122
  301. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +2 -1
  302. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  303. data/ext/sources/ggml/src/ggml-cuda/pad.cu +73 -39
  304. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +152 -1
  305. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  306. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  307. data/ext/sources/ggml/src/ggml-cuda/rope.cu +364 -149
  308. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  309. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
  310. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  311. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  312. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +163 -41
  313. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  314. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  315. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +68 -50
  316. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  317. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
  318. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  320. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  321. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  322. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  323. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  324. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  326. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  328. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  329. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  330. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  332. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +22 -4
  334. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +95 -0
  335. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  336. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +275 -119
  337. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -7
  338. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  339. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  340. data/ext/sources/ggml/src/ggml-cuda/unary.cu +160 -11
  341. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +38 -0
  342. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
  343. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  344. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  345. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -1
  346. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
  347. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  348. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3325 -0
  349. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +46 -0
  350. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +813 -0
  351. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  352. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +891 -0
  353. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  354. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  355. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +713 -0
  356. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  357. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  358. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  359. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  360. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  361. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  362. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  363. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +155 -0
  364. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +63 -0
  365. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  366. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  367. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  368. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  369. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  370. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  371. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  372. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  373. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  374. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  375. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  376. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  377. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  378. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  379. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +26 -0
  380. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1199 -0
  381. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2670 -0
  382. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +497 -0
  383. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  384. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +419 -0
  385. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  386. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  387. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +382 -0
  388. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  389. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  390. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  391. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  392. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  393. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  394. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  395. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -13
  396. data/ext/sources/ggml/src/ggml-impl.h +129 -6
  397. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  398. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +15 -4
  399. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  400. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +173 -34
  401. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +912 -344
  402. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +124 -59
  403. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +588 -144
  404. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +396 -23
  405. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1724 -421
  406. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +16 -3
  407. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +333 -114
  408. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3050 -1539
  409. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  410. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +30 -1
  411. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4279 -497
  412. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  413. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  414. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  415. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +267 -0
  416. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  417. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  418. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  419. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  420. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
  421. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  422. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  423. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  424. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  425. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  426. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  427. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  428. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  429. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  430. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  431. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  432. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  433. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  434. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  435. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  436. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  437. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  438. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  439. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  440. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  441. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  442. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  443. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  444. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  445. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  446. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  447. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  448. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
  449. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  450. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  451. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  452. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  453. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  454. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  455. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  456. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +39 -0
  457. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  458. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  459. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  460. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  461. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  462. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  463. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  464. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  465. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  466. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  467. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  468. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  469. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  470. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  471. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  472. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  473. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  474. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  475. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  476. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  477. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  478. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  479. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  480. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  481. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  482. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  483. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  484. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  485. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  486. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  487. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  488. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  489. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  490. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  491. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  492. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  493. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  494. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  495. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  496. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  497. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  498. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  499. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  500. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  501. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  502. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  503. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  504. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  505. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  506. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  507. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
  508. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +59 -87
  509. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +81 -0
  510. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  511. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +7 -0
  512. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -29
  513. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
  514. data/ext/sources/ggml/src/ggml-sycl/common.hpp +427 -20
  515. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
  516. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +103 -1
  517. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  518. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  519. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  520. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
  521. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  522. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +867 -50
  523. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +401 -358
  524. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
  525. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  526. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  527. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  528. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  529. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  530. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  531. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  532. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +645 -155
  534. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
  535. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +221 -66
  536. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  537. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  538. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  539. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  540. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  541. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  542. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +5 -0
  543. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  544. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  545. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  547. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  548. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +457 -281
  549. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  550. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  551. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  552. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
  553. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  554. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  555. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  556. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  557. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  558. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  559. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  560. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  561. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  562. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  563. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  564. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  565. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  566. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  567. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  568. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  569. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  570. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  571. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  572. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  573. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  574. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  575. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  576. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  577. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  578. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  579. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  580. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  581. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  582. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  583. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  584. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  585. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  586. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  587. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  588. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  589. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  590. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  591. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  592. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  593. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  594. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  595. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  596. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  597. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  598. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  599. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  600. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  601. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +71 -0
  602. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  603. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  604. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  605. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  606. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  607. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  608. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  609. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  610. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  611. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  612. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  613. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  614. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  615. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  616. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  617. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  618. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  619. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  620. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  621. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  622. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  623. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  624. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  625. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  626. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  627. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  628. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  629. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  630. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  631. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  632. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  633. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  634. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  635. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  636. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  637. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  638. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  639. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  640. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  641. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  642. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  643. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  644. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +39 -19
  646. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5994 -3055
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +18 -10
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +386 -160
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +82 -20
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +400 -174
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +123 -37
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +10 -9
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +13 -10
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +77 -29
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  740. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  743. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  744. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  745. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  746. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  747. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  748. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  749. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
  750. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  751. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +88 -105
  752. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +41 -26
  753. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
  754. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  755. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +92 -230
  756. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  757. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  758. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  759. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  760. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  761. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  762. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  763. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
  764. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  765. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
  766. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  767. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  768. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  769. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  770. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -4
  771. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  772. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  773. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  774. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  775. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  776. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +8 -49
  777. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +8 -32
  778. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +8 -32
  779. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +33 -0
  780. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +8 -38
  781. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  782. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  783. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  784. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  785. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  786. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  787. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  788. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  789. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
  790. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  791. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  792. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  793. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  794. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  795. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  796. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  798. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  799. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  800. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  801. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  802. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
  803. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  804. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  805. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  806. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  807. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
  808. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  809. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  810. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  811. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  812. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  813. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
  814. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
  815. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +384 -180
  816. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  817. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
  818. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1374 -0
  819. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2544 -726
  820. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  821. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  822. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  823. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  824. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  825. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  826. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  827. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  828. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  829. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +73 -15
  830. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  831. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  832. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  833. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +72 -261
  834. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +766 -0
  835. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +147 -0
  836. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +196 -0
  837. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  838. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  839. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  840. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
  841. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  842. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  843. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  844. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  845. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  846. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  847. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  848. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  849. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  850. data/ext/sources/ggml/src/ggml.c +590 -64
  851. data/ext/sources/ggml/src/gguf.cpp +229 -44
  852. data/ext/sources/include/whisper.h +1 -0
  853. data/ext/sources/src/CMakeLists.txt +3 -1
  854. data/ext/sources/src/whisper.cpp +106 -62
  855. data/ext/sources/tests/CMakeLists.txt +2 -2
  856. data/ext/sources/tests/test-vad-full.cpp +4 -2
  857. data/ext/sources/tests/test-vad.cpp +1 -1
  858. data/extsources.rb +1 -0
  859. data/lib/whisper/model/uri.rb +17 -18
  860. data/sig/whisper.rbs +162 -4
  861. data/test/test_context_params.rb +82 -0
  862. data/test/test_params.rb +16 -8
  863. data/test/test_segment.rb +0 -1
  864. data/test/test_token.rb +81 -0
  865. data/test/test_vad.rb +1 -1
  866. data/test/test_vad_context.rb +100 -0
  867. data/test/test_vad_segment.rb +19 -0
  868. data/test/test_vad_segments.rb +16 -0
  869. data/test/test_whisper.rb +27 -0
  870. data/whispercpp.gemspec +1 -1
  871. metadata +502 -37
  872. data/ext/sources/build-xcframework.sh +0 -571
  873. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -32
  874. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  875. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  876. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  877. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  878. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
  879. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
  880. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  881. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  882. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
  883. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
  884. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
  885. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  886. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  887. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  888. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  889. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  890. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  891. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2023-2024 The ggml authors
2
+ * Copyright (c) 2023-2026 The ggml authors
3
3
  *
4
4
  * Permission is hereby granted, free of charge, to any person obtaining a copy
5
5
  * of this software and associated documentation files (the "Software"), to
@@ -22,24 +22,24 @@
22
22
 
23
23
  #include "ggml-cann.h"
24
24
 
25
+ #include "ggml-backend-impl.h"
26
+ #include "ggml-cann/aclnn_ops.h"
27
+ #include "ggml-cann/common.h"
28
+ #include "ggml-impl.h"
29
+ #include "ggml.h"
30
+
25
31
  #include <acl/acl.h>
26
- #include <stdarg.h>
27
32
  #include <aclnnop/aclnn_trans_matmul_weight.h>
33
+ #include <stdarg.h>
28
34
 
35
+ #include <chrono>
29
36
  #include <cmath>
30
37
  #include <cstdio>
31
38
  #include <cstring>
32
39
  #include <mutex>
40
+ #include <optional>
33
41
  #include <queue>
34
- #include <chrono>
35
42
  #include <unordered_set>
36
- #include <optional>
37
-
38
- #include "ggml-impl.h"
39
- #include "ggml-backend-impl.h"
40
- #include "ggml-cann/aclnn_ops.h"
41
- #include "ggml-cann/common.h"
42
- #include "ggml.h"
43
43
 
44
44
  #define GGML_COMMON_DECL_C
45
45
 
@@ -56,52 +56,52 @@
56
56
  * @param line The line number where the error occurred.
57
57
  * @param msg The error message.
58
58
  */
59
- [[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
60
- const char* file, int line, const char* msg) {
59
+ [[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
61
60
  int32_t id = -1;
62
61
  aclrtGetDevice(&id);
63
62
 
64
63
  GGML_LOG_ERROR("CANN error: %s\n", msg);
65
- GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
66
- file, line);
64
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
67
65
  GGML_LOG_ERROR(" %s\n", stmt);
68
66
  // abort with GGML_ASSERT to get a stack trace
69
67
  GGML_ABORT("CANN error");
70
68
  }
71
69
 
70
+ // Thread-local variable to record the current device of this thread.
71
+ thread_local int g_current_cann_device = -1;
72
+
72
73
  /**
73
- * @brief Sets the device to be used by CANN.
74
+ * @brief Set the CANN device to be used.
74
75
  *
75
- * @param device The device ID to set.
76
+ * @param device The target device ID to set.
76
77
  */
77
78
  void ggml_cann_set_device(const int32_t device) {
78
- int current_device = -1;
79
- aclrtGetDevice(&current_device);
79
+ // int current_device = -1;
80
+ // Note: In some CANN versions, if no device has been set yet,
81
+ // aclrtGetDevice(&current_device) may return 0 by default.
82
+ // aclrtGetDevice(&current_device);
80
83
 
81
- if (device == current_device) {
82
- return;
84
+ // If the current device is already the target one, no need to switch.
85
+ if (device == g_current_cann_device) {
86
+ return;
83
87
  }
88
+
89
+ // Switch to the new device.
84
90
  ACL_CHECK(aclrtSetDevice(device));
85
- }
86
91
 
87
- /**
88
- * @brief Retrieves the current device ID.
89
- *
90
- * @return The current device ID.
91
- */
92
- int32_t ggml_cann_get_device() {
93
- int32_t id;
94
- ACL_CHECK(aclrtGetDevice(&id));
95
- return id;
92
+ // Update the global device record.
93
+ g_current_cann_device = device;
96
94
  }
97
95
 
98
96
  /**
99
- * @brief Get the value of the specified environment variable (name).
97
+ * @brief Get the value of the specified environment variable (name) as lowercase.
100
98
  * if not empty, return a std::string object
101
99
  */
102
- std::optional<std::string> get_env(const std::string& name) {
103
- const char* val = std::getenv(name.c_str());
104
- if (!val) return std::nullopt;
100
+ std::optional<std::string> get_env_as_lowercase(const std::string & name) {
101
+ const char * val = std::getenv(name.c_str());
102
+ if (!val) {
103
+ return std::nullopt;
104
+ }
105
105
  std::string res = std::string(val);
106
106
  std::transform(res.begin(), res.end(), res.begin(), ::tolower);
107
107
  return res;
@@ -110,8 +110,8 @@ std::optional<std::string> get_env(const std::string& name) {
110
110
  /**
111
111
  * @brief Verify whether the environment variable is a valid value.
112
112
  */
113
- bool parse_bool(const std::string& value) {
114
- std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
113
+ bool parse_bool(const std::string & value) {
114
+ static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
115
115
  return valid_values.find(value) != valid_values.end();
116
116
  }
117
117
 
@@ -125,7 +125,7 @@ bool parse_bool(const std::string& value) {
125
125
  * @param value The string to parse.
126
126
  * @return The parsed integer, or 0 if conversion fails.
127
127
  */
128
- int parse_integer(const std::string& value) {
128
+ int parse_integer(const std::string & value) {
129
129
  try {
130
130
  return std::stoi(value);
131
131
  } catch (...) {
@@ -144,11 +144,10 @@ int parse_integer(const std::string& value) {
144
144
  static ggml_cann_device_info ggml_cann_init() {
145
145
  ggml_cann_device_info info = {};
146
146
 
147
- aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
147
+ aclError err = aclrtGetDeviceCount((uint32_t *) &info.device_count);
148
148
 
149
149
  if (err != ACL_SUCCESS) {
150
- GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
151
- __func__, aclGetRecentErrMsg());
150
+ GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg());
152
151
  return info;
153
152
  }
154
153
 
@@ -156,16 +155,15 @@ static ggml_cann_device_info ggml_cann_init() {
156
155
 
157
156
  for (int id = 0; id < info.device_count; ++id) {
158
157
  aclrtPhysicalMemProp prop = {};
159
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
160
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
161
- prop.memAttr = ACL_HBM_MEM_HUGE;
162
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
163
- prop.location.id = id;
164
- prop.reserve = 0;
165
- err = aclrtMemGetAllocationGranularity(
166
- &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
167
- &info.devices[id].vmm_granularity);
168
- info.devices[id].vmm = err == ACL_SUCCESS;
158
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
159
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
160
+ prop.memAttr = ACL_HBM_MEM_HUGE;
161
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
162
+ prop.location.id = id;
163
+ prop.reserve = 0;
164
+ err = aclrtMemGetAllocationGranularity(&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
165
+ &info.devices[id].vmm_granularity);
166
+ info.devices[id].vmm = err == ACL_SUCCESS;
169
167
 
170
168
  size_t free, total;
171
169
  ggml_backend_cann_get_device_memory(id, &free, &total);
@@ -185,7 +183,7 @@ static ggml_cann_device_info ggml_cann_init() {
185
183
  *
186
184
  * @return A reference to the structure containing the device information.
187
185
  */
188
- const ggml_cann_device_info& ggml_cann_info() {
186
+ const ggml_cann_device_info & ggml_cann_info() {
189
187
  static ggml_cann_device_info info = ggml_cann_init();
190
188
  return info;
191
189
  }
@@ -205,7 +203,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
205
203
  /**
206
204
  * @brief The minimum free margin for a buffer.
207
205
  */
208
- static const size_t min_free_margin = 1ull << 20; // 1MB
206
+ static const size_t min_free_margin = 1ull << 20; // 1MB
209
207
 
210
208
  /**
211
209
  * @brief The alignment for buffer allocation.
@@ -226,22 +224,18 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
226
224
  * @brief Structure representing a CANN buffer.
227
225
  */
228
226
  struct ggml_cann_buffer {
229
- void* ptr = nullptr; ///< Pointer to the buffer.
230
- size_t size = 0; ///< Size of the buffer.
231
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
227
+ void * ptr = nullptr; ///< Pointer to the buffer.
228
+ size_t size = 0; ///< Size of the buffer.
229
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
232
230
 
233
- bool operator>(const ggml_cann_buffer& other) const {
234
- return size > other.size;
235
- }
231
+ bool operator>(const ggml_cann_buffer & other) const { return size > other.size; }
236
232
  };
237
233
 
238
234
  /**
239
235
  * @brief Array of CANN buffers in the pool.
240
236
  */
241
- std::unordered_map<void*, size_t> buffer_pool;
242
- std::priority_queue<ggml_cann_buffer,
243
- std::vector<ggml_cann_buffer>,
244
- std::greater<>> free_buffers ;
237
+ std::unordered_map<void *, size_t> buffer_pool;
238
+ std::priority_queue<ggml_cann_buffer, std::vector<ggml_cann_buffer>, std::greater<>> free_buffers;
245
239
 
246
240
  /**
247
241
  * @brief Total size of all buffers in the pool.
@@ -254,7 +248,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
254
248
  * @param device The device ID to associate with this buffer pool.
255
249
  */
256
250
  explicit ggml_cann_pool_buf_prio(int device) : device(device) {
257
- disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
251
+ disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
258
252
  }
259
253
 
260
254
  /**
@@ -262,7 +256,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
262
256
  */
263
257
  ~ggml_cann_pool_buf_prio() {
264
258
  ggml_cann_set_device(device);
265
- for (auto& [b_ptr, b_size] : buffer_pool) {
259
+ for (auto & [b_ptr, b_size] : buffer_pool) {
266
260
  aclrtFree(b_ptr);
267
261
  pool_size -= b_size;
268
262
  }
@@ -278,14 +272,14 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
278
272
  * the allocated buffer.
279
273
  * @return A pointer to the allocated buffer.
280
274
  */
281
- void* alloc(size_t size, size_t* actual_size) override {
275
+ void * alloc(size_t size, size_t * actual_size) override {
282
276
  size = GGML_PAD(size, alignment);
283
277
  if (size == 0) {
284
278
  size = alignment;
285
279
  }
286
280
 
287
- void* ptr = nullptr;
288
- auto now = std::chrono::steady_clock::now();
281
+ void * ptr = nullptr;
282
+ auto now = std::chrono::steady_clock::now();
289
283
 
290
284
  std::vector<ggml_cann_buffer> free_buffers_rest;
291
285
  free_buffers_rest.reserve(free_buffers.size());
@@ -298,24 +292,22 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
298
292
  const size_t margin = b.size - size;
299
293
  if (margin <= max_reuse_margin) {
300
294
  *actual_size = b.size;
301
- ptr = b.ptr;
295
+ ptr = b.ptr;
302
296
  #ifdef DEBUG_CANN_MALLOC
303
297
  GGML_LOG_INFO(
304
298
  "cann pool[%d]: reused %p, "
305
299
  "pool_size = %5u MB, "
306
300
  "size = %5u MB, "
307
301
  "margin = %5u MB\n",
308
- device, b.ptr,
309
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
310
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
311
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
302
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
303
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
304
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
312
305
  #endif
313
306
  break;
314
307
  }
315
308
  }
316
309
 
317
- bool should_clean = !disable_clean &&
318
- b.size > min_free_margin &&
310
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
319
311
  std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
320
312
  if (should_clean) {
321
313
  // free the buffer if the size is needed to be freed
@@ -327,20 +319,20 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
327
319
  "cann pool[%d]: clean %p, "
328
320
  "pool_size = %5u MB, "
329
321
  "size = %5u MB\n",
330
- device, b.ptr,
331
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
332
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
322
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
323
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
333
324
  #endif
334
325
  continue;
335
326
  }
336
327
  free_buffers_rest.push_back(b);
337
328
  }
338
- for (ggml_cann_buffer &b : free_buffers_rest) {
329
+ for (ggml_cann_buffer & b : free_buffers_rest) {
339
330
  free_buffers.push(std::move(b));
340
331
  }
341
332
 
342
333
  #ifdef DEBUG_CANN_MALLOC
343
- GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
334
+ GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
335
+ (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
344
336
  #endif
345
337
  if (ptr != nullptr) {
346
338
  return ptr;
@@ -356,8 +348,8 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
356
348
  "cann pool[%d]: allocate %p, "
357
349
  "pool_size = %5u MB, "
358
350
  "size = %5u MB\n",
359
- device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
360
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
351
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
352
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576));
361
353
  #endif
362
354
  buffer_pool.emplace(ptr, size);
363
355
  return ptr;
@@ -369,7 +361,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
369
361
  * @param ptr Pointer to the buffer to free.
370
362
  * @param size Size of the buffer to free.
371
363
  */
372
- void free(void* ptr, size_t size) override {
364
+ void free(void * ptr, size_t size) override {
373
365
  GGML_UNUSED(size);
374
366
  auto it = buffer_pool.find(ptr);
375
367
  if (it == buffer_pool.end()) {
@@ -377,13 +369,12 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
377
369
  }
378
370
 
379
371
  auto now = std::chrono::steady_clock::now();
380
- free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
372
+ free_buffers.emplace(ggml_cann_buffer{ ptr, it->second, now });
381
373
  #ifdef DEBUG_CANN_MALLOC
382
374
  GGML_LOG_INFO(
383
375
  "cann pool[%d]: return %p, "
384
376
  "pool_size = %5u MB\n",
385
- device, ptr,
386
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
377
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
387
378
  #endif
388
379
  }
389
380
  };
@@ -402,7 +393,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
402
393
  /**
403
394
  * @brief The minimum free margin for a buffer.
404
395
  */
405
- static const size_t min_free_margin = 1ull << 20; // 1MB
396
+ static const size_t min_free_margin = 1ull << 20; // 1MB
406
397
 
407
398
  /**
408
399
  * @brief The alignment for buffer allocation.
@@ -428,10 +419,10 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
428
419
  * @brief Structure representing a CANN buffer.
429
420
  */
430
421
  struct ggml_cann_buffer {
431
- void* ptr = nullptr; ///< Pointer to the buffer memory.
432
- size_t size = 0; ///< Size of the buffer.
433
- bool used = false; ///< Whether the buffer is currently in use.
434
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
422
+ void * ptr = nullptr; ///< Pointer to the buffer memory.
423
+ size_t size = 0; ///< Size of the buffer.
424
+ bool used = false; ///< Whether the buffer is currently in use.
425
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
435
426
  };
436
427
 
437
428
  /**
@@ -450,7 +441,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
450
441
  * @param device The device ID to associate with this buffer pool.
451
442
  */
452
443
  explicit ggml_cann_pool_buf(int device) : device(device) {
453
- disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
444
+ disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
454
445
  }
455
446
 
456
447
  /**
@@ -459,7 +450,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
459
450
  ~ggml_cann_pool_buf() {
460
451
  ggml_cann_set_device(device);
461
452
  for (int i = 0; i < MAX_BUFFERS; ++i) {
462
- ggml_cann_buffer& b = buffer_pool[i];
453
+ ggml_cann_buffer & b = buffer_pool[i];
463
454
  if (b.ptr != nullptr) {
464
455
  aclrtFree(b.ptr);
465
456
  pool_size -= b.size;
@@ -476,18 +467,18 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
476
467
  * the allocated buffer.
477
468
  * @return A pointer to the allocated buffer.
478
469
  */
479
- void* alloc(size_t size, size_t* actual_size) override {
470
+ void * alloc(size_t size, size_t * actual_size) override {
480
471
  size = GGML_PAD(size, alignment);
481
472
  if (size == 0) {
482
473
  size = alignment;
483
474
  }
484
475
 
485
- void* ptr = nullptr;
486
- auto now = std::chrono::steady_clock::now();
476
+ void * ptr = nullptr;
477
+ auto now = std::chrono::steady_clock::now();
487
478
 
488
479
  int i = 0;
489
480
  for (; i < MAX_BUFFERS; ++i) {
490
- ggml_cann_buffer& b = buffer_pool[i];
481
+ ggml_cann_buffer & b = buffer_pool[i];
491
482
  if (b.ptr == nullptr) {
492
483
  break;
493
484
  }
@@ -499,25 +490,23 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
499
490
  const size_t margin = b.size - size;
500
491
  if (margin <= max_reuse_margin) {
501
492
  *actual_size = b.size;
502
- b.used = true;
503
- ptr = b.ptr;
493
+ b.used = true;
494
+ ptr = b.ptr;
504
495
  #ifdef DEBUG_CANN_MALLOC
505
496
  GGML_LOG_INFO(
506
497
  "cann pool[%d]: reused %p, "
507
498
  "pool_size = %5u MB, "
508
499
  "size = %5u MB, "
509
500
  "margin = %5u MB\n",
510
- device, b.ptr,
511
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
512
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
513
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
501
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
502
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
503
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
514
504
  #endif
515
505
  break;
516
506
  }
517
507
  }
518
508
 
519
- bool should_clean = !disable_clean &&
520
- b.size > min_free_margin &&
509
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
521
510
  std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
522
511
  if (should_clean) {
523
512
  // free the buffer if the size is needed to be freed
@@ -528,9 +517,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
528
517
  "cann pool[%d]: clean %p, "
529
518
  "pool_size = %5u MB, "
530
519
  "size = %5u MB\n",
531
- device, b.ptr,
532
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
533
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
520
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
521
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
534
522
  #endif
535
523
  b.ptr = nullptr;
536
524
  }
@@ -541,13 +529,13 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
541
529
 
542
530
  if (i < MAX_BUFFERS) {
543
531
  // allocate a new buffer if no buffer can be reused
544
- ggml_cann_buffer& b = buffer_pool[i];
532
+ ggml_cann_buffer & b = buffer_pool[i];
545
533
  ggml_cann_set_device(device);
546
534
  ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
547
535
  pool_size += size;
548
536
  *actual_size = size;
549
- b.size = size;
550
- b.used = true;
537
+ b.size = size;
538
+ b.used = true;
551
539
  if (i >= MAX_BUFFERS - 8) {
552
540
  GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
553
541
  }
@@ -556,9 +544,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
556
544
  "cann pool[%d]: allocate %p, "
557
545
  "pool_size = %5u MB, "
558
546
  "size = %5u MB\n",
559
- device, b.ptr,
560
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
561
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
547
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
548
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
562
549
  #endif
563
550
  return b.ptr;
564
551
  }
@@ -572,21 +559,20 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
572
559
  * @param ptr Pointer to the buffer to free.
573
560
  * @param size Size of the buffer to free.
574
561
  */
575
- void free(void* ptr, size_t size) override {
562
+ void free(void * ptr, size_t size) override {
576
563
  GGML_UNUSED(size);
577
564
  for (int i = 0; i < MAX_BUFFERS; ++i) {
578
- ggml_cann_buffer& b = buffer_pool[i];
565
+ ggml_cann_buffer & b = buffer_pool[i];
579
566
  if (b.ptr != ptr) {
580
567
  continue;
581
568
  }
582
- b.used = false;
569
+ b.used = false;
583
570
  b.last_used = std::chrono::steady_clock::now();
584
571
  #ifdef DEBUG_CANN_MALLOC
585
572
  GGML_LOG_INFO(
586
573
  "cann pool[%d]: return %p, "
587
574
  "pool_size = %5u MB\n",
588
- device, b.ptr,
589
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
575
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
590
576
  #endif
591
577
  return;
592
578
  }
@@ -614,7 +600,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
614
600
  /**
615
601
  * @brief Pointer to the start of the virtual memory pool.
616
602
  */
617
- void* pool_addr = 0;
603
+ void * pool_addr = 0;
618
604
 
619
605
  /**
620
606
  * @brief Amount of virtual memory used in the pool.
@@ -639,7 +625,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
639
625
  /**
640
626
  * @brief Offsets for the mapped memory regions.
641
627
  */
642
- std::vector<void*> map_offsets;
628
+ std::vector<void *> map_offsets;
643
629
 
644
630
  /**
645
631
  * @brief Constructor to initialize the buffer pool with virtual memory for
@@ -647,11 +633,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
647
633
  *
648
634
  * @param device The device ID to associate with this buffer pool.
649
635
  */
650
- explicit ggml_cann_pool_vmm(int device)
651
- : device(device) {
652
- auto dev = ggml_cann_info().devices[device];
636
+ explicit ggml_cann_pool_vmm(int device) : device(device) {
637
+ auto dev = ggml_cann_info().devices[device];
653
638
  granularity = dev.vmm_granularity;
654
- max_size = dev.total_vram;
639
+ max_size = dev.total_vram;
655
640
  }
656
641
 
657
642
  /**
@@ -659,10 +644,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
659
644
  */
660
645
  ~ggml_cann_pool_vmm() {
661
646
  if (pool_addr != 0) {
662
- for (auto& offset : map_offsets) {
647
+ for (auto & offset : map_offsets) {
663
648
  ACL_CHECK(aclrtUnmapMem(offset));
664
649
  }
665
- for (auto& handle : handles) {
650
+ for (auto & handle : handles) {
666
651
  ACL_CHECK(aclrtFreePhysical(handle));
667
652
  }
668
653
  ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
@@ -677,11 +662,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
677
662
  * the allocated buffer.
678
663
  * @return A pointer to the allocated buffer.
679
664
  */
680
- void* alloc(size_t size, size_t* actual_size) override {
665
+ void * alloc(size_t size, size_t * actual_size) override {
681
666
  // round up the allocation size to the alignment to ensure that all
682
667
  // allocations are aligned for all data types
683
668
  const size_t alignment = 128;
684
- size = GGML_PAD(size, alignment);
669
+ size = GGML_PAD(size, alignment);
685
670
  if (size == 0) {
686
671
  size = alignment;
687
672
  }
@@ -691,53 +676,51 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
691
676
  if (size > avail) {
692
677
  // round up to the next multiple of the granularity
693
678
  size_t reserve_size = size - avail;
694
- reserve_size = GGML_PAD(reserve_size, granularity);
679
+ reserve_size = GGML_PAD(reserve_size, granularity);
695
680
 
696
681
  GGML_ASSERT(pool_size + reserve_size <= max_size);
697
682
 
698
683
  // allocate more physical memory
699
684
  aclrtPhysicalMemProp prop = {};
700
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
701
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
702
- prop.memAttr = ACL_HBM_MEM_HUGE;
703
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
704
- prop.location.id = device;
705
- prop.reserve = 0;
685
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
686
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
687
+ prop.memAttr = ACL_HBM_MEM_HUGE;
688
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
689
+ prop.location.id = device;
690
+ prop.reserve = 0;
706
691
  aclrtDrvMemHandle handle;
707
692
  ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
708
693
 
709
694
  // reserve virtual address space (if not already reserved)
710
695
  if (pool_addr == 0) {
711
- ACL_CHECK(aclrtReserveMemAddress(
712
- &pool_addr, max_size, 0, NULL, 1));
696
+ ACL_CHECK(aclrtReserveMemAddress(&pool_addr, max_size, 0, NULL, 1));
713
697
  }
714
698
 
715
699
  // map at the end of the pool
716
- ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0,
717
- handle, 0));
700
+ ACL_CHECK(aclrtMapMem((char *) pool_addr + pool_size, reserve_size, 0, handle, 0));
718
701
 
719
702
  handles.push_back(handle);
720
- map_offsets.push_back((char*)pool_addr + pool_size);
703
+ map_offsets.push_back((char *) pool_addr + pool_size);
721
704
 
722
705
  // add to the pool
723
706
  pool_size += reserve_size;
724
707
 
725
708
  #ifdef DEBUG_CANN_MALLOC
726
- GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
727
- device, (unsigned long long) (pool_size/1024/1024),
728
- (unsigned long long) (reserve_size/1024/1024));
709
+ GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", device,
710
+ (unsigned long long) (pool_size / 1024 / 1024),
711
+ (unsigned long long) (reserve_size / 1024 / 1024));
729
712
  #endif
730
713
  }
731
714
 
732
715
  GGML_ASSERT(pool_addr != 0);
733
716
 
734
- void* ptr = (void*)((char*)pool_addr + pool_used);
717
+ void * ptr = (void *) ((char *) pool_addr + pool_used);
735
718
  *actual_size = size;
736
719
  pool_used += size;
737
720
 
738
721
  #ifdef DEBUG_CANN_MALLOC
739
- GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
740
- (unsigned long long)size, (unsigned long long)ptr);
722
+ GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size,
723
+ (unsigned long long) ptr);
741
724
  #endif
742
725
  return ptr;
743
726
  }
@@ -748,16 +731,16 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
748
731
  * @param ptr Pointer to the buffer to free.
749
732
  * @param size Size of the buffer to free.
750
733
  */
751
- void free(void* ptr, size_t size) override {
734
+ void free(void * ptr, size_t size) override {
752
735
  #ifdef DEBUG_CANN_MALLOC
753
- GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
754
- (unsigned long long)size, (unsigned long long)ptr);
736
+ GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size,
737
+ (unsigned long long) ptr);
755
738
  #endif
756
739
 
757
740
  pool_used -= size;
758
741
 
759
742
  // all deallocations must be in reverse order of the allocations
760
- GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used));
743
+ GGML_ASSERT(ptr == (void *) ((char *) pool_addr + pool_used));
761
744
  }
762
745
  };
763
746
 
@@ -769,9 +752,8 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
769
752
  * @param device The device ID for which to create the pool.
770
753
  * @return A unique pointer to the created CANN pool.
771
754
  */
772
- std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
773
- int device) {
774
- std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
755
+ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
756
+ std::string mem_pool_type = get_env_as_lowercase("GGML_CANN_MEM_POOL").value_or("");
775
757
 
776
758
  if (mem_pool_type == "prio") {
777
759
  GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
@@ -795,9 +777,8 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
795
777
  * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
796
778
  */
797
779
  struct ggml_backend_cann_buffer_context {
798
- int32_t device; ///< The device ID associated with this buffer context.
799
- void* dev_ptr =
800
- nullptr; ///< Pointer to the device memory allocated for the buffer.
780
+ int32_t device; ///< The device ID associated with this buffer context.
781
+ void * dev_ptr = nullptr; ///< Pointer to the device memory allocated for the buffer.
801
782
 
802
783
  /**
803
784
  * @brief Constructor to initialize the CANN buffer context.
@@ -805,9 +786,7 @@ struct ggml_backend_cann_buffer_context {
805
786
  * @param device The device ID associated with this buffer context.
806
787
  * @param dev_ptr Pointer to the device memory allocated for the buffer.
807
788
  */
808
- ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
809
- : device(device),
810
- dev_ptr(dev_ptr) {}
789
+ ggml_backend_cann_buffer_context(int32_t device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
811
790
 
812
791
  /**
813
792
  * @brief Destructor to free the device memory allocated for the buffer.
@@ -815,19 +794,44 @@ struct ggml_backend_cann_buffer_context {
815
794
  ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
816
795
  };
817
796
 
797
+ // cann buffer type
798
+ /**
799
+ * @brief Structure representing context information for a specific backend
800
+ * buffer type.
801
+ */
802
+ struct ggml_backend_cann_buffer_type_context {
803
+ int32_t device; /**< Device identifier associated with the buffer context. */
804
+ std::string name; /**< Name associated with the buffer context. */
805
+ };
806
+
807
+ /**
808
+ * @brief Retrieves the name associated with a CANN buffer type.
809
+ *
810
+ * This function returns the descriptive name associated with the specified
811
+ * CANN buffer type context.
812
+ *
813
+ * @param buft Pointer to the buffer type context.
814
+ * @return Const pointer to the C-style string containing the name.
815
+ */
816
+ static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
817
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
818
+
819
+ return buft_ctx->name.c_str();
820
+ }
821
+
818
822
  /**
819
- * @brief Check if a buffer is a CANN buffer.
823
+ * @brief Checks if the backend buffer type is associated with the CANN backend.
820
824
  *
821
- * This function checks if a given buffer is a CANN buffer by comparing its
822
- * `get_name` function pointer to `ggml_backend_cann_buffer_get_name`.
825
+ * This function checks whether the provided backend buffer type is associated
826
+ * with the CANN backend based on the comparison of its name retrieval function
827
+ * pointer.
823
828
  *
824
- * @param buffer The buffer to check.
825
- * @return true if the buffer is a CANN buffer, false otherwise.
829
+ * @param buft Pointer to the backend buffer type to check.
830
+ * @return bool Returns true if the buffer type is associated with the CANN
831
+ * backend, otherwise false.
826
832
  */
827
- static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
828
- static bool ggml_backend_buffer_is_cann(
829
- ggml_backend_buffer_t buffer) {
830
- return ggml_backend_buft_is_cann(buffer->buft);
833
+ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
834
+ return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
831
835
  }
832
836
 
833
837
  /**
@@ -838,10 +842,8 @@ static bool ggml_backend_buffer_is_cann(
838
842
  *
839
843
  * @param buffer The CANN buffer to free.
840
844
  */
841
- static void ggml_backend_cann_buffer_free_buffer(
842
- ggml_backend_buffer_t buffer) {
843
- ggml_backend_cann_buffer_context* ctx =
844
- (ggml_backend_cann_buffer_context*)buffer->context;
845
+ static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
846
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
845
847
  delete ctx;
846
848
  }
847
849
 
@@ -854,10 +856,8 @@ static void ggml_backend_cann_buffer_free_buffer(
854
856
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
855
857
  * @return A pointer to the base of the device memory allocated for the buffer.
856
858
  */
857
- static void* ggml_backend_cann_buffer_get_base(
858
- ggml_backend_buffer_t buffer) {
859
- ggml_backend_cann_buffer_context* ctx =
860
- (ggml_backend_cann_buffer_context*)buffer->context;
859
+ static void * ggml_backend_cann_buffer_get_base(ggml_backend_buffer_t buffer) {
860
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
861
861
  return ctx->dev_ptr;
862
862
  }
863
863
 
@@ -874,21 +874,17 @@ static void* ggml_backend_cann_buffer_get_base(
874
874
  * @param dst Pointer to the destination buffer where transformed data will be
875
875
  * stored.
876
876
  */
877
- static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
878
- const void* src,
879
- void* dst) {
880
-
881
- int64_t n_elems = ggml_nelements(tensor);
882
- int64_t groups = n_elems / QK4_0;
883
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
877
+ static void ggml_backend_cann_transform_q4_0(ggml_tensor * tensor, const void * src, void * dst) {
878
+ int64_t n_elems = ggml_nelements(tensor);
879
+ int64_t groups = n_elems / QK4_0;
880
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
884
881
 
885
- uint8_t* quant_offset = (uint8_t*)dst;
886
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
882
+ uint8_t * quant_offset = (uint8_t *) dst;
883
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
887
884
 
888
885
  for (int i = 0; i < groups; i++) {
889
- const block_q4_0* group =
890
- (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0));
891
- *scale_offset = group->d;
886
+ const block_q4_0 * group = (const block_q4_0 *) ((const char *) src + i * sizeof(block_q4_0));
887
+ *scale_offset = group->d;
892
888
  scale_offset++;
893
889
 
894
890
  // 0-15
@@ -907,8 +903,7 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
907
903
  }
908
904
 
909
905
  // put (uint4b_t -8) into int4b_t
910
- for (quant_offset = (uint8_t*)dst;
911
- quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
906
+ for (quant_offset = (uint8_t *) dst; quant_offset < (uint8_t *) dst + quant_bytes; quant_offset++) {
912
907
  (*quant_offset) ^= 0x88;
913
908
  }
914
909
  }
@@ -926,29 +921,27 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
926
921
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
927
922
  * will be stored.
928
923
  */
929
- static void ggml_backend_cann_transform_back_q4_0(
930
- const ggml_tensor* tensor, void* src, void* dst) {
931
-
932
- int64_t n_elems = ggml_nelements(tensor);
933
- int64_t groups = n_elems / QK4_0;
934
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
924
+ static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor * tensor, void * src, void * dst) {
925
+ int64_t n_elems = ggml_nelements(tensor);
926
+ int64_t groups = n_elems / QK4_0;
927
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
935
928
 
936
- uint8_t* quant_offset = (uint8_t*)src;
937
- uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
929
+ uint8_t * quant_offset = (uint8_t *) src;
930
+ uint16_t * scale_offset = (uint16_t *) ((char *) src + quant_bytes);
938
931
 
939
- for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) {
932
+ for (; quant_offset < (uint8_t *) src + quant_bytes; quant_offset++) {
940
933
  (*quant_offset) ^= 0x88;
941
934
  }
942
- quant_offset = (uint8_t*)src;
935
+ quant_offset = (uint8_t *) src;
943
936
 
944
937
  for (int i = 0; i < groups; i++) {
945
- block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0));
946
- group->d = *scale_offset;
938
+ block_q4_0 * group = (block_q4_0 *) ((char *) dst + i * sizeof(block_q4_0));
939
+ group->d = *scale_offset;
947
940
  scale_offset++;
948
941
 
949
942
  // 0-15
950
943
  for (int j = 0; j < QK4_0 / 2; j += 2) {
951
- group->qs[j] = ((*quant_offset) & 0x0F);
944
+ group->qs[j] = ((*quant_offset) & 0x0F);
952
945
  group->qs[j + 1] = ((*quant_offset) >> 4);
953
946
  quant_offset++;
954
947
  }
@@ -975,20 +968,17 @@ static void ggml_backend_cann_transform_back_q4_0(
975
968
  * @param dst Pointer to the destination buffer where transformed data will be
976
969
  * stored.
977
970
  */
978
- static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
979
- const void* src,
980
- void* dst) {
981
- int64_t n_elems = ggml_nelements(tensor);
982
- int64_t groups = n_elems / QK8_0;
983
- size_t quant_bytes = n_elems * sizeof(uint8_t);
971
+ static void ggml_backend_cann_transform_q8_0(ggml_tensor * tensor, const void * src, void * dst) {
972
+ int64_t n_elems = ggml_nelements(tensor);
973
+ int64_t groups = n_elems / QK8_0;
974
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
984
975
 
985
- uint8_t* quant_offset = (uint8_t*)dst;
986
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
976
+ uint8_t * quant_offset = (uint8_t *) dst;
977
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
987
978
 
988
979
  for (int i = 0; i < groups; i++) {
989
- const block_q8_0* group =
990
- (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0));
991
- *scale_offset = group->d;
980
+ const block_q8_0 * group = (const block_q8_0 *) ((const char *) src + i * sizeof(block_q8_0));
981
+ *scale_offset = group->d;
992
982
  scale_offset++;
993
983
  size_t group_quant_size = QK8_0 * sizeof(uint8_t);
994
984
  memcpy(quant_offset, group->qs, group_quant_size);
@@ -1009,19 +999,17 @@ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
1009
999
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
1010
1000
  * will be stored.
1011
1001
  */
1012
- static void ggml_backend_cann_transform_back_q8_0(
1013
- const ggml_tensor* tensor, const void* src, void* dst) {
1014
- int64_t n_elems = ggml_nelements(tensor);
1015
- int64_t groups = n_elems / QK8_0;
1016
- size_t quant_bytes = n_elems * sizeof(uint8_t);
1002
+ static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor * tensor, const void * src, void * dst) {
1003
+ int64_t n_elems = ggml_nelements(tensor);
1004
+ int64_t groups = n_elems / QK8_0;
1005
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
1017
1006
 
1018
- const uint8_t* quant_offset = (const uint8_t*)src;
1019
- const uint16_t* scale_offset =
1020
- (const uint16_t*)((const char*)src + quant_bytes);
1007
+ const uint8_t * quant_offset = (const uint8_t *) src;
1008
+ const uint16_t * scale_offset = (const uint16_t *) ((const char *) src + quant_bytes);
1021
1009
 
1022
1010
  for (int i = 0; i < groups; i++) {
1023
- block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
1024
- group->d = *scale_offset;
1011
+ block_q8_0 * group = (block_q8_0 *) ((char *) dst + i * sizeof(block_q8_0));
1012
+ group->d = *scale_offset;
1025
1013
  scale_offset++;
1026
1014
  size_t group_quant_size = QK8_0 * sizeof(uint8_t);
1027
1015
  memcpy(group->qs, quant_offset, group_quant_size);
@@ -1041,8 +1029,7 @@ static void ggml_backend_cann_transform_back_q8_0(
1041
1029
  * @param dst Pointer to the destination buffer where transformed data will be
1042
1030
  * stored.
1043
1031
  */
1044
- static void ggml_backend_cann_transform(ggml_tensor* tensor,
1045
- const void* src, void* dst) {
1032
+ static void ggml_backend_cann_transform(ggml_tensor * tensor, const void * src, void * dst) {
1046
1033
  switch (tensor->type) {
1047
1034
  case GGML_TYPE_Q4_0:
1048
1035
  ggml_backend_cann_transform_q4_0(tensor, src, dst);
@@ -1067,8 +1054,7 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor,
1067
1054
  * @param dst Pointer to the destination buffer where transformed tensor data
1068
1055
  * will be stored.
1069
1056
  */
1070
- static void ggml_backend_cann_transform_back(
1071
- const ggml_tensor* tensor, void* src, void* dst) {
1057
+ static void ggml_backend_cann_transform_back(const ggml_tensor * tensor, void * src, void * dst) {
1072
1058
  switch (tensor->type) {
1073
1059
  case GGML_TYPE_Q4_0:
1074
1060
  ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
@@ -1109,8 +1095,7 @@ static bool need_transform(ggml_type type) {
1109
1095
  * @param buffer The CANN buffer from which to initialize the tensor.
1110
1096
  * @param tensor Pointer to the tensor to be initialized.
1111
1097
  */
1112
- static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1113
- ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
1098
+ static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
1114
1099
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
1115
1100
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
1116
1101
  return GGML_STATUS_SUCCESS;
@@ -1121,13 +1106,11 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1121
1106
  if (ggml_is_quantized(tensor->type)) {
1122
1107
  // Initialize padding to 0 to avoid possible NaN values
1123
1108
  size_t original_size = ggml_nbytes(tensor);
1124
- size_t padded_size =
1125
- ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
1109
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
1126
1110
 
1127
1111
  if (padded_size > original_size && tensor->view_src == nullptr) {
1128
1112
  size_t memset_size = padded_size - original_size;
1129
- ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
1130
- memset_size, 0, memset_size));
1113
+ ACL_CHECK(aclrtMemset((char *) tensor->data + original_size, memset_size, 0, memset_size));
1131
1114
  }
1132
1115
  }
1133
1116
  return GGML_STATUS_SUCCESS;
@@ -1141,8 +1124,8 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1141
1124
  * designed to be used with a global array, one per device.
1142
1125
  */
1143
1126
  struct ggml_cann_nz_workspace {
1144
- void* ptr; // Pointer to allocated device buffer
1145
- size_t allocated; // Size of currently allocated buffer in bytes
1127
+ void * ptr; // Pointer to allocated device buffer
1128
+ size_t allocated; // Size of currently allocated buffer in bytes
1146
1129
 
1147
1130
  /**
1148
1131
  * @brief Constructor. Initializes the workspace with no allocated memory.
@@ -1158,7 +1141,7 @@ struct ggml_cann_nz_workspace {
1158
1141
  void clear() {
1159
1142
  if (ptr) {
1160
1143
  ACL_CHECK(aclrtFree(ptr));
1161
- ptr = nullptr;
1144
+ ptr = nullptr;
1162
1145
  allocated = 0;
1163
1146
  }
1164
1147
  }
@@ -1185,7 +1168,7 @@ struct ggml_cann_nz_workspace {
1185
1168
  *
1186
1169
  * @return Pointer to the allocated buffer, or nullptr if not allocated.
1187
1170
  */
1188
- void* get() const { return ptr; }
1171
+ void * get() const { return ptr; }
1189
1172
  };
1190
1173
 
1191
1174
  /**
@@ -1207,22 +1190,19 @@ static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
1207
1190
  * @note The workspace buffer used in this function is managed globally and reused
1208
1191
  * across calls. This reduces overhead from repeated memory allocation and deallocation.
1209
1192
  */
1210
- static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) {
1211
- aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
1212
- tensor->nb, 2, ACL_FORMAT_ND, offset);
1213
- uint64_t workspaceSize = 0;
1214
- aclOpExecutor *executor;
1193
+ static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
1194
+ acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
1195
+ uint64_t workspaceSize = 0;
1196
+ aclOpExecutor * executor;
1215
1197
 
1216
1198
  // TransMatmulWeight
1217
- ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
1218
- &workspaceSize, &executor));
1199
+ ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed.get(), &workspaceSize, &executor));
1219
1200
  // Avoid frequent malloc/free of the workspace.
1220
1201
  g_nz_workspaces[device].realloc(workspaceSize);
1221
1202
 
1222
- void* g_nz_workspace = g_nz_workspaces[device].get();
1203
+ void * g_nz_workspace = g_nz_workspaces[device].get();
1223
1204
 
1224
1205
  ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
1225
- ACL_CHECK(aclDestroyTensor(weightTransposed));
1226
1206
  }
1227
1207
 
1228
1208
  // TODO: need handle tensor which has paddings.
@@ -1238,11 +1218,12 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device)
1238
1218
  * @param offset Offset in the source data from where to start copying.
1239
1219
  * @param size Size of the data to be copied, in bytes.
1240
1220
  */
1241
- static void ggml_backend_cann_buffer_set_tensor(
1242
- ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
1243
- size_t offset, size_t size) {
1244
- ggml_backend_cann_buffer_context *ctx =
1245
- (ggml_backend_cann_buffer_context *)buffer->context;
1221
+ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
1222
+ ggml_tensor * tensor,
1223
+ const void * data,
1224
+ size_t offset,
1225
+ size_t size) {
1226
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1246
1227
 
1247
1228
  ggml_cann_set_device(ctx->device);
1248
1229
  // TODO: refer to cann(#6017), it use thread's default stream.
@@ -1250,22 +1231,19 @@ static void ggml_backend_cann_buffer_set_tensor(
1250
1231
  // Why aclrtSynchronizeDevice?
1251
1232
 
1252
1233
  // Only check env once.
1253
- static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
1234
+ static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
1254
1235
  if (!need_transform(tensor->type)) {
1255
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
1256
- ACL_MEMCPY_HOST_TO_DEVICE));
1257
- if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
1236
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
1237
+ if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
1258
1238
  GGML_ASSERT(tensor->ne[2] == 1);
1259
1239
  GGML_ASSERT(tensor->ne[3] == 1);
1260
1240
  weight_format_to_nz(tensor, offset, ctx->device);
1261
1241
  }
1262
1242
  } else {
1263
- void *transform_buffer = malloc(size);
1243
+ void * transform_buffer = malloc(size);
1264
1244
  ggml_backend_cann_transform(tensor, data, transform_buffer);
1265
1245
 
1266
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
1267
- transform_buffer, size,
1268
- ACL_MEMCPY_HOST_TO_DEVICE));
1246
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
1269
1247
  free(transform_buffer);
1270
1248
  }
1271
1249
  }
@@ -1283,22 +1261,20 @@ static void ggml_backend_cann_buffer_set_tensor(
1283
1261
  * @param offset Offset in the destination buffer where to start copying.
1284
1262
  * @param size Size of the data to be copied, in bytes.
1285
1263
  */
1286
- static void ggml_backend_cann_buffer_get_tensor(
1287
- ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
1288
- size_t offset, size_t size) {
1289
- ggml_backend_cann_buffer_context* ctx =
1290
- (ggml_backend_cann_buffer_context*)buffer->context;
1264
+ static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
1265
+ const ggml_tensor * tensor,
1266
+ void * data,
1267
+ size_t offset,
1268
+ size_t size) {
1269
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1291
1270
 
1292
1271
  ggml_cann_set_device(ctx->device);
1293
1272
 
1294
1273
  if (!need_transform(tensor->type)) {
1295
- ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
1296
- ACL_MEMCPY_DEVICE_TO_HOST));
1274
+ ACL_CHECK(aclrtMemcpy(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
1297
1275
  } else {
1298
- void* transform_buffer = malloc(size);
1299
- ACL_CHECK(aclrtMemcpy(transform_buffer, size,
1300
- (char*)tensor->data + offset, size,
1301
- ACL_MEMCPY_DEVICE_TO_HOST));
1276
+ void * transform_buffer = malloc(size);
1277
+ ACL_CHECK(aclrtMemcpy(transform_buffer, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
1302
1278
  ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1303
1279
  free(transform_buffer);
1304
1280
  }
@@ -1317,19 +1293,17 @@ static void ggml_backend_cann_buffer_get_tensor(
1317
1293
  * @param dst Pointer to the destination tensor where the data will be copied.
1318
1294
  * @return true if the copy operation succeeded, false otherwise.
1319
1295
  */
1320
- static bool ggml_backend_cann_buffer_cpy_tensor(
1321
- ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
1322
- if (ggml_backend_buffer_is_cann(src->buffer)) {
1323
- ggml_backend_cann_buffer_context* src_ctx =
1324
- (ggml_backend_cann_buffer_context*)src->buffer->context;
1325
- ggml_backend_cann_buffer_context* dst_ctx =
1326
- (ggml_backend_cann_buffer_context*)buffer->context;
1296
+ static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
1297
+ const ggml_tensor * src,
1298
+ ggml_tensor * dst) {
1299
+ if (ggml_backend_buft_is_cann(src->buffer->buft)) {
1300
+ ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
1301
+ ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1327
1302
 
1328
1303
  size_t memcpy_size = ggml_nbytes(src);
1329
1304
  // Same device.
1330
1305
  if (src_ctx->device == dst_ctx->device) {
1331
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1332
- (const char*)src->data, memcpy_size,
1306
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
1333
1307
  ACL_MEMCPY_DEVICE_TO_DEVICE));
1334
1308
  return true;
1335
1309
  } else {
@@ -1339,13 +1313,11 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
1339
1313
  #endif
1340
1314
  // Different device but can access by peer.
1341
1315
  int32_t canAccessPeer = 0;
1342
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
1343
- dst_ctx->device));
1316
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, dst_ctx->device));
1344
1317
  if (canAccessPeer) {
1345
1318
  ggml_cann_set_device(src_ctx->device);
1346
1319
  ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
1347
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1348
- (const char*)src->data, memcpy_size,
1320
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
1349
1321
  ACL_MEMCPY_DEVICE_TO_DEVICE));
1350
1322
  return true;
1351
1323
  }
@@ -1363,10 +1335,8 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
1363
1335
  * @param buffer The CANN buffer to be cleared.
1364
1336
  * @param value The value to which each byte in the buffer will be set.
1365
1337
  */
1366
- static void ggml_backend_cann_buffer_clear(
1367
- ggml_backend_buffer_t buffer, uint8_t value) {
1368
- ggml_backend_cann_buffer_context* ctx =
1369
- (ggml_backend_cann_buffer_context*)buffer->context;
1338
+ static void ggml_backend_cann_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1339
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1370
1340
 
1371
1341
  ggml_cann_set_device(ctx->device);
1372
1342
  ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
@@ -1390,34 +1360,6 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
1390
1360
  /* .reset = */ NULL,
1391
1361
  };
1392
1362
 
1393
- // cann buffer type
1394
- /**
1395
- * @brief Structure representing context information for a specific backend
1396
- * buffer type.
1397
- */
1398
- struct ggml_backend_cann_buffer_type_context {
1399
- int32_t
1400
- device; /**< Device identifier associated with the buffer context. */
1401
- std::string name; /**< Name associated with the buffer context. */
1402
- };
1403
-
1404
- /**
1405
- * @brief Retrieves the name associated with a CANN buffer type.
1406
- *
1407
- * This function returns the descriptive name associated with the specified
1408
- * CANN buffer type context.
1409
- *
1410
- * @param buft Pointer to the buffer type context.
1411
- * @return Const pointer to the C-style string containing the name.
1412
- */
1413
- static const char* ggml_backend_cann_buffer_type_name(
1414
- ggml_backend_buffer_type_t buft) {
1415
- ggml_backend_cann_buffer_type_context* buft_ctx =
1416
- (ggml_backend_cann_buffer_type_context*)buft->context;
1417
-
1418
- return buft_ctx->name.c_str();
1419
- }
1420
-
1421
1363
  /**
1422
1364
  * @brief Allocates a new CANN buffer of the specified type and size.
1423
1365
  *
@@ -1428,34 +1370,27 @@ static const char* ggml_backend_cann_buffer_type_name(
1428
1370
  * @param size Size in bytes of the buffer to allocate.
1429
1371
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
1430
1372
  */
1431
- static ggml_backend_buffer_t
1432
- ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1433
- size_t size) {
1434
- ggml_backend_cann_buffer_type_context* buft_ctx =
1435
- (ggml_backend_cann_buffer_type_context*)buft->context;
1373
+ static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1374
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
1436
1375
 
1437
1376
  ggml_cann_set_device(buft_ctx->device);
1438
1377
 
1439
1378
  const size_t alignment = 128;
1440
- size = GGML_PAD(size, alignment);
1379
+ size = GGML_PAD(size, alignment);
1441
1380
  if (size == 0) {
1442
1381
  size = alignment;
1443
1382
  }
1444
- void* dev_ptr;
1383
+ void * dev_ptr;
1445
1384
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1446
1385
  if (err != ACL_SUCCESS) {
1447
- GGML_LOG_ERROR(
1448
- "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
1449
- __func__, size / 1024.0 / 1024.0, buft_ctx->device,
1450
- aclGetRecentErrMsg());
1386
+ GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__,
1387
+ size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg());
1451
1388
  return nullptr;
1452
1389
  }
1453
1390
 
1454
- ggml_backend_cann_buffer_context* ctx =
1455
- new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
1391
+ ggml_backend_cann_buffer_context * ctx = new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
1456
1392
 
1457
- return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
1458
- ctx, size);
1393
+ return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size);
1459
1394
  }
1460
1395
 
1461
1396
  /**
@@ -1470,8 +1405,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1470
1405
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
1471
1406
  * buffers).
1472
1407
  */
1473
- static size_t ggml_backend_cann_buffer_type_get_alignment(
1474
- ggml_backend_buffer_type_t buft) {
1408
+ static size_t ggml_backend_cann_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1475
1409
  return 128;
1476
1410
 
1477
1411
  GGML_UNUSED(buft);
@@ -1491,13 +1425,13 @@ static size_t ggml_backend_cann_buffer_type_get_alignment(
1491
1425
  * @return The total allocation size in bytes required for the tensor in the
1492
1426
  * CANN buffer.
1493
1427
  */
1494
- static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1495
- ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
1496
- size_t size = ggml_nbytes(tensor);
1497
- int64_t ne0 = tensor->ne[0];
1428
+ static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
1429
+ const ggml_tensor * tensor) {
1430
+ size_t size = ggml_nbytes(tensor);
1431
+ int64_t ne0 = tensor->ne[0];
1498
1432
 
1499
1433
  // Only check env once.
1500
- static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
1434
+ static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
1501
1435
 
1502
1436
  // last line must bigger than 32, because every single op deal at
1503
1437
  // least 32 bytes.
@@ -1507,19 +1441,17 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1507
1441
  // size += (line_size_align_32 - line_size);
1508
1442
  if (ggml_is_quantized(tensor->type)) {
1509
1443
  if (ne0 % MATRIX_ROW_PADDING != 0) {
1510
- size += ggml_row_size(
1511
- tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1444
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1512
1445
  }
1513
- } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
1446
+ } else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
1514
1447
  // NZ format weight are not support quantized yet.
1515
1448
  // If ND tensor transform to NZ, size may changed.
1516
- int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
1449
+ int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
1517
1450
  GGML_ASSERT(tensor->ne[2] == 1);
1518
1451
  GGML_ASSERT(tensor->ne[3] == 1);
1519
- const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
1520
- size_t new_size;
1521
- ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
1522
- ggml_cann_type_mapping(tensor->type), &new_size));
1452
+ const aclIntArray * acl_shape = aclCreateIntArray(shape, 2);
1453
+ size_t new_size;
1454
+ ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, ggml_cann_type_mapping(tensor->type), &new_size));
1523
1455
  ACL_CHECK(aclDestroyIntArray(acl_shape));
1524
1456
  size = std::max(size, new_size);
1525
1457
  }
@@ -1560,17 +1492,15 @@ static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface
1560
1492
  * @return A pointer to the buffer type interface for the specified device, or
1561
1493
  * nullptr if the device index is out of range.
1562
1494
  */
1563
- ggml_backend_buffer_type_t
1564
- ggml_backend_cann_buffer_type(int32_t device) {
1565
- static std::mutex mutex;
1495
+ ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) {
1496
+ static std::mutex mutex;
1566
1497
  std::lock_guard<std::mutex> lock(mutex);
1567
1498
 
1568
1499
  if (device >= ggml_backend_cann_get_device_count()) {
1569
1500
  return nullptr;
1570
1501
  }
1571
1502
 
1572
- static ggml_backend_buffer_type
1573
- ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
1503
+ static ggml_backend_buffer_type ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
1574
1504
 
1575
1505
  static bool ggml_backend_cann_buffer_type_initialized = false;
1576
1506
 
@@ -1580,8 +1510,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
1580
1510
  /* .iface = */ ggml_backend_cann_buffer_type_interface,
1581
1511
  /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
1582
1512
  /* .context = */
1583
- new ggml_backend_cann_buffer_type_context{
1584
- i, "CANN" + std::to_string(i)},
1513
+ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i) },
1585
1514
  };
1586
1515
  }
1587
1516
  ggml_backend_cann_buffer_type_initialized = true;
@@ -1645,16 +1574,16 @@ static void * ggml_cann_host_malloc(size_t size) {
1645
1574
  }
1646
1575
 
1647
1576
  const size_t alignment = 128;
1648
- size = GGML_PAD(size, alignment);
1577
+ size = GGML_PAD(size, alignment);
1649
1578
  if (size == 0) {
1650
1579
  size = alignment;
1651
1580
  }
1652
1581
 
1653
- void * hostPtr = nullptr;
1654
- aclError err = aclrtMallocHost((void **) &hostPtr, size);
1582
+ void * hostPtr = nullptr;
1583
+ aclError err = aclrtMallocHost((void **) &hostPtr, size);
1655
1584
  if (err != ACL_SUCCESS) {
1656
- GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1657
- size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1585
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0,
1586
+ aclGetRecentErrMsg());
1658
1587
  return nullptr;
1659
1588
  }
1660
1589
  return hostPtr;
@@ -1667,7 +1596,8 @@ static void * ggml_cann_host_malloc(size_t size) {
1667
1596
  * @param size Size in bytes of the host buffer to allocate.
1668
1597
  * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
1669
1598
  */
1670
- static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1599
+ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1600
+ size_t size) {
1671
1601
  void * hostPtr = ggml_cann_host_malloc(size);
1672
1602
 
1673
1603
  if (hostPtr == nullptr) {
@@ -1676,8 +1606,8 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
1676
1606
  }
1677
1607
 
1678
1608
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
1679
- buffer->buft = buft;
1680
- buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1609
+ buffer->buft = buft;
1610
+ buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1681
1611
 
1682
1612
  return buffer;
1683
1613
  }
@@ -1691,14 +1621,15 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
1691
1621
  ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1692
1622
  static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
1693
1623
  /* .iface = */ {
1694
- /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1695
- /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1696
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1697
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1624
+ /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1625
+ /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1626
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1627
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1698
1628
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1699
- /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1700
- },
1701
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1629
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1630
+ },
1631
+ /* .device = */
1632
+ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1702
1633
  /* .context = */ nullptr,
1703
1634
  };
1704
1635
 
@@ -1718,8 +1649,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1718
1649
  * stored.
1719
1650
  * @return true if the computation was successful; false otherwise.
1720
1651
  */
1721
- static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1722
- struct ggml_tensor* dst) {
1652
+ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) {
1723
1653
  switch (dst->op) {
1724
1654
  case GGML_OP_REPEAT:
1725
1655
  ggml_cann_repeat(ctx, dst);
@@ -1765,14 +1695,14 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1765
1695
  case GGML_UNARY_OP_SILU:
1766
1696
  GGML_CANN_CALL_OP_UNARY(Silu);
1767
1697
  break;
1768
- case GGML_UNARY_OP_GELU_QUICK: {
1769
- auto lambda = [](ggml_backend_cann_context& ctx,
1770
- aclTensor* acl_src,
1771
- aclTensor* acl_dst) {
1772
- GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1773
- };
1774
- ggml_cann_op_unary(lambda, ctx, dst);
1775
- } break;
1698
+ case GGML_UNARY_OP_GELU_QUICK:
1699
+ {
1700
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1701
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1702
+ };
1703
+ ggml_cann_op_unary(lambda, ctx, dst);
1704
+ }
1705
+ break;
1776
1706
  case GGML_UNARY_OP_TANH:
1777
1707
  GGML_CANN_CALL_OP_UNARY(Tanh);
1778
1708
  break;
@@ -1817,14 +1747,14 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1817
1747
  case GGML_GLU_OP_SWIGLU:
1818
1748
  GGML_CANN_CALL_OP_UNARY_GATED(Silu);
1819
1749
  break;
1820
- case GGML_GLU_OP_GEGLU_QUICK: {
1821
- auto lambda = [](ggml_backend_cann_context& ctx,
1822
- aclTensor* acl_src,
1823
- aclTensor* acl_dst) {
1824
- GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1825
- };
1826
- ggml_cann_op_unary_gated(lambda, ctx, dst);
1827
- } break;
1750
+ case GGML_GLU_OP_GEGLU_QUICK:
1751
+ {
1752
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1753
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1754
+ };
1755
+ ggml_cann_op_unary_gated(lambda, ctx, dst);
1756
+ }
1757
+ break;
1828
1758
  default:
1829
1759
  return false;
1830
1760
  }
@@ -1835,6 +1765,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1835
1765
  case GGML_OP_GROUP_NORM:
1836
1766
  ggml_cann_group_norm(ctx, dst);
1837
1767
  break;
1768
+ case GGML_OP_L2_NORM:
1769
+ ggml_cann_l2_norm(ctx, dst);
1770
+ break;
1771
+ case GGML_OP_CROSS_ENTROPY_LOSS:
1772
+ ggml_cann_cross_entropy_loss(ctx, dst);
1773
+ break;
1838
1774
  case GGML_OP_CONCAT:
1839
1775
  ggml_cann_concat(ctx, dst);
1840
1776
  break;
@@ -1939,6 +1875,15 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1939
1875
  case GGML_OP_FLASH_ATTN_EXT:
1940
1876
  ggml_cann_flash_attn_ext(ctx, dst);
1941
1877
  break;
1878
+ case GGML_OP_OUT_PROD:
1879
+ ggml_cann_out_prod(ctx, dst);
1880
+ break;
1881
+ case GGML_OP_GATED_LINEAR_ATTN:
1882
+ ggml_cann_gated_linear_attn(ctx, dst);
1883
+ break;
1884
+ case GGML_OP_SSM_CONV:
1885
+ ggml_cann_ssm_conv(ctx, dst);
1886
+ break;
1942
1887
  default:
1943
1888
  return false;
1944
1889
  }
@@ -1956,9 +1901,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1956
1901
  * @param backend Pointer to the CANN backend structure.
1957
1902
  * @return A pointer to a constant string representing the backend name.
1958
1903
  */
1959
- static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1960
- ggml_backend_cann_context* cann_ctx =
1961
- (ggml_backend_cann_context*)backend->context;
1904
+ static const char * ggml_backend_cann_name(ggml_backend_t backend) {
1905
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1962
1906
 
1963
1907
  return cann_ctx->name.c_str();
1964
1908
  }
@@ -1972,8 +1916,7 @@ static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1972
1916
  * @param backend Pointer to the CANN backend structure to be freed.
1973
1917
  */
1974
1918
  static void ggml_backend_cann_free(ggml_backend_t backend) {
1975
- ggml_backend_cann_context* cann_ctx =
1976
- (ggml_backend_cann_context*)backend->context;
1919
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1977
1920
  ACL_CHECK(aclrtSynchronizeDevice());
1978
1921
  ACL_CHECK(aclrtResetDevice(cann_ctx->device));
1979
1922
 
@@ -1981,7 +1924,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1981
1924
  delete backend;
1982
1925
  }
1983
1926
 
1984
-
1985
1927
  /**
1986
1928
  * @brief Sets tensor data asynchronously in the CANN backend.
1987
1929
  *
@@ -1994,21 +1936,18 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1994
1936
  * @param size Size of the data to copy in bytes.
1995
1937
  */
1996
1938
  static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1997
- ggml_tensor *tensor,
1998
- const void *data,
1999
- size_t offset,
2000
- size_t size) {
2001
- ggml_backend_cann_context *cann_ctx =
2002
- (ggml_backend_cann_context *)backend->context;
2003
- ggml_backend_buffer_t buf =
2004
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2005
-
2006
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
2007
- "unsupported buffer type");
1939
+ ggml_tensor * tensor,
1940
+ const void * data,
1941
+ size_t offset,
1942
+ size_t size) {
1943
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1944
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1945
+
1946
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
2008
1947
  GGML_ASSERT(!ggml_is_quantized(tensor->type));
2009
1948
 
2010
- ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
2011
- ACL_MEMCPY_HOST_TO_DEVICE);
1949
+ ACL_CHECK(aclrtMemcpyAsync((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE,
1950
+ cann_ctx->stream()));
2012
1951
  }
2013
1952
 
2014
1953
  /**
@@ -2022,21 +1961,19 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
2022
1961
  * @param offset Offset in bytes within the host data.
2023
1962
  * @param size Size of the data to copy in bytes.
2024
1963
  */
2025
- static void ggml_backend_cann_get_tensor_async(
2026
- ggml_backend_t backend, const ggml_tensor *tensor, void *data,
2027
- size_t offset, size_t size) {
2028
- ggml_backend_cann_context *cann_ctx =
2029
- (ggml_backend_cann_context *)backend->context;
2030
- ggml_backend_buffer_t buf =
2031
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1964
+ static void ggml_backend_cann_get_tensor_async(ggml_backend_t backend,
1965
+ const ggml_tensor * tensor,
1966
+ void * data,
1967
+ size_t offset,
1968
+ size_t size) {
1969
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1970
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2032
1971
 
2033
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
2034
- "unsupported buffer type");
1972
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
2035
1973
  GGML_ASSERT(!ggml_is_quantized(tensor->type));
2036
1974
 
2037
- ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
2038
- ACL_MEMCPY_DEVICE_TO_HOST);
2039
-
1975
+ ACL_CHECK(aclrtMemcpyAsync(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST,
1976
+ cann_ctx->stream()));
2040
1977
  }
2041
1978
 
2042
1979
  /**
@@ -2052,28 +1989,23 @@ static void ggml_backend_cann_get_tensor_async(
2052
1989
  * @param dst Pointer to the destination tensor to copy data to.
2053
1990
  * @return true if the copy operation succeeds, false otherwise.
2054
1991
  */
2055
- static bool ggml_backend_cann_cpy_tensor_async(
2056
- ggml_backend_t backend_src, ggml_backend_t backend_dst,
2057
- const ggml_tensor* src, ggml_tensor* dst) {
2058
- GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
2059
- ggml_backend_is_cann(backend_dst));
1992
+ static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
1993
+ ggml_backend_t backend_dst,
1994
+ const ggml_tensor * src,
1995
+ ggml_tensor * dst) {
1996
+ GGML_ASSERT(ggml_backend_is_cann(backend_src) || ggml_backend_is_cann(backend_dst));
2060
1997
 
2061
- GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
1998
+ GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
2062
1999
 
2063
- if (!ggml_backend_buffer_is_cann(src->buffer) ||
2064
- !ggml_backend_buffer_is_cann(dst->buffer)) {
2000
+ if (!ggml_backend_buft_is_cann(src->buffer->buft) || !ggml_backend_buft_is_cann(dst->buffer->buft)) {
2065
2001
  return false;
2066
2002
  }
2067
2003
 
2068
- ggml_backend_buffer_t buf_src =
2069
- src->view_src ? src->view_src->buffer : src->buffer;
2070
- ggml_backend_buffer_t buf_dst =
2071
- dst->view_src ? dst->view_src->buffer : dst->buffer;
2004
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2005
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2072
2006
 
2073
- ggml_backend_cann_context* cann_ctx_src =
2074
- (ggml_backend_cann_context*)backend_src->context;
2075
- ggml_backend_cann_context* cann_ctx_dst =
2076
- (ggml_backend_cann_context*)backend_dst->context;
2007
+ ggml_backend_cann_context * cann_ctx_src = (ggml_backend_cann_context *) backend_src->context;
2008
+ ggml_backend_cann_context * cann_ctx_dst = (ggml_backend_cann_context *) backend_dst->context;
2077
2009
 
2078
2010
  size_t copy_size = ggml_nbytes(dst);
2079
2011
  if (copy_size == 0) {
@@ -2084,17 +2016,14 @@ static bool ggml_backend_cann_cpy_tensor_async(
2084
2016
  // TODO: Support 310p P2P copy
2085
2017
  return false;
2086
2018
  #endif
2087
- ggml_backend_cann_buffer_context* buf_ctx_src =
2088
- (ggml_backend_cann_buffer_context*)buf_src->context;
2089
- ggml_backend_cann_buffer_context* buf_ctx_dst =
2090
- (ggml_backend_cann_buffer_context*)buf_dst->context;
2019
+ ggml_backend_cann_buffer_context * buf_ctx_src = (ggml_backend_cann_buffer_context *) buf_src->context;
2020
+ ggml_backend_cann_buffer_context * buf_ctx_dst = (ggml_backend_cann_buffer_context *) buf_dst->context;
2091
2021
 
2092
2022
  GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
2093
2023
  GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
2094
2024
 
2095
2025
  int32_t canAccessPeer = 0;
2096
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device,
2097
- cann_ctx_dst->device));
2026
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, cann_ctx_dst->device));
2098
2027
  if (!canAccessPeer) {
2099
2028
  return false;
2100
2029
  }
@@ -2105,9 +2034,7 @@ static bool ggml_backend_cann_cpy_tensor_async(
2105
2034
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
2106
2035
 
2107
2036
  // wait for task_queue empty to keep task order.
2108
- cann_ctx_src->task_queue.wait();
2109
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
2110
- ACL_MEMCPY_DEVICE_TO_DEVICE,
2037
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
2111
2038
  cann_ctx_src->stream()));
2112
2039
  // record event on src stream after the copy
2113
2040
  // TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
@@ -2122,8 +2049,7 @@ static bool ggml_backend_cann_cpy_tensor_async(
2122
2049
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
2123
2050
  } else {
2124
2051
  // src and dst are on the same backend
2125
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
2126
- ACL_MEMCPY_DEVICE_TO_DEVICE,
2052
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
2127
2053
  cann_ctx_dst->stream()));
2128
2054
  }
2129
2055
 
@@ -2139,147 +2065,44 @@ static bool ggml_backend_cann_cpy_tensor_async(
2139
2065
  * @param backend Pointer to the CANN backend structure to synchronize.
2140
2066
  */
2141
2067
  static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
2142
- ggml_backend_cann_context* cann_ctx =
2143
- (ggml_backend_cann_context*)backend->context;
2144
- cann_ctx->task_queue.wait();
2068
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2145
2069
  ggml_cann_set_device(cann_ctx->device);
2146
2070
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
2147
2071
  }
2148
2072
 
2149
- #ifdef USE_ACL_GRAPH
2150
- /**
2151
- * @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
2152
- *
2153
- * This function creates a new ggml_cann_graph object and fills its node properties
2154
- * (operation type, dimensions, strides, input sources, and operation parameters)
2155
- * based on the current ggml computation graph.
2156
- *
2157
- * Each node in the ggml graph is mapped to a property entry in the new CANN graph:
2158
- * - node address
2159
- * - operation type
2160
- * - shape (ne) and strides (nb)
2161
- * - source tensor addresses
2162
- * - operation parameters
2163
- *
2164
- * After initialization, the new graph is pushed into the LRU cache owned by the
2165
- * CANN backend context. The cache takes ownership of the graph and manages its
2166
- * lifetime (including deletion upon eviction).
2167
- *
2168
- * @param cann_ctx The CANN backend context containing the graph cache.
2169
- * @param cgraph The current ggml computation graph.
2170
- */
2171
- static void add_lru_matched_graph_node_properties(
2172
- ggml_backend_cann_context * cann_ctx,
2173
- ggml_cgraph * cgraph) {
2174
- // Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
2175
- ggml_cann_graph * new_graph = new ggml_cann_graph();
2176
- new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2177
-
2178
- for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
2179
- ggml_tensor * node = cgraph->nodes[node_idx];
2180
- auto & prop = new_graph->ggml_graph_properties[node_idx];
2181
-
2182
- prop.node_address = node->data;
2183
- prop.node_op = node->op;
2184
-
2185
- std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
2186
- std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
2187
-
2188
- for (int src = 0; src < GGML_MAX_SRC; ++src) {
2189
- prop.src_address[src] = node->src[src] ? node->src[src]->data : nullptr;
2190
- }
2191
-
2192
- memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
2193
- }
2194
-
2195
- // Insert into the LRU cache (cache takes ownership and will delete it when evicted).
2196
- cann_ctx->graph_lru_cache.push(new_graph);
2197
- }
2198
-
2199
2073
  /**
2200
- * @brief Check if a ggml tensor node matches a previously captured CANN graph node.
2074
+ * @brief Check if CANN backend can fuse the specified operation sequence
2201
2075
  *
2202
- * This function compares all relevant fields (address, op type, shape, source inputs, op params)
2203
- * to determine whether the current node matches a previously recorded version.
2076
+ * This function determines whether an operation sequence starting from the specified node
2077
+ * can be fused into an optimized operation in the CANN backend. Operation fusion can reduce
2078
+ * memory access overhead and improve computational efficiency.
2204
2079
  *
2205
- * @param node The current ggml tensor node.
2206
- * @param graph_node_properties The stored properties of a CANN graph node.
2207
- * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
2080
+ * @param cgraph Pointer to the computation graph
2081
+ * @param node_idx Index of the starting node in the computation graph
2082
+ * @param ops Sequence of operation types to check for fusion
2083
+ * @return true if the operations can be fused
2084
+ * @return false if the operations cannot be fused
2208
2085
  */
2209
- static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2210
- if (node->data != graph_node_properties->node_address &&
2211
- node->op != GGML_OP_VIEW) {
2086
+ static bool ggml_cann_can_fuse(const struct ggml_cgraph * cgraph,
2087
+ int node_idx,
2088
+ std::initializer_list<enum ggml_op> ops) {
2089
+ if (!ggml_can_fuse(cgraph, node_idx, ops)) {
2212
2090
  return false;
2213
2091
  }
2214
- if (node->op != graph_node_properties->node_op) {
2215
- return false;
2216
- }
2217
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
2218
- if (node->ne[i] != graph_node_properties->ne[i]) {
2219
- return false;
2220
- }
2221
- if (node->nb[i] != graph_node_properties->nb[i]) {
2222
- return false;
2223
- }
2224
- }
2225
- for (int i = 0; i < GGML_MAX_SRC; i++) {
2226
- if (node->src[i] &&
2227
- node->src[i]->data != graph_node_properties->src_address[i] &&
2228
- node->op != GGML_OP_VIEW
2229
- ) {
2230
- return false;
2231
- }
2232
- }
2233
- if (node->op == GGML_OP_SCALE &&
2234
- memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
2235
- return false;
2236
- }
2237
- return true;
2238
- }
2239
2092
 
2240
- /**
2241
- * @brief Check whether there is a cached CANN graph that matches the current ggml graph.
2242
- *
2243
- * This function iterates through the cached CANN graphs stored in the LRU cache and
2244
- * compares them against the given ggml computation graph. A match requires that the
2245
- * number of nodes is the same and that each node’s properties (operation type,
2246
- * dimensions, strides, inputs, and operation parameters) are identical.
2247
- *
2248
- * If a matching graph is found, it is promoted to the front of the LRU cache and the
2249
- * function returns true. Otherwise, the function returns false, indicating that a new
2250
- * CANN graph needs to be captured.
2251
- *
2252
- * @param cann_ctx The CANN backend context containing the graph cache.
2253
- * @param cgraph The current ggml computation graph.
2254
- * @return true if a matching cached graph exists; false otherwise.
2255
- */
2256
- static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
2257
- ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache;
2258
- for (auto &graph_ptr : lru_cache.cache_list) {
2259
- // Skip graphs with a different number of nodes.
2260
- if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
2261
- continue;
2262
- }
2263
-
2264
- // Check if all nodes match.
2265
- bool all_match = true;
2266
- for (int i = 0; i < cgraph->n_nodes; ++i) {
2267
- if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
2268
- all_match = false;
2269
- break;
2270
- }
2271
- }
2272
-
2273
- if (all_match) {
2274
- // update cache_list && renturn graph_ptr
2275
- lru_cache.move_to_front(graph_ptr);
2276
- return true;
2093
+ // CANN backend supports fusing ADD + RMS_NORM operations
2094
+ if ((ops.size() == 2) && ops.begin()[0] == GGML_OP_ADD && ops.begin()[1] == GGML_OP_RMS_NORM) {
2095
+ ggml_tensor * add_node = cgraph->nodes[node_idx];
2096
+ // TODO: support broadcast for ADD + RMS_NORM
2097
+ if (add_node->src[0]->ne[0] != add_node->src[1]->ne[0] || add_node->src[0]->ne[1] != add_node->src[1]->ne[1] ||
2098
+ add_node->src[0]->ne[2] != add_node->src[1]->ne[2] || add_node->src[0]->ne[3] != add_node->src[1]->ne[3]) {
2099
+ return false;
2277
2100
  }
2101
+ return true;
2278
2102
  }
2279
2103
 
2280
2104
  return false;
2281
2105
  }
2282
- #endif // USE_ACL_GRAPH
2283
2106
 
2284
2107
  /**
2285
2108
  * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
@@ -2289,26 +2112,41 @@ static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph *
2289
2112
  *
2290
2113
  * Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
2291
2114
  *
2292
- * @param cann_ctx The CANN backend context.
2293
- * @param cgraph The ggml computation graph.
2294
- * @param use_cann_graph Whether to use CANN graph execution.
2295
- * @param cann_graph_update_required Whether graph capture is needed due to graph changes.
2115
+ * @param cann_ctx The CANN backend context.
2116
+ * @param cgraph The ggml computation graph.
2117
+ * @param use_cann_graph Whether to use CANN graph execution.
2118
+ * @param cann_graph_capture_required Whether graph capture is needed due to graph changes.
2296
2119
  */
2297
- static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
2298
- bool & use_cann_graph, bool & cann_graph_update_required) {
2120
+ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
2121
+ ggml_cgraph * cgraph,
2122
+ bool use_cann_graph,
2123
+ bool cann_graph_capture_required) {
2299
2124
  #ifdef USE_ACL_GRAPH
2300
- ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
2301
- if (use_cann_graph && cann_graph_update_required) {
2125
+ if (use_cann_graph && cann_graph_capture_required) { // Begin CANN graph capture
2302
2126
  ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
2303
2127
  }
2304
- #endif // USE_ACL_GRAPH
2128
+ #endif // USE_ACL_GRAPH
2305
2129
  // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
2306
2130
  // With the use of CANN graphs, the execution will be performed by the graph launch.
2307
- if (!use_cann_graph || cann_graph_update_required) {
2131
+ static bool opt_fusion = parse_bool(get_env_as_lowercase("GGML_CANN_OPERATOR_FUSION").value_or(""));
2132
+
2133
+ if (!use_cann_graph || cann_graph_capture_required) {
2308
2134
  for (int i = 0; i < cgraph->n_nodes; i++) {
2309
2135
  ggml_tensor * node = cgraph->nodes[i];
2136
+ if (opt_fusion) {
2137
+ if (ggml_cann_can_fuse(cgraph, i, { GGML_OP_ADD, GGML_OP_RMS_NORM })) {
2138
+ ggml_cann_op_add_rms_norm_fused(*cann_ctx, node, cgraph->nodes[i + 1]);
2139
+ i++;
2140
+ continue;
2141
+ }
2142
+ }
2310
2143
 
2311
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2144
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
2145
+ node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2146
+ continue;
2147
+ }
2148
+
2149
+ if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
2312
2150
  continue;
2313
2151
  }
2314
2152
 
@@ -2321,18 +2159,20 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
2321
2159
  }
2322
2160
 
2323
2161
  #ifdef USE_ACL_GRAPH
2324
- if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
2325
- ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
2326
- }
2327
-
2328
2162
  if (use_cann_graph) {
2329
- // Execute graph
2163
+ GGML_ASSERT(!cann_ctx->graph_lru_cache.cache_list.empty());
2164
+ ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
2165
+
2166
+ if (cann_graph_capture_required) { // End CANN graph capture
2167
+ ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
2168
+ }
2169
+
2170
+ // Execute CANN graph
2330
2171
  ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
2331
2172
  }
2332
- #endif // USE_ACL_GRAPH
2173
+ #endif // USE_ACL_GRAPH
2333
2174
  }
2334
2175
 
2335
-
2336
2176
  /**
2337
2177
  * @brief Computes a computational graph using a CANN backend.
2338
2178
  *
@@ -2345,21 +2185,19 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
2345
2185
  * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
2346
2186
  * completes successfully, otherwise an appropriate error status.
2347
2187
  */
2348
- static enum ggml_status ggml_backend_cann_graph_compute(
2349
- ggml_backend_t backend, ggml_cgraph* cgraph) {
2350
- ggml_backend_cann_context* cann_ctx =
2351
- (ggml_backend_cann_context*)backend->context;
2188
+ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2189
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2352
2190
  ggml_cann_set_device(cann_ctx->device);
2353
2191
  g_nz_workspaces[cann_ctx->device].clear();
2354
2192
 
2355
2193
  // calculate rope cache for fist layer in current device.
2356
2194
  cann_ctx->rope_cache.cached = false;
2357
2195
 
2196
+ bool graph_capture_required = false;
2358
2197
  #ifdef USE_ACL_GRAPH
2359
2198
  bool use_cann_graph = true;
2360
- bool cann_graph_update_required = false;
2361
2199
 
2362
- static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
2200
+ static bool prefill_use_graph = parse_bool(get_env_as_lowercase("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
2363
2201
  if (!prefill_use_graph) {
2364
2202
  // Do not use acl_graph for prefill.
2365
2203
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -2380,22 +2218,17 @@ static enum ggml_status ggml_backend_cann_graph_compute(
2380
2218
 
2381
2219
  if (use_cann_graph) {
2382
2220
  // If no matching graph is found, the graph needs to be recaptured.
2383
- cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
2384
- if (cann_graph_update_required) {
2221
+ graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
2222
+ if (graph_capture_required) {
2385
2223
  // If no matching graph is found, add a new ACL graph.
2386
- add_lru_matched_graph_node_properties(cann_ctx, cgraph);
2224
+ ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
2225
+ cann_ctx->graph_lru_cache.push(new_graph);
2387
2226
  }
2388
2227
  }
2389
2228
  #else
2390
2229
  bool use_cann_graph = false;
2391
- bool cann_graph_update_required = false;
2392
2230
  #endif // USE_ACL_GRAPH
2393
- evaluate_and_capture_cann_graph(
2394
- cann_ctx,
2395
- cgraph,
2396
- use_cann_graph,
2397
- cann_graph_update_required
2398
- );
2231
+ evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, graph_capture_required);
2399
2232
 
2400
2233
  return GGML_STATUS_SUCCESS;
2401
2234
  }
@@ -2412,8 +2245,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
2412
2245
  * @return bool Returns true if the operation is supported by the backend,
2413
2246
  * otherwise false.
2414
2247
  */
2415
- static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2416
- const ggml_tensor* op) {
2248
+ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2417
2249
  switch (op->op) {
2418
2250
  case GGML_OP_UNARY:
2419
2251
  switch (ggml_get_unary_op(op)) {
@@ -2448,24 +2280,24 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2448
2280
  return false;
2449
2281
  }
2450
2282
  break;
2451
- case GGML_OP_MUL_MAT: {
2452
- switch (op->src[0]->type) {
2453
- case GGML_TYPE_F16:
2454
- case GGML_TYPE_F32:
2455
- return true;
2456
- case GGML_TYPE_Q8_0:
2457
- case GGML_TYPE_Q4_0:
2283
+ case GGML_OP_MUL_MAT:
2284
+ {
2285
+ switch (op->src[0]->type) {
2286
+ case GGML_TYPE_F16:
2287
+ case GGML_TYPE_F32:
2288
+ return true;
2289
+ case GGML_TYPE_Q8_0:
2290
+ case GGML_TYPE_Q4_0:
2458
2291
  #ifdef ASCEND_310P
2459
- // Q4 && Q8 per group is not support on 310p device
2460
- return false;
2292
+ // Q4 && Q8 per group is not support on 310p device
2293
+ return false;
2461
2294
  #endif
2462
- // only support contiguous for quantized types.
2463
- return ggml_is_contiguous(op->src[0]) &&
2464
- ggml_is_contiguous(op->src[1]);
2465
- default:
2466
- return false;
2295
+ // only support contiguous for quantized types.
2296
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2297
+ default:
2298
+ return false;
2299
+ }
2467
2300
  }
2468
- }
2469
2301
  case GGML_OP_MUL_MAT_ID:
2470
2302
  switch (op->src[0]->type) {
2471
2303
  case GGML_TYPE_F16:
@@ -2478,101 +2310,109 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2478
2310
  return false;
2479
2311
  #endif
2480
2312
  // only support contiguous for quantized types.
2481
- return ggml_is_contiguous(op->src[0]) &&
2482
- ggml_is_contiguous(op->src[1]);
2313
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2483
2314
  default:
2484
2315
  return false;
2485
2316
  }
2486
2317
  // embedding
2487
- case GGML_OP_GET_ROWS: {
2488
- switch (op->src[0]->type) {
2489
- case GGML_TYPE_F32:
2490
- case GGML_TYPE_F16:
2491
- case GGML_TYPE_Q8_0:
2492
- return true;
2493
- default:
2494
- return false;
2495
- }
2496
- } break;
2497
- case GGML_OP_SET_ROWS: {
2498
- switch (op->type) {
2499
- case GGML_TYPE_F32:
2500
- case GGML_TYPE_F16:
2501
- return true;
2502
- default:
2503
- return false;
2318
+ case GGML_OP_GET_ROWS:
2319
+ {
2320
+ switch (op->src[0]->type) {
2321
+ case GGML_TYPE_F32:
2322
+ case GGML_TYPE_F16:
2323
+ case GGML_TYPE_Q8_0:
2324
+ return true;
2325
+ default:
2326
+ return false;
2327
+ }
2504
2328
  }
2505
- } break;
2506
- case GGML_OP_CPY: {
2507
- ggml_tensor *src = op->src[0];
2508
- if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2509
- (src->type != GGML_TYPE_F32 &&
2510
- src->type != GGML_TYPE_F16)) {
2511
- // only support F32 and F16.
2512
- return false;
2329
+ break;
2330
+ case GGML_OP_SET_ROWS:
2331
+ {
2332
+ switch (op->type) {
2333
+ case GGML_TYPE_F32:
2334
+ case GGML_TYPE_F16:
2335
+ return true;
2336
+ default:
2337
+ return false;
2338
+ }
2513
2339
  }
2514
- return true;
2515
- } break;
2516
- case GGML_OP_CONT: {
2517
- // TODO: support GGML_TYPE_BF16
2518
- switch (op->src[0]->type) {
2519
- case GGML_TYPE_F32:
2520
- case GGML_TYPE_F16:
2521
- return true;
2522
- default:
2340
+ break;
2341
+ case GGML_OP_CPY:
2342
+ {
2343
+ ggml_tensor * src = op->src[0];
2344
+ if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2345
+ (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
2346
+ // only support F32 and F16.
2523
2347
  return false;
2348
+ }
2349
+ return true;
2524
2350
  }
2525
- }
2526
- case GGML_OP_ROPE: {
2527
- // TODO: with ops-test v == 1
2528
- // TODO: n_dims <= ne0
2529
- if (op->src[0]->ne[0] != op->op_params[1]) {
2530
- return false;
2531
- }
2532
-
2533
- const int mode = ((const int32_t *) op->op_params)[2];
2534
- if (mode & GGML_ROPE_TYPE_MROPE) {
2535
- return false;
2536
- }
2537
- if (mode & GGML_ROPE_TYPE_VISION) {
2538
- return false;
2351
+ break;
2352
+ case GGML_OP_CONT:
2353
+ {
2354
+ // TODO: support GGML_TYPE_BF16
2355
+ switch (op->src[0]->type) {
2356
+ case GGML_TYPE_F32:
2357
+ case GGML_TYPE_F16:
2358
+ return true;
2359
+ default:
2360
+ return false;
2361
+ }
2539
2362
  }
2363
+ case GGML_OP_ROPE:
2364
+ {
2365
+ if (op->src[0]->ne[0] > 896) {
2366
+ return false;
2367
+ }
2540
2368
  #ifdef ASCEND_310P
2541
- if(!ggml_is_contiguous(op->src[0])){
2542
- return false;
2543
- }
2369
+ // TODO: Support rope_dim < ne00(dim)
2370
+ if (op->src[0]->ne[0] != op->op_params[1]) {
2371
+ return false;
2372
+ }
2373
+ if (!ggml_is_contiguous(op->src[0])) {
2374
+ return false;
2375
+ }
2544
2376
  #endif
2545
- return true;
2546
- }
2547
- case GGML_OP_UPSCALE: {
2548
- // aclnnUpsampleNearest2dGetWorkspaceSize not support
2549
- // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
2550
- if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
2551
- return false;
2377
+ return true;
2552
2378
  }
2553
- if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2554
- return false;
2379
+ case GGML_OP_UPSCALE:
2380
+ {
2381
+ // aclnnUpsampleNearest2dGetWorkspaceSize not support
2382
+ // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
2383
+ if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
2384
+ return false;
2385
+ }
2386
+ if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2387
+ return false;
2388
+ }
2389
+ if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) {
2390
+ return false;
2391
+ }
2392
+ return true;
2555
2393
  }
2556
- return true;
2557
- }
2558
- case GGML_OP_POOL_2D: {
2559
- const int32_t * opts = (const int32_t *) op->op_params;
2394
+ case GGML_OP_POOL_2D:
2395
+ {
2396
+ const int32_t * opts = (const int32_t *) op->op_params;
2560
2397
  #ifdef ASCEND_310P
2561
- enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2562
- if(opt == GGML_OP_POOL_MAX){
2563
- return false;
2564
- }
2398
+ enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2399
+ if (opt == GGML_OP_POOL_MAX) {
2400
+ return false;
2401
+ }
2565
2402
  #endif
2566
- const int k0 = opts[1];
2567
- const int k1 = opts[2];
2568
- const int p0 = opts[5];
2569
- const int p1 = opts[6];
2570
- // value of paddingH should be at most half of kernelH
2571
- // value of paddingW should be at most half of kernelW
2572
- return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2573
- }
2574
- case GGML_OP_DUP:
2403
+ const int k0 = opts[1];
2404
+ const int k1 = opts[2];
2405
+ const int p0 = opts[5];
2406
+ const int p1 = opts[6];
2407
+ // value of paddingH should be at most half of kernelH
2408
+ // value of paddingW should be at most half of kernelW
2409
+ return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2410
+ }
2575
2411
  case GGML_OP_SUM:
2412
+ return ggml_is_contiguous_rows(op->src[0]);
2413
+ case GGML_OP_L2_NORM:
2414
+ case GGML_OP_CROSS_ENTROPY_LOSS:
2415
+ case GGML_OP_DUP:
2576
2416
  case GGML_OP_IM2COL:
2577
2417
  case GGML_OP_CONCAT:
2578
2418
  case GGML_OP_REPEAT:
@@ -2596,7 +2436,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2596
2436
  case GGML_OP_ARGSORT:
2597
2437
  case GGML_OP_ACC:
2598
2438
  case GGML_OP_GROUP_NORM:
2439
+ return true;
2599
2440
  case GGML_OP_PAD:
2441
+ // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
2442
+ return ggml_get_op_params_i32(op, 8) == 0;
2600
2443
  case GGML_OP_ARANGE:
2601
2444
  case GGML_OP_TIMESTEP_EMBEDDING:
2602
2445
  case GGML_OP_LEAKY_RELU:
@@ -2607,54 +2450,72 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2607
2450
  case GGML_OP_MEAN:
2608
2451
  case GGML_OP_PAD_REFLECT_1D:
2609
2452
  case GGML_OP_COUNT_EQUAL:
2453
+ case GGML_OP_GATED_LINEAR_ATTN:
2610
2454
  return true;
2455
+ case GGML_OP_OUT_PROD:
2456
+ {
2457
+ #ifdef ASCEND_310P
2458
+ // Ger is not supported on 310p device
2459
+ return false;
2460
+ #endif
2461
+ switch (op->src[0]->type) {
2462
+ case GGML_TYPE_F16:
2463
+ case GGML_TYPE_F32:
2464
+ return true;
2465
+ default:
2466
+ return false;
2467
+ }
2468
+ }
2611
2469
  case GGML_OP_CONV_TRANSPOSE_1D:
2612
- // TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
2613
- return (op->src[0]->ne[0] - 1) <= 255;
2470
+ return true;
2614
2471
  case GGML_OP_SCALE:
2615
2472
  float bias;
2616
- memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
2617
- return bias == 0.0f; // TODO: support bias != 0.0f
2473
+ memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
2474
+ return bias == 0.0f; // TODO: support bias != 0.0f
2618
2475
  case GGML_OP_SOFT_MAX:
2619
2476
  // TODO: support attention sinks [TAG_ATTN_SINKS]
2620
2477
  if (op->src[2]) {
2621
2478
  return false;
2622
2479
  }
2623
2480
  return true;
2624
- case GGML_OP_FLASH_ATTN_EXT:{
2481
+ case GGML_OP_FLASH_ATTN_EXT:
2482
+ {
2625
2483
  #ifdef ASCEND_310P
2626
- // FA not support on 310p device
2627
- return false;
2628
- #endif
2629
- // derived from [ggml-cuda.cu]
2630
- if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
2631
- return false;
2632
- }
2633
- if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
2634
- return false;
2635
- }
2636
- if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
2637
- return false;
2638
- }
2639
- // TODO: support attention sinks [TAG_ATTN_SINKS]
2640
- if (op->src[4]) {
2641
- return false;
2642
- }
2643
- if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2644
- // different head sizes of K and V are not supported yet
2645
- return false;
2646
- }
2647
- if (op->src[0]->ne[0] % 16 != 0) {
2648
- // TODO: padding to support
2649
- return false;
2650
- }
2651
- float logitSoftcap = 0.0f;
2652
- memcpy(&logitSoftcap, (const float *)(op->op_params) + 2, sizeof(float));
2653
- if(logitSoftcap != 0.0f) {
2484
+ // FA not support on 310p device
2654
2485
  return false;
2486
+ #endif
2487
+ // derived from [ggml-cuda.cu]
2488
+ if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
2489
+ return false;
2490
+ }
2491
+ if (op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 &&
2492
+ op->src[1]->type != GGML_TYPE_BF16) {
2493
+ return false;
2494
+ }
2495
+ if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
2496
+ return false;
2497
+ }
2498
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
2499
+ if (op->src[4]) {
2500
+ return false;
2501
+ }
2502
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2503
+ // different head sizes of K and V are not supported yet
2504
+ return false;
2505
+ }
2506
+ if (op->src[0]->ne[0] % 16 != 0) {
2507
+ // TODO: padding to support
2508
+ return false;
2509
+ }
2510
+ float logitSoftcap = 0.0f;
2511
+ memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
2512
+ if (logitSoftcap != 0.0f) {
2513
+ return false;
2514
+ }
2515
+ return true;
2655
2516
  }
2517
+ case GGML_OP_SSM_CONV:
2656
2518
  return true;
2657
- }
2658
2519
  default:
2659
2520
  return false;
2660
2521
  }
@@ -2662,43 +2523,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2662
2523
  GGML_UNUSED(dev);
2663
2524
  }
2664
2525
 
2665
- /**
2666
- * @brief Checks if the backend buffer type is associated with the CANN backend.
2667
- *
2668
- * This function checks whether the provided backend buffer type is associated
2669
- * with the CANN backend based on the comparison of its name retrieval function
2670
- * pointer.
2671
- *
2672
- * @param buft Pointer to the backend buffer type to check.
2673
- * @return bool Returns true if the buffer type is associated with the CANN
2674
- * backend, otherwise false.
2675
- */
2676
- static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
2677
- return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
2678
- }
2679
-
2680
- /**
2681
- * @brief Determines if a tensor operation should be offloaded to the CANN
2682
- * backend.
2683
- *
2684
- * This function checks if a given tensor operation should be offloaded to the
2685
- * CANN backend based on the operation type and the size of the tensor. It
2686
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
2687
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
2688
- *
2689
- * @param backend Pointer to the CANN backend.
2690
- * @param op Pointer to the tensor operation to check.
2691
- * @return bool Returns true if the operation should be offloaded, otherwise
2692
- * false.
2693
- */
2694
- static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
2695
- const ggml_tensor* op) {
2696
- const int min_batch_size = 32;
2697
- GGML_UNUSED(dev);
2698
-
2699
- return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
2700
- }
2701
-
2702
2526
  /**
2703
2527
  * @brief Records an event on the CANN backend stream.
2704
2528
  *
@@ -2708,9 +2532,8 @@ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
2708
2532
  * @param event Pointer to the event structure to be recorded.
2709
2533
  */
2710
2534
  static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
2711
- ggml_backend_cann_context* cann_ctx =
2712
- (ggml_backend_cann_context*)backend->context;
2713
- ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
2535
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2536
+ ACL_CHECK(aclrtRecordEvent((aclrtEvent) event->context, cann_ctx->stream()));
2714
2537
  }
2715
2538
 
2716
2539
  /**
@@ -2723,13 +2546,10 @@ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_
2723
2546
  * @param event Pointer to the event structure that the backend needs to wait
2724
2547
  * for.
2725
2548
  */
2726
- static void ggml_backend_cann_event_wait(ggml_backend_t backend,
2727
- ggml_backend_event_t event) {
2728
- ggml_backend_cann_context* cann_ctx =
2729
- (ggml_backend_cann_context*)backend->context;
2549
+ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
2550
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2730
2551
  if (ggml_backend_is_cann(backend)) {
2731
- ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
2732
- (aclrtEvent)event->context));
2552
+ ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent) event->context));
2733
2553
  } else {
2734
2554
  GGML_ABORT("fatal error");
2735
2555
  }
@@ -2768,30 +2588,31 @@ static const ggml_backend_i ggml_backend_cann_interface = {
2768
2588
  * @return A pointer to the static GUID.
2769
2589
  */
2770
2590
  static ggml_guid_t ggml_backend_cann_guid() {
2771
- static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
2772
- 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64};
2591
+ static ggml_guid guid = { 0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
2592
+ 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64 };
2773
2593
  return &guid;
2774
2594
  }
2775
2595
 
2776
2596
  // backend device
2777
2597
  struct ggml_backend_cann_device_context {
2778
- int device;
2598
+ int device;
2779
2599
  std::string name;
2780
2600
  std::string description;
2601
+ int op_offload_min_batch_size;
2781
2602
  };
2782
2603
 
2783
2604
  static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
2784
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2605
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2785
2606
  return ctx->name.c_str();
2786
2607
  }
2787
2608
 
2788
- static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
2789
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2609
+ static const char * ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
2610
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2790
2611
  return ctx->description.c_str();
2791
2612
  }
2792
2613
 
2793
2614
  static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2794
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2615
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2795
2616
  ggml_backend_cann_get_device_memory(ctx->device, free, total);
2796
2617
  }
2797
2618
 
@@ -2818,7 +2639,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
2818
2639
 
2819
2640
  static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
2820
2641
  GGML_UNUSED(params);
2821
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2642
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2822
2643
  return ggml_backend_cann_init(ctx->device);
2823
2644
  }
2824
2645
 
@@ -2835,19 +2656,17 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
2835
2656
  * @return bool Returns true if the CANN backend supports the buffer type,
2836
2657
  * otherwise false.
2837
2658
  */
2838
- static bool ggml_backend_cann_supports_buft(
2839
- ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2659
+ static bool ggml_backend_cann_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2840
2660
  if (ggml_backend_buft_is_cann(buft)) {
2841
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2842
- ggml_backend_cann_buffer_type_context * buft_ctx =
2843
- (ggml_backend_cann_buffer_type_context *)buft->context;
2661
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
2662
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
2844
2663
  return buft_ctx->device == dev_ctx->device;
2845
2664
  }
2846
2665
  return false;
2847
2666
  }
2848
2667
 
2849
2668
  static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
2850
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2669
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2851
2670
  return ggml_backend_cann_buffer_type(ctx->device);
2852
2671
  }
2853
2672
 
@@ -2856,6 +2675,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
2856
2675
  return ggml_backend_cann_host_buffer_type();
2857
2676
  }
2858
2677
 
2678
+ /**
2679
+ * @brief Determines if a tensor operation should be offloaded to the CANN
2680
+ * backend.
2681
+ *
2682
+ * This function checks if a given tensor operation should be offloaded to the
2683
+ * CANN backend based on the operation type and the size of the tensor. It
2684
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
2685
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
2686
+ *
2687
+ * @param backend Pointer to the CANN backend.
2688
+ * @param op Pointer to the tensor operation to check.
2689
+ * @return bool Returns true if the operation should be offloaded, otherwise
2690
+ * false.
2691
+ */
2692
+ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2693
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2694
+
2695
+ return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
2696
+ }
2697
+
2859
2698
  /**
2860
2699
  * @brief Creates a new event for the CANN backend device.
2861
2700
  *
@@ -2866,9 +2705,8 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
2866
2705
  * @param backend Pointer to the CANN backend.
2867
2706
  * @return ggml_backend_event_t Returns a pointer to the new event structure.
2868
2707
  */
2869
- static ggml_backend_event_t ggml_backend_cann_device_event_new(
2870
- ggml_backend_dev_t dev) {
2871
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2708
+ static ggml_backend_event_t ggml_backend_cann_device_event_new(ggml_backend_dev_t dev) {
2709
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
2872
2710
 
2873
2711
  ggml_cann_set_device(dev_ctx->device);
2874
2712
 
@@ -2890,7 +2728,7 @@ static ggml_backend_event_t ggml_backend_cann_device_event_new(
2890
2728
  * @param event Pointer to the event structure to be freed.
2891
2729
  */
2892
2730
  static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2893
- ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
2731
+ ACL_CHECK(aclrtDestroyEvent((aclrtEvent) event->context));
2894
2732
 
2895
2733
  delete event;
2896
2734
  GGML_UNUSED(dev);
@@ -2904,7 +2742,7 @@ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_bac
2904
2742
  * @param event Pointer to the event structure to be synchronized.
2905
2743
  */
2906
2744
  static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2907
- ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
2745
+ ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent) event->context));
2908
2746
 
2909
2747
  GGML_UNUSED(dev);
2910
2748
  }
@@ -2915,10 +2753,10 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2915
2753
  /* .get_memory = */ ggml_backend_cann_device_get_memory,
2916
2754
  /* .get_type = */ ggml_backend_cann_device_get_type,
2917
2755
  /* .get_props = */ ggml_backend_cann_device_get_props,
2918
- /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2756
+ /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2919
2757
  /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
2920
2758
  /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
2921
- /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2759
+ /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2922
2760
  /* .supports_op = */ ggml_backend_cann_supports_op,
2923
2761
  /* .supports_buft = */ ggml_backend_cann_supports_buft,
2924
2762
  /* .offload_op = */ ggml_backend_cann_offload_op,
@@ -2927,7 +2765,6 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2927
2765
  /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
2928
2766
  };
2929
2767
 
2930
-
2931
2768
  // backend reg
2932
2769
  struct ggml_backend_cann_reg_context {
2933
2770
  std::vector<ggml_backend_dev_t> devices;
@@ -2939,12 +2776,12 @@ static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
2939
2776
  }
2940
2777
 
2941
2778
  static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
2942
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2779
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
2943
2780
  return ctx->devices.size();
2944
2781
  }
2945
2782
 
2946
2783
  static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
2947
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2784
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
2948
2785
  GGML_ASSERT(index < ctx->devices.size());
2949
2786
  return ctx->devices[index];
2950
2787
  }
@@ -2966,34 +2803,32 @@ static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2966
2803
  // backend registry, called only once for cann backend
2967
2804
  ggml_backend_reg_t ggml_backend_cann_reg() {
2968
2805
  static ggml_backend_reg reg;
2969
- static bool initialized = false;
2806
+ static bool initialized = false;
2970
2807
 
2971
2808
  {
2972
- static std::mutex mutex;
2809
+ static std::mutex mutex;
2973
2810
  std::lock_guard<std::mutex> lock(mutex);
2974
2811
  if (!initialized) {
2975
2812
  aclInit(nullptr);
2976
2813
  ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
2814
+ const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
2977
2815
 
2978
2816
  for (int i = 0; i < ggml_cann_info().device_count; i++) {
2979
- ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
2980
- dev_ctx->description = aclrtGetSocName();
2981
- dev_ctx->device = i;
2982
- dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2817
+ ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
2818
+ dev_ctx->description = aclrtGetSocName();
2819
+ dev_ctx->device = i;
2820
+ dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2821
+ dev_ctx->op_offload_min_batch_size = min_batch_size;
2983
2822
  ggml_cann_set_device(i);
2984
- ggml_backend_dev_t dev = new ggml_backend_device {
2985
- /* .iface = */ ggml_backend_cann_device_interface,
2986
- /* .reg = */ &reg,
2987
- /* .context = */ dev_ctx
2988
- };
2823
+ ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
2824
+ /* .reg = */ &reg,
2825
+ /* .context = */ dev_ctx };
2989
2826
  ctx->devices.push_back(dev);
2990
2827
  }
2991
2828
 
2992
- reg = ggml_backend_reg {
2993
- /* .api_version = */ GGML_BACKEND_API_VERSION,
2994
- /* .iface = */ ggml_backend_cann_reg_interface,
2995
- /* .context = */ ctx
2996
- };
2829
+ reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
2830
+ /* .iface = */ ggml_backend_cann_reg_interface,
2831
+ /* .context = */ ctx };
2997
2832
  }
2998
2833
 
2999
2834
  initialized = true;
@@ -3009,39 +2844,36 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
3009
2844
  return nullptr;
3010
2845
  }
3011
2846
 
3012
- ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
2847
+ ggml_backend_cann_context * ctx = new ggml_backend_cann_context(device);
3013
2848
  if (ctx == nullptr) {
3014
2849
  GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
3015
2850
  return nullptr;
3016
2851
  }
3017
2852
  ggml_cann_set_device(ctx->device);
3018
2853
  ggml_backend_t cann_backend =
3019
- new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
3020
- /* .interface = */ ggml_backend_cann_interface,
3021
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
3022
- /* .context = */ ctx};
2854
+ new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(),
2855
+ /* .interface = */ ggml_backend_cann_interface,
2856
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
2857
+ /* .context = */ ctx };
3023
2858
 
3024
2859
  return cann_backend;
3025
2860
  }
3026
2861
 
3027
2862
  bool ggml_backend_is_cann(ggml_backend_t backend) {
3028
- return backend != NULL &&
3029
- ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2863
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
3030
2864
  }
3031
2865
 
3032
2866
  int32_t ggml_backend_cann_get_device_count() {
3033
2867
  return ggml_cann_info().device_count;
3034
2868
  }
3035
2869
 
3036
- void ggml_backend_cann_get_device_description(
3037
- int32_t device, char* description, size_t description_size) {
2870
+ void ggml_backend_cann_get_device_description(int32_t device, char * description, size_t description_size) {
3038
2871
  ggml_cann_set_device(device);
3039
- const char* soc_name = aclrtGetSocName();
2872
+ const char * soc_name = aclrtGetSocName();
3040
2873
  snprintf(description, description_size, "%s", soc_name);
3041
2874
  }
3042
2875
 
3043
- void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
3044
- size_t* total) {
2876
+ void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total) {
3045
2877
  ggml_cann_set_device(device);
3046
2878
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
3047
2879
  }