@fugood/llama.node 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +1 -1
  21. package/src/LlamaContext.cpp +81 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
@@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
211
211
  struct ggml_backend_cann_context {
212
212
  int32_t device; /**< Device ID. */
213
213
  std::string name; /**< Name of the device. */
214
+ std::string description; /**< Description of the device. */
214
215
  aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
215
216
 
216
- aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
217
- {nullptr}}; /**< Array of streams for the device. */
217
+ aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
218
218
 
219
219
  /**
220
220
  * @brief Constructor for initializing the context with a given device.
221
221
  * @param device Device ID.
222
222
  */
223
223
  explicit ggml_backend_cann_context(int device)
224
- : device(device), name("CANN" + std::to_string(device)) {}
224
+ : device(device), name("CANN" + std::to_string(device)) {
225
+ ggml_cann_set_device(device);
226
+ description = aclrtGetSocName();
227
+ }
225
228
 
226
229
  /**
227
230
  * @brief Destructor for cleaning up resources.
@@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
122
122
  ACL_CHECK(aclrtMemGetAllocationGranularity(
123
123
  &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
124
124
  &info.devices[id].vmm_granularity));
125
+
126
+ size_t free, total;
127
+ ggml_backend_cann_get_device_memory(id, &free, &total);
128
+ info.devices[id].total_vram = free;
125
129
  }
126
130
 
127
131
  // TODO: add more device info later.
@@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
208
212
  * @return A pointer to the allocated buffer.
209
213
  */
210
214
  void* alloc(size_t size, size_t* actual_size) override {
215
+ const size_t alignment = 128;
216
+ size = GGML_PAD(size, alignment);
217
+ if (size == 0) {
218
+ size = alignment;
219
+ }
211
220
  #ifdef DEBUG_CANN_MALLOC
212
221
  int nnz = 0;
213
222
  size_t max_size = 0;
@@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
246
255
  return ptr;
247
256
  }
248
257
  void* ptr;
249
- size_t look_ahead_size = (size_t)(1.05 * size);
250
- look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
251
258
  ggml_cann_set_device(device);
252
259
  ACL_CHECK(
253
- aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
254
- *actual_size = look_ahead_size;
255
- pool_size += look_ahead_size;
260
+ aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
261
+ *actual_size = size;
262
+ pool_size += size;
256
263
  #ifdef DEBUG_CANN_MALLOC
257
264
  GGML_LOG_INFO(
258
265
  "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
296
303
  /**
297
304
  * @brief The maximum size of the virtual memory pool (32 GB).
298
305
  */
299
- static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
306
+ size_t max_size;
300
307
 
301
308
  /**
302
309
  * @brief The device ID associated with this buffer pool.
@@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
341
348
  */
342
349
  explicit ggml_cann_pool_vmm(int device)
343
350
  : device(device),
344
- granularity(ggml_cann_info().devices[device].vmm_granularity) {}
351
+ granularity(ggml_cann_info().devices[device].vmm_granularity) {
352
+ auto dev = ggml_cann_info().devices[device];
353
+ granularity = dev.vmm_granularity;
354
+ max_size = dev.total_vram;
355
+ }
345
356
 
346
357
  /**
347
358
  * @brief Destructor to free all buffers in the virtual memory pool.
@@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
370
381
  // round up the allocation size to the alignment to ensure that all
371
382
  // allocations are aligned for all data types
372
383
  const size_t alignment = 128;
373
- size = alignment * ((size + alignment - 1) / alignment);
384
+ size = GGML_PAD(size, alignment);
385
+ if (size == 0) {
386
+ size = alignment;
387
+ }
374
388
 
375
389
  size_t avail = pool_size - pool_used;
376
390
 
377
391
  if (size > avail) {
378
392
  // round up to the next multiple of the granularity
379
393
  size_t reserve_size = size - avail;
380
- reserve_size =
381
- granularity * ((reserve_size + granularity - 1) / granularity);
394
+ reserve_size = GGML_PAD(reserve_size, granularity);
382
395
 
383
- GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
396
+ GGML_ASSERT(pool_size + reserve_size <= max_size);
384
397
 
385
398
  // allocate more physical memory
386
399
  aclrtPhysicalMemProp prop = {};
@@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
396
409
  // reserve virtual address space (if not already reserved)
397
410
  if (pool_addr == 0) {
398
411
  ACL_CHECK(aclrtReserveMemAddress(
399
- &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
412
+ &pool_addr, max_size, 0, NULL, 1));
400
413
  }
401
414
 
402
415
  // map at the end of the pool
@@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
409
422
  // add to the pool
410
423
  pool_size += reserve_size;
411
424
 
412
- // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
413
- // reserved %llu MB)\n",
414
- // device, (unsigned long long) (pool_size/1024/1024),
415
- // (unsigned long long) (reserve_size/1024/1024));
425
+ #ifdef DEBUG_CANN_MALLOC
426
+ GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
427
+ device, (unsigned long long) (pool_size/1024/1024),
428
+ (unsigned long long) (reserve_size/1024/1024));
429
+ #endif
416
430
  }
417
431
 
418
432
  GGML_ASSERT(pool_addr != 0);
@@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
457
471
  */
458
472
  std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
459
473
  int device) {
460
- // return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
461
474
  return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
462
475
  }
463
476
 
@@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
1130
1143
  static bool ggml_backend_cann_buffer_type_initialized = false;
1131
1144
 
1132
1145
  if (!ggml_backend_cann_buffer_type_initialized) {
1133
- for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
1146
+ for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
1134
1147
  ggml_backend_cann_buffer_types[i] = {
1135
1148
  /* .iface = */ ggml_backend_cann_buffer_type_interface,
1136
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
1149
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
1137
1150
  /* .context = */
1138
1151
  new ggml_backend_cann_buffer_type_context{
1139
1152
  i, "CANN" + std::to_string(i)},
@@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
1199
1212
  return nullptr;
1200
1213
  }
1201
1214
 
1215
+ const size_t alignment = 128;
1216
+ size = GGML_PAD(size, alignment);
1217
+ if (size == 0) {
1218
+ size = alignment;
1219
+ }
1220
+
1202
1221
  void * hostPtr = nullptr;
1203
1222
  aclError err = aclrtMallocHost((void **) &hostPtr, size);
1204
1223
  if (err != ACL_SUCCESS) {
1205
-
1206
1224
  GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1207
1225
  size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1208
1226
  return nullptr;
@@ -1669,12 +1687,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1669
1687
  }
1670
1688
  case GGML_OP_MUL_MAT: {
1671
1689
  switch (op->src[0]->type) {
1672
- case GGML_TYPE_F16:
1673
- case GGML_TYPE_F32:
1674
1690
  case GGML_TYPE_Q8_0:
1675
- // TODO: fix me
1676
1691
  // Current groupsize should not be greater than k-1 in
1677
- // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
1692
+ // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
1693
+ if (op->src[0]->ne[0] <= QK8_0) {
1694
+ return false;
1695
+ }
1696
+ case GGML_TYPE_F16:
1697
+ case GGML_TYPE_F32:
1678
1698
  case GGML_TYPE_Q4_0:
1679
1699
  return true;
1680
1700
  default:
@@ -1706,9 +1726,50 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1706
1726
  return false;
1707
1727
  }
1708
1728
  }
1729
+ case GGML_OP_CONT: {
1730
+ // TODO: support GGML_TYPE_BF16
1731
+ switch (op->src[0]->type) {
1732
+ case GGML_TYPE_F32:
1733
+ case GGML_TYPE_F16:
1734
+ return true;
1735
+ default:
1736
+ return false;
1737
+ }
1738
+ }
1739
+ case GGML_OP_ROPE: {
1740
+ // TODO: with ops-test v == 1
1741
+ float * ext_factor = (float*)((int32_t*)op->op_params + 7);
1742
+ // TODO: n_dims <= ne0
1743
+ if (op->src[0]->ne[0] != op->op_params[1]) {
1744
+ return false;
1745
+ }
1746
+ // TODO: ext_factor != 0
1747
+ if (*ext_factor != 0) {
1748
+ return false;
1749
+ }
1750
+
1751
+ const int mode = ((const int32_t *) op->op_params)[2];
1752
+ if (mode & GGML_ROPE_TYPE_MROPE) {
1753
+ return false;
1754
+ }
1755
+ if (mode & GGML_ROPE_TYPE_VISION) {
1756
+ return false;
1757
+ }
1758
+
1759
+ return true;
1760
+ }
1761
+ case GGML_OP_UPSCALE: {
1762
+ // aclnnUpsampleNearest2dGetWorkspaceSize not support
1763
+ // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
1764
+ if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
1765
+ return false;
1766
+ }
1767
+ return true;
1768
+ }
1769
+ case GGML_OP_IM2COL:
1770
+ case GGML_OP_CONCAT:
1709
1771
  case GGML_OP_DUP:
1710
1772
  case GGML_OP_REPEAT:
1711
- case GGML_OP_CONCAT:
1712
1773
  case GGML_OP_NONE:
1713
1774
  case GGML_OP_RESHAPE:
1714
1775
  case GGML_OP_VIEW:
@@ -1722,17 +1783,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1722
1783
  case GGML_OP_SCALE:
1723
1784
  case GGML_OP_SQR:
1724
1785
  case GGML_OP_CLAMP:
1725
- case GGML_OP_CONT:
1726
1786
  case GGML_OP_DIAG_MASK_INF:
1727
1787
  case GGML_OP_SOFT_MAX:
1728
- case GGML_OP_ROPE:
1729
- case GGML_OP_IM2COL:
1730
1788
  case GGML_OP_POOL_2D:
1731
1789
  case GGML_OP_SUM_ROWS:
1732
1790
  case GGML_OP_ARGSORT:
1733
1791
  case GGML_OP_ACC:
1734
1792
  case GGML_OP_GROUP_NORM:
1735
- case GGML_OP_UPSCALE:
1736
1793
  case GGML_OP_PAD:
1737
1794
  case GGML_OP_ARANGE:
1738
1795
  case GGML_OP_TIMESTEP_EMBEDDING:
@@ -2041,7 +2098,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con
2041
2098
  static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2042
2099
  /* .get_name = */ ggml_backend_cann_reg_get_name,
2043
2100
  /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
2044
- /* .get_device_get = */ ggml_backend_cann_reg_get_device,
2101
+ /* .get_device = */ ggml_backend_cann_reg_get_device,
2045
2102
  /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
2046
2103
  };
2047
2104
 
@@ -2064,16 +2121,17 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
2064
2121
  dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2065
2122
  ggml_cann_set_device(i);
2066
2123
  ggml_backend_dev_t dev = new ggml_backend_device {
2067
- /* .interface = */ ggml_backend_cann_device_interface,
2068
- /* .reg = */ &reg,
2069
- /* .context = */ dev_ctx
2124
+ /* .iface = */ ggml_backend_cann_device_interface,
2125
+ /* .reg = */ &reg,
2126
+ /* .context = */ dev_ctx
2070
2127
  };
2071
2128
  ctx->devices.push_back(dev);
2072
2129
  }
2073
2130
 
2074
2131
  reg = ggml_backend_reg {
2075
- /* .interface = */ ggml_backend_cann_reg_interface,
2076
- /* .context = */ ctx
2132
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
2133
+ /* .iface = */ ggml_backend_cann_reg_interface,
2134
+ /* .context = */ ctx
2077
2135
  };
2078
2136
  }
2079
2137
 
@@ -2126,3 +2184,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2126
2184
  ggml_cann_set_device(device);
2127
2185
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
2128
2186
  }
2187
+
2188
+ GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)
@@ -1,7 +1,3 @@
1
- if (NOT SOC_TYPE)
2
- set (SOC_TYPE "Ascend910B3")
3
- endif()
4
-
5
1
  file(GLOB SRC_FILES
6
2
  get_row_f32.cpp
7
3
  get_row_f16.cpp
@@ -13,7 +9,6 @@ file(GLOB SRC_FILES
13
9
  dup.cpp
14
10
  )
15
11
 
16
- string(TOLOWER ${SOC_TYPE} SOC_VERSION)
17
12
  set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
18
13
  set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
19
14
 
@@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
30
25
  ${SRC_FILES}
31
26
  )
32
27
 
28
+ message(STATUS "CANN: compile ascend kernels with SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
29
+ ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
33
30
  # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
@@ -5,6 +5,7 @@
5
5
  using namespace AscendC;
6
6
 
7
7
  #define BUFFER_NUM 2
8
+ const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supported by dup kernel is 65535
8
9
 
9
10
  template <typename SRC_T, typename DST_T>
10
11
  class DupByRows {
@@ -51,24 +52,36 @@ class DupByRows {
51
52
 
52
53
  __aicore__ inline void copy_in() {
53
54
  LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
54
-
55
- DataCopyExtParams dataCopyParams;
56
- dataCopyParams.blockCount = 1;
57
- dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
58
- DataCopyPadExtParams<SRC_T> padParams;
59
- DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
60
-
55
+ const size_t elem_per_block = 32 / sizeof(SRC_T);
56
+ size_t tail = num_elem % elem_per_block;
57
+ size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
58
+ DataCopy(src_local, src_gm, cpy_elements_len);
61
59
  src_queue.EnQue(src_local);
62
60
  }
63
61
 
64
62
  __aicore__ inline void copy_out() {
65
63
  LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
66
-
64
+ #ifdef ASCEND_310P
65
+ const size_t elem_per_block = 32 / sizeof(DST_T);
66
+ size_t tail = num_elem % elem_per_block;
67
+ size_t len = num_elem & ~(elem_per_block - 1);
68
+ if (len > 0) {
69
+ DataCopy(dst_gm, dst_local, len);
70
+ }
71
+ if(tail != 0) {
72
+ for (size_t i = tail; i < elem_per_block; i++) {
73
+ dst_local[len + i].SetValue(0, 0);
74
+ }
75
+ SetAtomicAdd<float>();
76
+ DataCopy(dst_gm[len], dst_local[len], elem_per_block);
77
+ SetAtomicNone();
78
+ }
79
+ #else
67
80
  DataCopyExtParams dataCopyParams;
68
81
  dataCopyParams.blockCount = 1;
69
82
  dataCopyParams.blockLen = num_elem * sizeof(DST_T);
70
83
  DataCopyPad(dst_gm, dst_local, dataCopyParams);
71
-
84
+ #endif
72
85
  dst_queue.FreeTensor(dst_local);
73
86
  }
74
87
 
@@ -14,7 +14,7 @@ class GET_ROW_F16 {
14
14
  int64_t *output_ne_ub, size_t *output_nb_ub) {
15
15
  // TODO, use template for F16/f32
16
16
  int64_t op_block_num = GetBlockNum();
17
- int64_t op_block_idx = GetBlockIdx();
17
+ op_block_idx = GetBlockIdx();
18
18
 
19
19
  for (int i = 0; i < 4; i++) {
20
20
  input_ne[i] = input_ne_ub[i];
@@ -59,32 +59,42 @@ class GET_ROW_F16 {
59
59
  }
60
60
 
61
61
  __aicore__ inline void copy_in(uint32_t offset, size_t len) {
62
+ size_t origin_len = len;
62
63
  LocalTensor<half> input_local = input_queue.AllocTensor<half>();
63
- size_t tail = len % 32;
64
- len = len & ~31;
65
- DataCopy(input_local, input_gm[offset], len);
64
+ const size_t elem_per_block = 32 / sizeof(half);
65
+ size_t tail = len % elem_per_block;
66
+ len = len & ~(elem_per_block - 1);
66
67
  if(tail != 0) {
67
- DataCopyExtParams dataCopyParams;
68
- dataCopyParams.blockCount = 1;
69
- dataCopyParams.blockLen = tail * sizeof(half);
70
- DataCopyPadExtParams<half> padParams;
71
- DataCopyPad(input_local[len], input_gm[offset + len],
72
- dataCopyParams, padParams);
68
+ len += elem_per_block;
73
69
  }
70
+ DataCopy(input_local, input_gm[offset], len);
74
71
  input_queue.EnQue(input_local);
75
72
  }
76
73
 
77
74
  __aicore__ inline void copy_out(uint32_t offset, size_t len) {
78
75
  LocalTensor<float> output_local = output_queue.DeQue<float>();
79
- size_t tail = len % 32;
80
- len = len & ~31;
81
- DataCopy(output_gm[offset], output_local, len);
76
+ const size_t elem_per_block = 32 / sizeof(float);
77
+ size_t tail = len % elem_per_block;
78
+ len = len & ~(elem_per_block - 1);
79
+ if (len > 0) {
80
+ DataCopy(output_gm[offset], output_local, len);
81
+ }
82
+
82
83
  if(tail != 0) {
84
+ #ifdef ASCEND_310P
85
+ for (size_t i = tail; i < elem_per_block; i++) {
86
+ output_local[len + i].SetValue(0, 0);
87
+ }
88
+ SetAtomicAdd<float>();
89
+ DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
90
+ SetAtomicNone();
91
+ #else
83
92
  DataCopyExtParams dataCopyParams;
84
93
  dataCopyParams.blockCount = 1;
85
94
  dataCopyParams.blockLen = tail * sizeof(float);
86
95
  DataCopyPad(output_gm[offset + len], output_local[len],
87
96
  dataCopyParams);
97
+ #endif
88
98
  }
89
99
  output_queue.FreeTensor(output_local);
90
100
  }
@@ -150,6 +160,7 @@ class GET_ROW_F16 {
150
160
  GlobalTensor<float> output_gm;
151
161
  TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
152
162
  TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
163
+ int64_t op_block_idx;
153
164
  };
154
165
 
155
166
  template <typename T>
@@ -13,7 +13,7 @@ class GET_ROW_F32 {
13
13
  int64_t *indices_ne_ub, size_t *indices_nb_ub,
14
14
  int64_t *output_ne_ub, size_t *output_nb_ub) {
15
15
  int64_t op_block_num = GetBlockNum();
16
- int64_t op_block_idx = GetBlockIdx();
16
+ op_block_idx = GetBlockIdx();
17
17
 
18
18
  for (int i = 0; i < 4; i++) {
19
19
  input_ne[i] = input_ne_ub[i];
@@ -55,31 +55,40 @@ class GET_ROW_F32 {
55
55
 
56
56
  __aicore__ inline void copy_in(uint32_t offset, size_t len) {
57
57
  LocalTensor<float> input_local = input_queue.AllocTensor<float>();
58
- size_t tail = len % 32;
59
- len = len & ~31;
60
- DataCopy(input_local, input_gm[offset], len);
58
+ const size_t elem_per_block = 32 / sizeof(float);
59
+ size_t tail = len % elem_per_block;
60
+ len = len & ~(elem_per_block - 1);
61
61
  if(tail != 0) {
62
- DataCopyExtParams dataCopyParams;
63
- dataCopyParams.blockCount = 1;
64
- dataCopyParams.blockLen = tail * sizeof(float);
65
- DataCopyPadExtParams<float> padParams;
66
- DataCopyPad(input_local[len], input_gm[offset + len],
67
- dataCopyParams, padParams);
62
+ len += elem_per_block;
68
63
  }
64
+ DataCopy(input_local, input_gm[offset], len);
69
65
  input_queue.EnQue(input_local);
70
66
  }
71
67
 
72
68
  __aicore__ inline void copy_out(uint32_t offset, size_t len) {
73
69
  LocalTensor<float> output_local = output_queue.DeQue<float>();
74
- size_t tail = len % 32;
75
- len = len & ~31;
76
- DataCopy(output_gm[offset], output_local, len);
70
+ const size_t elem_per_block = 32 / sizeof(float);
71
+ size_t tail = len % elem_per_block;
72
+ len = len & ~(elem_per_block - 1);
73
+ if (len > 0) {
74
+ DataCopy(output_gm[offset], output_local, len);
75
+ }
76
+
77
77
  if(tail != 0) {
78
+ #ifdef ASCEND_310P
79
+ for (size_t i = tail; i < elem_per_block; i++) {
80
+ output_local[len + i].SetValue(0, 0);
81
+ }
82
+ SetAtomicAdd<float>();
83
+ DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
84
+ SetAtomicNone();
85
+ #else
78
86
  DataCopyExtParams dataCopyParams;
79
87
  dataCopyParams.blockCount = 1;
80
88
  dataCopyParams.blockLen = tail * sizeof(float);
81
89
  DataCopyPad(output_gm[offset + len], output_local[len],
82
90
  dataCopyParams);
91
+ #endif
83
92
  }
84
93
  output_queue.FreeTensor(output_local);
85
94
  }
@@ -144,6 +153,7 @@ class GET_ROW_F32 {
144
153
  GlobalTensor<float> output_gm;
145
154
  TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
146
155
  TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
156
+ int64_t op_block_idx;
147
157
  };
148
158
 
149
159
  template <typename T>
@@ -2,6 +2,15 @@
2
2
 
3
3
  // optimize me. Use template to avoid copy code.
4
4
  using namespace AscendC;
5
+ #ifdef ASCEND_310P // 310P not support 4bit get row
6
+ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
7
+ GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
8
+ GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
9
+ GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
10
+ // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
11
+ printf("Ascend310P not support 4bit get row.\n");
12
+ }
13
+ #else
5
14
 
6
15
  #define BUFFER_NUM 2
7
16
 
@@ -191,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
191
200
  indices_nb_ub, output_ne_ub, output_nb_ub);
192
201
  op.calculate();
193
202
  }
203
+
204
+ #endif // #ifdef ASCEND_310P
@@ -1,6 +1,14 @@
1
1
  #include "kernel_operator.h"
2
2
 
3
3
  using namespace AscendC;
4
+ #ifdef ASCEND_310P
5
+ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
6
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
+ // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
9
+ printf("Ascend310P not support f16->8bit quantization.\n");
10
+ }
11
+ #else
4
12
 
5
13
  #define BUFFER_NUM 2
6
14
  #define QK8_0 32
@@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
206
214
  op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
207
215
  op.calculate();
208
216
  }
217
+
218
+ #endif // #ifdef ASCEND_310P
@@ -1,6 +1,14 @@
1
1
  #include "kernel_operator.h"
2
2
 
3
3
  using namespace AscendC;
4
+ #ifdef ASCEND_310P // 310P not support f32->8bit quantization
5
+ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
6
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
+ // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
9
+ printf("Ascend310P not support f32->8bit quantization.\n");
10
+ }
11
+ #else
4
12
 
5
13
  #define BUFFER_NUM 2
6
14
  #define QK8_0 32
@@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
204
212
  op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
205
213
  op.calculate();
206
214
  }
215
+
216
+ #endif // #ifdef ASCEND_310P
@@ -1,6 +1,21 @@
1
1
  #include "kernel_operator.h"
2
2
 
3
3
  using namespace AscendC;
4
+ #ifdef ASCEND_310P // 310P not support float->4bit quantization
5
+ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
6
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
+ // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
9
+ printf("Ascend310P not support f32->4bit quantization.\n");
10
+ }
11
+
12
+ extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
13
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
14
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
15
+ // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
16
+ printf("Ascend310P not support f16->4bit quantization.\n");
17
+ }
18
+ #else
4
19
 
5
20
  #define BUFFER_NUM 2
6
21
  #define Group_Size 32
@@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
276
291
  op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
277
292
  op.calculate();
278
293
  }
294
+
295
+ #endif // #ifdef ASCEND_310P