@fugood/llama.node 0.4.7 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +4 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.ts +66 -6
- package/lib/index.js +59 -17
- package/lib/index.ts +74 -23
- package/package.json +1 -1
- package/src/DecodeAudioTokenWorker.cpp +40 -0
- package/src/DecodeAudioTokenWorker.h +22 -0
- package/src/EmbeddingWorker.cpp +7 -5
- package/src/LlamaCompletionWorker.cpp +68 -54
- package/src/LlamaCompletionWorker.h +7 -8
- package/src/LlamaContext.cpp +551 -235
- package/src/LlamaContext.h +26 -4
- package/src/LoadSessionWorker.cpp +4 -2
- package/src/SaveSessionWorker.cpp +10 -6
- package/src/TokenizeWorker.cpp +23 -14
- package/src/TokenizeWorker.h +2 -2
- package/src/addons.cc +8 -11
- package/src/common.hpp +129 -126
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
- package/src/tts_utils.cpp +342 -0
- package/src/tts_utils.h +62 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp

@@ -65,6 +65,7 @@
 #include <aclnnop/aclnn_eq_tensor.h>
 #include <aclnnop/aclnn_gt_scalar.h>
 #include <aclnnop/aclnn_pow.h>
+#include <aclnnop/aclnn_grouped_matmul_v2.h>
 #include <float.h>
 
 #include <cmath>
@@ -2587,3 +2588,276 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
     ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
 }
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * floating-point precision using the CANN backend.
+ *
+ * This function executes a matrix multiplication operation tailored for
+ * Mixture of Experts (MoE) models, where the input tensor is multiplied
+ * with expert-specific weight matrices. It uses the CANN backend for
+ * efficient computation and stores the result in the destination tensor `dst`.
+ * The operation may leverage identity-based optimizations or routing masks
+ * as part of sparse expert selection.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the MoE multiplication result
+ * will be stored.
+ *
+ * @note This function assumes floating-point data types and is designed for
+ * MoE architectures, possibly involving sparse expert routing.
+ */
+static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    //dst   [M, K, N, 1]
+    ggml_tensor * src0 = dst->src[0];  //src0  [D, M, A, 1]
+    ggml_tensor * src1 = dst->src[1];  //src1  [D, B, N, 1], B = K or B = 1
+    ggml_tensor * ids  = dst->src[2];  //ids   [K, N]
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // copy index from npu to cpu
+    int64_t n_as  = ne02;       // A
+    int64_t n_ids = ids->ne[0]; // K
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+                           ACL_MEMCPY_DEVICE_TO_HOST);
+    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *) dst->data;
+    size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
+
+    // src0 is F16, src1 is F32, dst is F32
+    ggml_cann_pool_alloc src0_cast_allocator;
+    if (src0->type == GGML_TYPE_F16) {
+        src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
+        void* src0_cast_buf = src0_cast_allocator.get();
+
+        size_t cast_nb[GGML_MAX_DIMS];
+        cast_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
+        }
+
+        aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
+        aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
+            ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
+        GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
+        ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
+
+        src0_original = (char *) src0_cast_buf;
+        memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
+    }
+
+    std::vector<aclTensor*> src0_tensor_vec;
+    std::vector<aclTensor*> src1_tensor_vec;
+    std::vector<aclTensor*> dst_tensor_vec;
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // src0_row [M, D] -> weight && permute
+            int64_t src0_ne[2] = {ne01, ne00};
+            size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
+            // src1_row [D, 1] -> input
+            int64_t src1_ne[2] = {ne10, 1};
+            size_t src1_nb[2] = {nb10, nb11};
+            // dst_row [M, 1] -> out
+            int64_t dst_ne[2] = {ne0, 1};
+            size_t dst_nb[2] = {nb0, nb1};
+
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
+
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                src0_ne, src0_nb, 2);
+            aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                src1_ne, src1_nb, 2);
+            aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                dst_ne, dst_nb, 2);
+
+            src0_tensor_vec.push_back(acl_src0);
+            src1_tensor_vec.push_back(acl_src1);
+            dst_tensor_vec.push_back(acl_dst);
+        }
+    }
+
+    size_t GROUP_SIZE = 128;
+    // GroupedMatmulV2 required tensor_list.size < 128
+    for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
+        // split and call GroupedMatmulV2
+        size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
+        std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
+        std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
+        std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
+
+        aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
+        aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
+        aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
+
+        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+            nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
+
+        ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
+    }
+    return;
+}
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * quantized precision using the CANN backend.
+ *
+ * This function executes a matrix multiplication operation tailored for
+ * Mixture of Experts (MoE) models, where the input tensor is multiplied
+ * with expert-specific quantized weight matrices. It leverages the CANN
+ * backend to perform efficient low-precision computations and stores the
+ * quantized result in the destination tensor `dst`.
+ *
+ * Quantization techniques reduce memory footprint and improve performance
+ * by using lower-bit representations (e.g., int8) instead of floating-point.
+ * This function is designed to work with such formats and may incorporate
+ * optimizations like identity-based fast paths or routing masks for sparse
+ * expert selection.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the quantized MoE multiplication result
+ * will be stored.
+ *
+ * @note This function assumes quantized data types and is designed for
+ * MoE architectures with potential sparse expert routing.
+ */
+static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    // TODO: Use aclnnGroupedMatMul
+    //dst   [M, K, N, 1]
+    ggml_tensor * src0 = dst->src[0];  //src0  [D, M, A, 1]
+    ggml_tensor * src1 = dst->src[1];  //src1  [D, B, N, 1], B = K or B = 1
+    ggml_tensor * ids  = dst->src[2];  //ids   [K, N]
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // copy index from npu to cpu
+    int64_t n_as  = ne02;       // A
+    int64_t n_ids = ids->ne[0]; // K
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+                           ACL_MEMCPY_DEVICE_TO_HOST);
+    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *) dst->data;
+
+    ggml_tensor src0_row = *src0;
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row  = *dst;
+
+    const enum ggml_type type = dst->src[0]->type;
+    float weight_elem_size;
+    if (type == GGML_TYPE_Q4_0) {
+        weight_elem_size = float(sizeof(uint8_t)) / 2;
+    } else if (type == GGML_TYPE_Q8_0) {
+        weight_elem_size = float(sizeof(uint8_t));
+    } else {
+        GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
+    }
+
+    // src0_row [D, M, 1, 1] weight without permute
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[0] = weight_elem_size;
+    src0_row.nb[1] = weight_elem_size * ne00;
+    src0_row.nb[2] = weight_elem_size * ne00;
+    src0_row.nb[3] = weight_elem_size * ne00;
+    size_t weight_stride = ne00 * ne01 * weight_elem_size;
+    size_t weight_size   = weight_stride * ne02 * ne03;
+
+    // scale [D, M, 1, 1] -> scale && permute
+    size_t scale_elem_size = sizeof(uint16_t);
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+
+    // src1_row [D, 1, 1, 1] -> input
+    src1_row.ne[1] = 1;
+    src1_row.ne[2] = 1;
+    src1_row.ne[3] = 1;
+    src1_row.nb[2] = nb11;
+    src1_row.nb[3] = nb11;
+
+    // dst_row [M, 1, 1, 1] -> out
+    dst_row.ne[1] = 1;
+    dst_row.ne[2] = 1;
+    dst_row.ne[3] = 1;
+    dst_row.nb[2] = nb1;
+    dst_row.nb[3] = nb1;
+
+    //create weight for one row
+    ggml_cann_pool_alloc weight_allocator(ctx.pool());
+    void* weight_buffer = weight_allocator.alloc(nb02);
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr  = src0_original + i02*weight_stride;
+            void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
+            void* src1_tmp_ptr  = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr   = dst_original  + i1*nb1   + i2*nb2;
+
+            // mem cpy
+            ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE);
+            void* scale_buffer = (char*)weight_buffer + weight_stride;
+            ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE);
+
+            src0_row.data = weight_buffer;
+            src1_row.data = src1_tmp_ptr;
+            dst_row.data  = dst_tmp_ptr;
+            dst_row.src[0] = &src0_row;
+            dst_row.src[1] = &src1_row;
+
+            ggml_cann_mul_mat(ctx, &dst_row);
+        }
+    }
+    return;
+}
+
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    const enum ggml_type type = dst->src[0]->type;
+    switch (type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+            ggml_cann_mul_mat_id_fp(ctx, dst);
+            break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q8_0:
+            ggml_cann_mul_mat_id_quant(ctx, dst);
+            break;
+        default:
+            GGML_ABORT("Unsupported type for mul_mat_id");
+            break;
+    }
+}
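The floating-point path above collects one aclTensor per (token, slot) pair and then dispatches GroupedMatmulV2 in chunks, since (per the in-code comment) the operator's tensor-list size is capped at 128. The standalone sketch below is illustrative only and not part of the diff; it shows the same split-into-groups loop in isolation.

// Minimal sketch of the chunking used before each GroupedMatmulV2 call:
// process a list of work items in groups of at most GROUP_SIZE.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const size_t GROUP_SIZE = 128;   // limit taken from the diff's comment
    std::vector<int> items(300);     // stand-in for the aclTensor* vectors
    for (size_t i = 0; i < items.size(); i += GROUP_SIZE) {
        const size_t end = std::min(i + GROUP_SIZE, items.size());
        // a real kernel would build tensor lists from [i, end) and launch once per chunk
        std::printf("chunk [%zu, %zu) -> %zu items\n", i, end, end - i);
    }
    return 0;
}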
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h

@@ -978,6 +978,33 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
     }
 }
 
+/**
+ * @brief Performs sparse expert-based matrix multiplication using the CANN backend.
+ *
+ * @details This function implements a MoE-style batched matrix multiplication, where each input token
+ * is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
+ * in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
+ *
+ * For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
+ * performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
+ * and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
+ *
+ * Dimensions:
+ *   - src0: [D, M, A, 1], where A is the number of experts
+ *   - src1: [D, B, N, 1], where N is batch size and B is the slot count per sample
+ *   - ids : [K, N], where K is the number of experts each token is routed to
+ *   - dst : [M, K, N, 1], output tensor storing the result of expert × token multiplication
+ *
+ * The function handles two main modes:
+ *   - If `ne12 == 1`, a simpler per-token loop is used.
+ *   - TODO: If `ne12 > 1`, grouped multiplication and memory copying is used for efficiency.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the expert-weighted token outputs are stored.
+ *            Expected to be of shape [M, K, N, 1].
+ */
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
  * @brief Applies a element-wise operation to two input tensors using the CANN
  * backend.
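As a plain-CPU illustration of the routing described in the comment above, the following standalone sketch (not from the package; names, shapes, and layouts are simplified assumptions) computes the same result contract: for each token n and routed slot k, ids[k, n] picks one [D, M] expert matrix from src0, which multiplies the token's input vector from src1 (broadcast when B = 1) into dst[:, k, n].

// Hypothetical CPU reference for MUL_MAT_ID routing.
// Shapes follow the diff's comments: src0 [D, M, A], src1 [D, B, N] with
// B == K or B == 1 (broadcast), ids [K, N], dst [M, K, N].
#include <cassert>
#include <cstdio>
#include <vector>

static void mul_mat_id_ref(const std::vector<float>& src0, // D*M*A, expert weights
                           const std::vector<float>& src1, // D*B*N, token inputs
                           const std::vector<int>&   ids,  // K*N, expert index per slot
                           std::vector<float>&       dst,  // M*K*N, output
                           int D, int M, int A, int B, int N, int K) {
    for (int n = 0; n < N; ++n) {
        for (int k = 0; k < K; ++k) {
            const int e = ids[k + n*K];          // expert chosen for this slot
            assert(e >= 0 && e < A);
            const int b = (B == 1) ? 0 : k;      // broadcast the input when B == 1
            for (int m = 0; m < M; ++m) {
                float acc = 0.0f;
                for (int d = 0; d < D; ++d) {
                    acc += src0[d + m*D + e*D*M] * src1[d + b*D + n*D*B];
                }
                dst[m + k*M + n*M*K] = acc;
            }
        }
    }
}

int main() {
    const int D = 4, M = 3, A = 2, B = 1, N = 2, K = 2;
    std::vector<float> src0(D*M*A, 1.0f), src1(D*B*N, 0.5f), dst(M*K*N, 0.0f);
    std::vector<int> ids = {0, 1, 1, 0};         // [K, N]
    mul_mat_id_ref(src0, src1, ids, dst, D, M, A, B, N, K);
    std::printf("dst[0] = %f\n", dst[0]);        // 4 * (1.0 * 0.5) = 2.0
    return 0;
}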
package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp

@@ -1672,7 +1672,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_mul_mat(ctx, dst);
             break;
         case GGML_OP_MUL_MAT_ID:
-
+            ggml_cann_mul_mat_id(ctx, dst);
+            break;
         case GGML_OP_SCALE:
             ggml_cann_scale(ctx, dst);
             break;
@@ -2030,7 +2031,22 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                 }
             }
         case GGML_OP_MUL_MAT_ID:
-
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
+                    return true;
+                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
+#ifdef ASCEND_310P
+                    // Q4 && Q8 per group is not suppor on 310p device
+                    return false;
+#endif
+                    // only support contiguous for quantized types.
+                    return ggml_is_contiguous(op->src[0]) &&
+                           ggml_is_contiguous(op->src[1]);
+                default:
+                    return false;
+            }
         // embedding
         case GGML_OP_GET_ROWS: {
             switch (op->src[0]->type) {
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp

@@ -2691,6 +2691,109 @@ static void ggml_compute_forward_gelu(
     }
 }
 
+// ggml_compute_forward_gelu_erf
+
+static void ggml_compute_forward_gelu_erf_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_erf_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            GGML_UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_erf_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_erf_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            GGML_UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_erf(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_erf_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_gelu_erf_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_gelu_quick
 
 static void ggml_compute_forward_gelu_quick_f32(
@@ -7749,6 +7852,10 @@ void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_gelu(params, dst);
             } break;
+        case GGML_UNARY_OP_GELU_ERF:
+            {
+                ggml_compute_forward_gelu_erf(params, dst);
+            } break;
        case GGML_UNARY_OP_GELU_QUICK:
            {
                ggml_compute_forward_gelu_quick(params, dst);
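Both gelu_erf forward functions above split rows across threads with the usual ceil-divide pattern, dr = (nr + nth - 1)/nth, and clamp the last range with MIN. The standalone sketch below (illustrative values only, not from the package) prints the row range each thread would cover.

// Sketch of the row partitioning used by the gelu_erf forward functions:
// each of nth threads handles rows [ir0, ir1) of nr total rows.
#include <algorithm>
#include <cstdio>

int main() {
    const int nr  = 10;                  // total rows (illustrative)
    const int nth = 4;                   // number of threads (illustrative)
    const int dr  = (nr + nth - 1)/nth;  // rows per thread, rounded up
    for (int ith = 0; ith < nth; ith++) {
        const int ir0 = dr*ith;
        const int ir1 = std::min(ir0 + dr, nr);
        std::printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
    }
    return 0;
}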
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -428,6 +428,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
 static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;
 
 inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -440,6 +441,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
     }
 }
 
+inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_FP16_TO_FP32(x[i]);
+        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+        y[i] = GGML_FP32_TO_FP16(res);
+    }
+}
+
 #ifdef GGML_GELU_FP16
 inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
     uint16_t t;
@@ -463,6 +472,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 }
 #endif
 
+inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        float xi = x[i];
+        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+    }
+}
+
 inline static float ggml_gelu_quick_f32(float x) {
     return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
 }
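The new ggml_vec_gelu_erf_* helpers evaluate the exact GELU, 0.5*x*(1 + erf(x/sqrt(2))), using the added SQRT_2_INV constant, whereas the existing ggml_gelu_f32 in the same header uses a tanh approximation. The standalone sketch below (illustrative only, not part of the package) compares the two forms on a few inputs using the constants from the diff.

// Compare the exact erf-based GELU (added in this diff) with the tanh
// approximation already present in vec.h.
#include <cmath>
#include <cstdio>

static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
static const float SQRT_2_INV     = 0.70710678118654752440084436210484f;
static const float GELU_COEF_A    = 0.044715f;

static float gelu_erf(float x)  { return 0.5f*x*(1.0f + std::erf(x*SQRT_2_INV)); }
static float gelu_tanh(float x) { return 0.5f*x*(1.0f + std::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); }

int main() {
    const float xs[] = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
    for (float x : xs) {
        std::printf("x=%5.2f  erf=%9.6f  tanh=%9.6f  diff=%.2e\n",
                    x, gelu_erf(x), gelu_tanh(x), std::fabs(gelu_erf(x) - gelu_tanh(x)));
    }
    return 0;
}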
package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt

@@ -27,12 +27,15 @@ if (MUSAToolkit_FOUND)
 
     file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
     list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
+    list(APPEND GGML_HEADERS_MUSA "../ggml-musa/mudnn.cuh")
 
     file(GLOB   GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
     file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
     file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
+    file(GLOB   SRCS "../ggml-musa/*.cu")
+    list(APPEND GGML_SOURCES_MUSA ${SRCS})
 
     if (GGML_CUDA_FA_ALL_QUANTS)
         file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@@ -62,7 +65,9 @@ if (MUSAToolkit_FOUND)
     )
 
     # TODO: do not use CUDA definitions for MUSA
-
+    if (NOT GGML_BACKEND_DL)
+        target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+    endif()
 
     add_compile_definitions(GGML_USE_MUSA)
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
@@ -92,9 +97,10 @@ if (MUSAToolkit_FOUND)
     endif()
 
     if (GGML_STATIC)
+        # TODO: mudnn has not provided static libraries yet
         target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
     else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
     endif()
 
     if (GGML_CUDA_NO_VMM)