@fugood/llama.node 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +20 -6
  18. package/lib/index.js +41 -17
  19. package/lib/index.ts +50 -23
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +9 -9
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +37 -18
  24. package/src/LlamaContext.h +1 -0
  25. package/src/TokenizeWorker.cpp +16 -12
  26. package/src/TokenizeWorker.h +2 -2
  27. package/src/common.hpp +54 -50
  28. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  29. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  30. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  31. package/src/llama.cpp/common/arg.cpp +14 -13
  32. package/src/llama.cpp/common/common.cpp +4 -75
  33. package/src/llama.cpp/common/common.h +7 -12
  34. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  35. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  37. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  38. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  39. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  40. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  41. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  42. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  43. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  44. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  45. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  48. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  52. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  53. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  54. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  55. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  56. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  57. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  58. package/src/llama.cpp/include/llama.h +24 -124
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  61. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  62. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  63. package/src/llama.cpp/src/llama-context.cpp +60 -110
  64. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  65. package/src/llama.cpp/src/llama-graph.h +49 -7
  66. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  67. package/src/llama.cpp/src/llama-hparams.h +34 -5
  68. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  69. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  70. package/src/llama.cpp/src/llama-memory.h +3 -2
  71. package/src/llama.cpp/src/llama-model.cpp +273 -94
  72. package/src/llama.cpp/src/llama-model.h +4 -1
  73. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  74. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  75. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  76. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  77. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  78. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  79. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  82. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  83. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  84. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  85. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  86. package/src/llama.cpp/tools/run/run.cpp +2 -2
  87. package/src/llama.cpp/tools/server/server.cpp +158 -47
  88. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  89. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -65,6 +65,7 @@
  #include <aclnnop/aclnn_eq_tensor.h>
  #include <aclnnop/aclnn_gt_scalar.h>
  #include <aclnnop/aclnn_pow.h>
+ #include <aclnnop/aclnn_grouped_matmul_v2.h>
  #include <float.h>

  #include <cmath>
@@ -2587,3 +2588,276 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){

  ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
  }
+
+ /**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * floating-point precision using the CANN backend.
+ *
+ * This function executes a matrix multiplication operation tailored for
+ * Mixture of Experts (MoE) models, where the input tensor is multiplied
+ * with expert-specific weight matrices. It uses the CANN backend for
+ * efficient computation and stores the result in the destination tensor `dst`.
+ * The operation may leverage identity-based optimizations or routing masks
+ * as part of sparse expert selection.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the MoE multiplication result
+ * will be stored.
+ *
+ * @note This function assumes floating-point data types and is designed for
+ * MoE architectures, possibly involving sparse expert routing.
+ */
+ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+ //dst [M, K, N, 1]
+ ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
+ ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
+ ggml_tensor * ids = dst->src[2]; //ids [K, N]
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ // copy index from npu to cpu
+ int64_t n_as = ne02; // A
+ int64_t n_ids = ids->ne[0]; // K
+
+ std::vector<char> ids_host(ggml_nbytes(ids));
+ ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+ ACL_MEMCPY_DEVICE_TO_HOST);
+ ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+ char * src0_original = (char *) src0->data;
+ char * src1_original = (char *) src1->data;
+ char * dst_original = (char *) dst->data;
+ size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
+
+ // src0 is F16, src1 is F32, dst is F32
+ ggml_cann_pool_alloc src0_cast_allocator;
+ if (src0->type == GGML_TYPE_F16) {
+ src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
+ void* src0_cast_buf = src0_cast_allocator.get();
+
+ size_t cast_nb[GGML_MAX_DIMS];
+ cast_nb[0] = sizeof(float_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
+ }
+
+ aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
+ aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
+ ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
+ ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
+
+ src0_original = (char *) src0_cast_buf;
+ memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
+ }
+
+ std::vector<aclTensor*> src0_tensor_vec;
+ std::vector<aclTensor*> src1_tensor_vec;
+ std::vector<aclTensor*> dst_tensor_vec;
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+ for (int64_t id = 0; id < n_ids; id++) {
+ // src0_row [M, D] -> weight && permute
+ int64_t src0_ne[2] = {ne01, ne00};
+ size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
+ // src1_row [D, 1] -> input
+ int64_t src1_ne[2] = {ne10, 1};
+ size_t src1_nb[2] = {nb10, nb11};
+ // dst_row [M, 1] -> out
+ int64_t dst_ne[2] = {ne0, 1};
+ size_t dst_nb[2] = {nb0, nb1};
+
+ // expert index
+ int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+ GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+ // If B = 1 (broadcast), always use 0; otherwise, use id.
+ int64_t i11 = (ne11 == 1 ? 0 : id);
+ int64_t i12 = iid1;
+
+ int64_t i1 = id;
+ int64_t i2 = i12;
+
+ void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+ void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+ void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
+
+ aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
+ ACL_FLOAT, sizeof(float),
+ src0_ne, src0_nb, 2);
+ aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
+ ACL_FLOAT, sizeof(float),
+ src1_ne, src1_nb, 2);
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
+ ACL_FLOAT, sizeof(float),
+ dst_ne, dst_nb, 2);
+
+ src0_tensor_vec.push_back(acl_src0);
+ src1_tensor_vec.push_back(acl_src1);
+ dst_tensor_vec.push_back(acl_dst);
+ }
+ }
+
+ size_t GROUP_SIZE = 128;
+ // GroupedMatmulV2 required tensor_list.size < 128
+ for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
+ // split and call GroupedMatmulV2
+ size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
+ std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
+ std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
+ std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
+
+ aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
+ aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
+ aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
+
+ ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
+ }
+ return;
+ }
+
+ /**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * quantized precision using the CANN backend.
+ *
+ * This function executes a matrix multiplication operation tailored for
+ * Mixture of Experts (MoE) models, where the input tensor is multiplied
+ * with expert-specific quantized weight matrices. It leverages the CANN
+ * backend to perform efficient low-precision computations and stores the
+ * quantized result in the destination tensor `dst`.
+ *
+ * Quantization techniques reduce memory footprint and improve performance
+ * by using lower-bit representations (e.g., int8) instead of floating-point.
+ * This function is designed to work with such formats and may incorporate
+ * optimizations like identity-based fast paths or routing masks for sparse
+ * expert selection.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the quantized MoE multiplication result
+ * will be stored.
+ *
+ * @note This function assumes quantized data types and is designed for
+ * MoE architectures with potential sparse expert routing.
+ */
+ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+ // TODO: Use aclnnGroupedMatMul
+ //dst [M, K, N, 1]
+ ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
+ ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
+ ggml_tensor * ids = dst->src[2]; //ids [K, N]
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ // copy index from npu to cpu
+ int64_t n_as = ne02; // A
+ int64_t n_ids = ids->ne[0]; // K
+
+ std::vector<char> ids_host(ggml_nbytes(ids));
+ ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+ ACL_MEMCPY_DEVICE_TO_HOST);
+ ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+ char * src0_original = (char *) src0->data;
+ char * src1_original = (char *) src1->data;
+ char * dst_original = (char *) dst->data;
+
+ ggml_tensor src0_row = *src0;
+ ggml_tensor src1_row = *src1;
+ ggml_tensor dst_row = *dst;
+
+ const enum ggml_type type = dst->src[0]->type;
+ float weight_elem_size;
+ if (type == GGML_TYPE_Q4_0) {
+ weight_elem_size = float(sizeof(uint8_t)) / 2;
+ } else if (type == GGML_TYPE_Q8_0) {
+ weight_elem_size = float(sizeof(uint8_t));
+ } else {
+ GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
+ }
+
+ // src0_row [D, M, 1, 1] weight without permute
+ src0_row.ne[2] = 1;
+ src0_row.ne[3] = 1;
+ src0_row.nb[0] = weight_elem_size;
+ src0_row.nb[1] = weight_elem_size * ne00;
+ src0_row.nb[2] = weight_elem_size * ne00;
+ src0_row.nb[3] = weight_elem_size * ne00;
+ size_t weight_stride = ne00 * ne01 * weight_elem_size;
+ size_t weight_size = weight_stride * ne02 * ne03;
+
+ // scale [D, M, 1, 1] -> scale && permute
+ size_t scale_elem_size = sizeof(uint16_t);
+ size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+
+ // src1_row [D, 1, 1, 1] -> input
+ src1_row.ne[1] = 1;
+ src1_row.ne[2] = 1;
+ src1_row.ne[3] = 1;
+ src1_row.nb[2] = nb11;
+ src1_row.nb[3] = nb11;
+
+ // dst_row [M, 1, 1, 1] -> out
+ dst_row.ne[1] = 1;
+ dst_row.ne[2] = 1;
+ dst_row.ne[3] = 1;
+ dst_row.nb[2] = nb1;
+ dst_row.nb[3] = nb1;
+
+ //create weight for one row
+ ggml_cann_pool_alloc weight_allocator(ctx.pool());
+ void* weight_buffer = weight_allocator.alloc(nb02);
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+ for (int64_t id = 0; id < n_ids; id++) {
+ // expert index
+ int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+ GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+ // If B = 1 (broadcast), always use 0; otherwise, use id.
+ int64_t i11 = (ne11 == 1 ? 0 : id);
+ int64_t i12 = iid1;
+
+ int64_t i1 = id;
+ int64_t i2 = i12;
+
+ void* src0_tmp_ptr = src0_original + i02*weight_stride;
+ void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
+ void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+ void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
+
+ // mem cpy
+ ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
+ void* scale_buffer = (char*)weight_buffer + weight_stride;
+ ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
+
+ src0_row.data = weight_buffer;
+ src1_row.data = src1_tmp_ptr;
+ dst_row.data = dst_tmp_ptr;
+ dst_row.src[0] = &src0_row;
+ dst_row.src[1] = &src1_row;
+
+ ggml_cann_mul_mat(ctx, &dst_row);
+ }
+ }
+ return;
+ }
+
+ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+ const enum ggml_type type = dst->src[0]->type;
+ switch (type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ ggml_cann_mul_mat_id_fp(ctx, dst);
+ break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q8_0:
+ ggml_cann_mul_mat_id_quant(ctx, dst);
+ break;
+ default:
+ GGML_ABORT("Unsupported type for mul_mat_id");
+ break;
+ }
+ }
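In the quantized path above, each expert's data is addressed by byte offsets into a single buffer: the weight bytes of all experts come first (weight_size = weight_stride * ne02 * ne03), followed by the per-block scales, so expert i02 lives at i02 * weight_stride and weight_size + i02 * scale_stride. A minimal standalone sketch of that arithmetic for Q8_0, assuming QK8_0 == 32 and one uint16 scale per 32-weight block as in the code above; the struct and function names are illustrative, not part of ggml or CANN:

    #include <cstddef>
    #include <cstdint>

    // Illustrative offsets for the Q8_0 layout implied above: all expert weight
    // bytes are stored first, then the per-block uint16 scales; both regions are
    // indexed by the expert id i02. Assumes QK8_0 == 32. Not part of ggml/CANN.
    struct expert_offsets {
        size_t weight; // byte offset of expert i02's weight data
        size_t scale;  // byte offset of expert i02's scales
    };

    static expert_offsets q8_0_expert_offsets(int64_t ne00, int64_t ne01,
                                              int64_t n_expert, int64_t i02) {
        const size_t weight_elem_size = sizeof(uint8_t);   // one byte per Q8_0 weight
        const size_t scale_elem_size  = sizeof(uint16_t);  // one fp16 scale per 32-weight block
        const size_t weight_stride = (size_t)(ne00 * ne01) * weight_elem_size; // one expert's weights
        const size_t weight_size   = weight_stride * (size_t)n_expert;         // weights of all experts
        const size_t scale_stride  = (size_t)(ne00 * ne01 / 32) * scale_elem_size;
        return { (size_t)i02 * weight_stride, weight_size + (size_t)i02 * scale_stride };
    }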
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h
@@ -978,6 +978,33 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
  }
  }

+ /**
+ * @brief Performs sparse expert-based matrix multiplication using the CANN backend.
+ *
+ * @details This function implements a MoE-style batched matrix multiplication, where each input token
+ * is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
+ * in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
+ *
+ * For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
+ * performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
+ * and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
+ *
+ * Dimensions:
+ * - src0: [D, M, A, 1], where A is the number of experts
+ * - src1: [D, B, N, 1], where N is batch size and B is the slot count per sample
+ * - ids : [K, N], where K is the number of experts each token is routed to
+ * - dst : [M, K, N, 1], output tensor storing the result of expert × token multiplication
+ *
+ * The function handles two main modes:
+ * - If `ne12 == 1`, a simpler per-token loop is used.
+ * - TODO: If `ne12 > 1`, grouped multiplication and memory copying is used for efficiency.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the expert-weighted token outputs are stored.
+ * Expected to be of shape [M, K, N, 1].
+ */
+ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
  /**
  * @brief Applies a element-wise operation to two input tensors using the CANN
  * backend.
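As a plain-CPU illustration of the routing this comment describes, the sketch below computes dst[:, k, n] by multiplying the weight matrix of expert ids[k, n] with the corresponding input column. It assumes dense row-major float buffers and B = K (no broadcasting of src1); the helper name and layout are illustrative only, not the CANN implementation:

    #include <cstdint>
    #include <vector>

    // Illustrative CPU reference for the routing described above (not the CANN code).
    // Toy dense row-major layout; assumes B == K, i.e. one src1 column per routed slot.
    //   src0: D x M per expert, A experts   src1: D per (slot k, token n)
    //   ids : expert index per (k, n)       dst : M per (k, n)
    static void mul_mat_id_reference(const std::vector<float> & src0,
                                     const std::vector<float> & src1,
                                     const std::vector<int32_t> & ids,
                                     std::vector<float> & dst,
                                     int64_t D, int64_t M, int64_t A,
                                     int64_t K, int64_t N) {
        for (int64_t n = 0; n < N; ++n) {          // token index
            for (int64_t k = 0; k < K; ++k) {      // routed expert slot
                const int32_t e = ids[n*K + k];    // expert chosen for this slot
                if (e < 0 || e >= A) continue;     // mirrors the GGML_ASSERT in the kernels
                for (int64_t m = 0; m < M; ++m) {  // output row
                    float acc = 0.0f;
                    for (int64_t d = 0; d < D; ++d) {
                        acc += src0[(e*M + m)*D + d] * src1[(n*K + k)*D + d];
                    }
                    dst[(n*K + k)*M + m] = acc;
                }
            }
        }
    }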
package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1672,7 +1672,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
  ggml_cann_mul_mat(ctx, dst);
  break;
  case GGML_OP_MUL_MAT_ID:
- return false;
+ ggml_cann_mul_mat_id(ctx, dst);
+ break;
  case GGML_OP_SCALE:
  ggml_cann_scale(ctx, dst);
  break;
@@ -2030,7 +2031,22 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
  }
  }
  case GGML_OP_MUL_MAT_ID:
- return false;
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ return true;
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q4_0:
+ #ifdef ASCEND_310P
+ // Q4 && Q8 per group is not suppor on 310p device
+ return false;
+ #endif
+ // only support contiguous for quantized types.
+ return ggml_is_contiguous(op->src[0]) &&
+ ggml_is_contiguous(op->src[1]);
+ default:
+ return false;
+ }
  // embedding
  case GGML_OP_GET_ROWS: {
  switch (op->src[0]->type) {
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2202,6 +2202,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  } break;

  case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_ERF:
  case GGML_UNARY_OP_GELU_QUICK:
  case GGML_UNARY_OP_SILU:
  {
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -2691,6 +2691,109 @@ static void ggml_compute_forward_gelu(
  }
  }

+ // ggml_compute_forward_gelu_erf
+
+ static void ggml_compute_forward_gelu_erf_f32(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+
+ const ggml_tensor * src0 = dst->src[0];
+
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
+ assert(ggml_are_same_shape(src0, dst));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ ggml_vec_gelu_erf_f32(nc,
+ (float *) ((char *) dst->data + i1*( dst->nb[1])),
+ (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+ #ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ GGML_UNUSED(x);
+ assert(!isnan(x));
+ assert(!isinf(x));
+ }
+ #endif
+ }
+ }
+
+ static void ggml_compute_forward_gelu_erf_f16(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+
+ const ggml_tensor * src0 = dst->src[0];
+
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
+ assert(ggml_are_same_shape(src0, dst));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ ggml_vec_gelu_erf_f16(nc,
+ (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
+ (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
+
+ #ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ const float v = GGML_FP16_TO_FP32(x);
+ GGML_UNUSED(v);
+ assert(!isnan(v));
+ assert(!isinf(v));
+ }
+ #endif
+ }
+ }
+
+ static void ggml_compute_forward_gelu_erf(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+
+ const ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_gelu_erf_f32(params, dst);
+ } break;
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_gelu_erf_f16(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+ }
+
  // ggml_compute_forward_gelu_quick

  static void ggml_compute_forward_gelu_quick_f32(
@@ -7749,6 +7852,10 @@ void ggml_compute_forward_unary(
  {
  ggml_compute_forward_gelu(params, dst);
  } break;
+ case GGML_UNARY_OP_GELU_ERF:
+ {
+ ggml_compute_forward_gelu_erf(params, dst);
+ } break;
  case GGML_UNARY_OP_GELU_QUICK:
  {
  ggml_compute_forward_gelu_quick(params, dst);
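Both new kernels divide the rows of src0 across threads with the usual ggml-cpu ceil-divide pattern (dr = (nr + nth - 1)/nth, then the half-open range [ir0, ir1)). A small standalone illustration of that partitioning, with illustrative names only:

    #include <algorithm>
    #include <cstdio>

    // The [ir0, ir1) row range owned by thread `ith` of `nth`, mirroring the
    // dr/ir0/ir1 arithmetic in ggml_compute_forward_gelu_erf_f32 above.
    static void row_range(int nr, int ith, int nth, int * ir0, int * ir1) {
        const int dr = (nr + nth - 1)/nth; // rows per thread, rounded up
        *ir0 = dr*ith;
        *ir1 = std::min(*ir0 + dr, nr);
    }

    int main() {
        // 10 rows over 4 threads -> [0,3) [3,6) [6,9) [9,10)
        for (int ith = 0; ith < 4; ++ith) {
            int ir0, ir1;
            row_range(10, ith, 4, &ir0, &ir1);
            std::printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        return 0;
    }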
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
@@ -428,6 +428,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp
  static const float GELU_COEF_A = 0.044715f;
  static const float GELU_QUICK_COEF = -1.702f;
  static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ static const float SQRT_2_INV = 0.70710678118654752440084436210484f;

  inline static float ggml_gelu_f32(float x) {
  return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -440,6 +441,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
  }
  }

+ inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ float xi = GGML_FP16_TO_FP32(x[i]);
+ float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+ y[i] = GGML_FP32_TO_FP16(res);
+ }
+ }
+
  #ifdef GGML_GELU_FP16
  inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
  uint16_t t;
@@ -463,6 +472,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
  }
  #endif

+ inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
+ for (int i = 0; i < n; ++i) {
+ float xi = x[i];
+ y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
+ }
+ }
+
  inline static float ggml_gelu_quick_f32(float x) {
  return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
  }
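These new helpers compute the exact GELU, 0.5*x*(1 + erf(x/sqrt(2))), while the existing ggml_gelu_f32 above keeps the tanh-based approximation. A small standalone comparison of the two formulas as they appear in this file (the driver code is illustrative, not part of ggml):

    #include <cmath>
    #include <cstdio>
    #include <initializer_list>

    // Constants as defined in vec.h above.
    static const float GELU_COEF_A    = 0.044715f;
    static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    static const float SQRT_2_INV     = 0.70710678118654752440084436210484f;

    // Exact GELU, matching ggml_vec_gelu_erf_f32.
    static float gelu_erf(float x) {
        return 0.5f*x*(1.0f + std::erf(x*SQRT_2_INV));
    }

    // Tanh-based approximation, matching ggml_gelu_f32.
    static float gelu_tanh(float x) {
        return 0.5f*x*(1.0f + std::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
    }

    int main() {
        for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f}) {
            std::printf("x=%+.2f  erf GELU=%+.6f  tanh GELU=%+.6f\n", x, gelu_erf(x), gelu_tanh(x));
        }
        return 0;
    }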
package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt
@@ -27,12 +27,15 @@ if (MUSAToolkit_FOUND)

  file(GLOB GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
  list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
+ list(APPEND GGML_HEADERS_MUSA "../ggml-musa/mudnn.cuh")

  file(GLOB GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
  file(GLOB SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
  list(APPEND GGML_SOURCES_MUSA ${SRCS})
  file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
  list(APPEND GGML_SOURCES_MUSA ${SRCS})
+ file(GLOB SRCS "../ggml-musa/*.cu")
+ list(APPEND GGML_SOURCES_MUSA ${SRCS})

  if (GGML_CUDA_FA_ALL_QUANTS)
  file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@@ -62,7 +65,9 @@ if (MUSAToolkit_FOUND)
  )

  # TODO: do not use CUDA definitions for MUSA
- target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+ if (NOT GGML_BACKEND_DL)
+ target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+ endif()

  add_compile_definitions(GGML_USE_MUSA)
  add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
@@ -92,9 +97,10 @@ if (MUSAToolkit_FOUND)
  endif()

  if (GGML_STATIC)
+ # TODO: mudnn has not provided static libraries yet
  target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
  else()
- target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+ target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
  endif()

  if (GGML_CUDA_NO_VMM)