@fugood/llama.node 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +8 -8
  3. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  4. package/src/llama.cpp/common/arg.cpp +44 -999
  5. package/src/llama.cpp/common/arg.h +2 -2
  6. package/src/llama.cpp/common/chat.cpp +17 -2
  7. package/src/llama.cpp/common/common.cpp +33 -0
  8. package/src/llama.cpp/common/common.h +15 -1
  9. package/src/llama.cpp/common/download.cpp +1054 -0
  10. package/src/llama.cpp/common/download.h +55 -0
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  12. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -11
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +21 -21
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -75
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +0 -4
  23. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  24. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  25. package/src/llama.cpp/include/llama.h +7 -3
  26. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  27. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  28. package/src/llama.cpp/src/llama-arch.h +11 -0
  29. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  30. package/src/llama.cpp/src/llama-batch.h +12 -1
  31. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  32. package/src/llama.cpp/src/llama-chat.h +1 -0
  33. package/src/llama.cpp/src/llama-context.cpp +36 -13
  34. package/src/llama.cpp/src/llama-context.h +5 -5
  35. package/src/llama.cpp/src/llama-cparams.h +1 -0
  36. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  37. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  38. package/src/llama.cpp/src/llama-hparams.h +6 -0
  39. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  40. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -1
  41. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  42. package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
  43. package/src/llama.cpp/src/llama-model.cpp +320 -13171
  44. package/src/llama.cpp/src/llama-model.h +8 -0
  45. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  46. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  47. package/src/llama.cpp/src/llama-vocab.h +1 -0
  48. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  49. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  50. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  51. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  52. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  53. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  54. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  55. package/src/llama.cpp/src/models/bert.cpp +176 -0
  56. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  57. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  58. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  59. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  60. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  61. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  62. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  63. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  64. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  65. package/src/llama.cpp/src/models/deci.cpp +135 -0
  66. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  67. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  68. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  69. package/src/llama.cpp/src/models/dream.cpp +105 -0
  70. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  71. package/src/llama.cpp/src/models/ernie4-5.cpp +110 -0
  72. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  73. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  74. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  75. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  76. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  77. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  78. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  79. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  80. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  81. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  82. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  83. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  84. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  85. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  86. package/src/llama.cpp/src/models/granite.cpp +211 -0
  87. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  88. package/src/llama.cpp/src/models/grok.cpp +159 -0
  89. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  90. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  91. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  92. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  93. package/src/llama.cpp/src/models/jais.cpp +86 -0
  94. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  95. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  96. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  97. package/src/llama.cpp/src/models/llada.cpp +99 -0
  98. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  99. package/src/llama.cpp/src/models/llama.cpp +155 -0
  100. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  101. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  102. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  103. package/src/llama.cpp/src/models/models.h +481 -0
  104. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  105. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  106. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  107. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  108. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  109. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  110. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  111. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  112. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  113. package/src/llama.cpp/src/models/orion.cpp +123 -0
  114. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  115. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  116. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  117. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  118. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  119. package/src/llama.cpp/src/models/plm.cpp +168 -0
  120. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  121. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  122. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  123. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  124. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  125. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  126. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  127. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  128. package/src/llama.cpp/src/models/refact.cpp +94 -0
  129. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  130. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  131. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  132. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  133. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  134. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  135. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  136. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  137. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  138. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  139. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  140. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  141. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  142. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  143. package/src/llama.cpp/src/models/xverse.cpp +108 -0
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -580,16 +580,19 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
  uint8_t *patmp = atmp;
  int vsums;
- int tmp;
+ int tmp, t1, t2, t3, t4, t5, t6, t7;
  __asm__ __volatile__(
  "vsetivli zero, 16, e8, m1\n\t"
  "vmv.v.x v8, zero\n\t"
+ "lb zero, 15(%[sc])\n\t"
  "vle8.v v1, (%[sc])\n\t"
+ "vle8.v v2, (%[bsums])\n\t"
+ "addi %[tmp], %[bsums], 16\n\t"
  "vand.vi v0, v1, 0xF\n\t"
  "vsrl.vi v1, v1, 4\n\t"
+ "vle8.v v3, (%[tmp])\n\t"
  "vse8.v v0, (%[scale])\n\t"
  "vsetivli zero, 16, e16, m2\n\t"
- "vle16.v v2, (%[bsums])\n\t"
  "vzext.vf2 v0, v1\n\t"
  "vwmul.vv v4, v0, v2\n\t"
  "vsetivli zero, 16, e32, m4\n\t"
@@ -608,46 +611,89 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
  for (int j = 0; j < QK_K/128; ++j) {
  __asm__ __volatile__(
- "vsetvli zero, %[vl32], e8, m2\n\t"
+ "lb zero, 31(%[q2])\n\t"
+ "addi %[tmp], %[q2], 16\n\t"
+ "addi %[t1], %[q8], 16\n\t"
+ "vsetivli zero, 16, e8, m1\n\t"
  "vle8.v v0, (%[q2])\n\t"
+ "vle8.v v1, (%[tmp])\n\t"
  "vsrl.vi v2, v0, 2\n\t"
+ "vsrl.vi v3, v1, 2\n\t"
  "vsrl.vi v4, v0, 4\n\t"
+ "addi %[tmp], %[q8], 32\n\t"
+ "vle8.v v8, (%[q8])\n\t"
+ "vle8.v v9, (%[t1])\n\t"
+ "addi %[t1], %[t1], 32\n\t"
+ "vsrl.vi v5, v1, 4\n\t"
  "vsrl.vi v6, v0, 6\n\t"
+ "vsrl.vi v7, v1, 6\n\t"
+ "vle8.v v10, (%[tmp])\n\t"
+ "vle8.v v11, (%[t1])\n\t"
+ "addi %[tmp], %[tmp], 32\n\t"
+ "addi %[t1], %[t1], 32\n\t"
  "vand.vi v0, v0, 0x3\n\t"
+ "vand.vi v1, v1, 0x3\n\t"
  "vand.vi v2, v2, 0x3\n\t"
+ "vle8.v v12, (%[tmp])\n\t"
+ "vle8.v v13, (%[t1])\n\t"
+ "addi %[tmp], %[tmp], 32\n\t"
+ "addi %[t1], %[t1], 32\n\t"
+ "vand.vi v3, v3, 0x3\n\t"
  "vand.vi v4, v4, 0x3\n\t"
- "vsetvli zero, %[vl128], e8, m8\n\t"
- "vle8.v v8, (%[q8])\n\t"
- "vsetvli zero, %[vl64], e8, m4\n\t"
+ "vand.vi v5, v5, 0x3\n\t"
+ "vle8.v v14, (%[tmp])\n\t"
+ "vle8.v v15, (%[t1])\n\t"
  "vwmul.vv v16, v0, v8\n\t"
+ "vwmul.vv v18, v1, v9\n\t"
+ "vwmul.vv v20, v2, v10\n\t"
+ "vwmul.vv v22, v3, v11\n\t"
  "vwmul.vv v24, v4, v12\n\t"
- "vsetivli zero, 16, e16, m2\n\t"
+ "vwmul.vv v26, v5, v13\n\t"
+ "vwmul.vv v28, v6, v14\n\t"
+ "vwmul.vv v30, v7, v15\n\t"
+ "vsetivli zero, 8, e16, m1\n\t"
  "vmv.v.x v0, zero\n\t"
- "vwredsum.vs v10, v16, v0\n\t"
+ "lbu %[tmp], 0(%[scale])\n\t"
+ "vwredsum.vs v8, v16, v0\n\t"
  "vwredsum.vs v9, v18, v0\n\t"
- "vwredsum.vs v8, v20, v0\n\t"
- "vwredsum.vs v7, v22, v0\n\t"
- "vwredsum.vs v11, v24, v0\n\t"
- "vwredsum.vs v12, v26, v0\n\t"
- "vwredsum.vs v13, v28, v0\n\t"
- "vwredsum.vs v14, v30, v0\n\t"
+ "lbu %[t1], 1(%[scale])\n\t"
+ "vwredsum.vs v10, v20, v0\n\t"
+ "vwredsum.vs v11, v22, v0\n\t"
+ "lbu %[t2], 2(%[scale])\n\t"
+ "vwredsum.vs v12, v24, v0\n\t"
+ "vwredsum.vs v13, v26, v0\n\t"
+ "lbu %[t3], 3(%[scale])\n\t"
+ "vwredsum.vs v14, v28, v0\n\t"
+ "vwredsum.vs v15, v30, v0\n\t"
+ "lbu %[t4], 4(%[scale])\n\t"
+ "vwredsum.vs v8, v17, v8\n\t"
+ "vwredsum.vs v9, v19, v9\n\t"
+ "lbu %[t5], 5(%[scale])\n\t"
+ "vwredsum.vs v10, v21, v10\n\t"
+ "vwredsum.vs v11, v23, v11\n\t"
+ "lbu %[t6], 6(%[scale])\n\t"
+ "vwredsum.vs v12, v25, v12\n\t"
+ "vwredsum.vs v13, v27, v13\n\t"
+ "lbu %[t7], 7(%[scale])\n\t"
+ "vwredsum.vs v14, v29, v14\n\t"
+ "vwredsum.vs v15, v31, v15\n\t"
  "vsetivli zero, 4, e32, m1\n\t"
- "vslideup.vi v10, v9, 1\n\t"
- "vslideup.vi v8, v7, 1\n\t"
- "vslideup.vi v11, v12, 1\n\t"
- "vslideup.vi v13, v14, 1\n\t"
- "vslideup.vi v10, v8, 2\n\t"
- "vslideup.vi v11, v13, 2\n\t"
- "vsetivli zero, 8, e32, m2\n\t"
- "vle8.v v15, (%[scale])\n\t"
- "vzext.vf4 v12, v15\n\t"
- "vmul.vv v10, v10, v12\n\t"
- "vredsum.vs v0, v10, v0\n\t"
+ "vmul.vx v0, v8, %[tmp]\n\t"
+ "vmul.vx v1, v9, %[t1]\n\t"
+ "vmacc.vx v0, %[t2], v10\n\t"
+ "vmacc.vx v1, %[t3], v11\n\t"
+ "vmacc.vx v0, %[t4], v12\n\t"
+ "vmacc.vx v1, %[t5], v13\n\t"
+ "vmacc.vx v0, %[t6], v14\n\t"
+ "vmacc.vx v1, %[t7], v15\n\t"
  "vmv.x.s %[tmp], v0\n\t"
- "add %[isum], %[isum], %[tmp]"
- : [tmp] "=&r" (tmp), [isum] "+&r" (isum)
+ "vmv.x.s %[t1], v1\n\t"
+ "add %[isum], %[isum], %[tmp]\n\t"
+ "add %[isum], %[isum], %[t1]"
+ : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+ , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+ , [isum] "+&r" (isum)
  : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
- , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
  : "memory"
  , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
@@ -929,7 +975,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  const int8_t * restrict q8 = y[i].qs;
 
  int8_t * scale = (int8_t *)utmp;
- int tmp;
+ int tmp, t1, t2, t3, t4, t5, t6, t7;
  __asm__ __volatile__(
  "vsetivli zero, 12, e8, m1\n\t"
  "vle8.v v0, (%[s6b])\n\t"
@@ -967,19 +1013,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  int isum = 0;
  for (int j = 0; j < QK_K; j += 128) {
  __asm__ __volatile__(
+ "lb zero, 31(%[q3])\n\t"
  "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
  "vle8.v v8, (%[q3])\n\t"
  "vsrl.vi v10, v8, 2\n\t"
  "vsrl.vi v12, v8, 4\n\t"
  "vsrl.vi v14, v8, 6\n\t"
+ "lb zero, 64(%[q8])\n\t"
  "vand.vi v8, v8, 3\n\t"
  "vand.vi v10, v10, 3\n\t"
  "vand.vi v12, v12, 3\n\t"
  "vle8.v v2, (%[qh])\n\t"
+ "lb zero, 127(%[q8])\n\t"
  "vand.vx v4, v2, %[m]\n\t"
  "slli %[m], %[m], 1\n\t"
  "vmseq.vx v0, v4, zero\n\t"
  "vadd.vi v8, v8, -4, v0.t\n\t"
+ "lb zero, 0(%[q8])\n\t"
  "vand.vx v4, v2, %[m]\n\t"
  "slli %[m], %[m], 1\n\t"
  "vmseq.vx v0, v4, zero\n\t"
@@ -994,34 +1044,43 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  "vadd.vi v14, v14, -4, v0.t\n\t"
  "vsetvli zero, %[vl128], e8, m8\n\t"
  "vle8.v v0, (%[q8])\n\t"
+ "lb %[tmp], 0(%[scale])\n\t"
+ "lb %[t1], 1(%[scale])\n\t"
+ "lb %[t2], 2(%[scale])\n\t"
+ "lb %[t3], 3(%[scale])\n\t"
  "vsetvli zero, %[vl64], e8, m4\n\t"
  "vwmul.vv v16, v0, v8\n\t"
  "vwmul.vv v24, v4, v12\n\t"
  "vsetivli zero, 16, e16, m2\n\t"
  "vmv.v.x v0, zero\n\t"
- "vwredsum.vs v10, v16, v0\n\t"
+ "vwredsum.vs v8, v16, v0\n\t"
+ "lb %[t4], 4(%[scale])\n\t"
+ "lb %[t5], 5(%[scale])\n\t"
  "vwredsum.vs v9, v18, v0\n\t"
- "vwredsum.vs v8, v20, v0\n\t"
- "vwredsum.vs v7, v22, v0\n\t"
- "vwredsum.vs v11, v24, v0\n\t"
- "vwredsum.vs v12, v26, v0\n\t"
- "vwredsum.vs v13, v28, v0\n\t"
- "vwredsum.vs v14, v30, v0\n\t"
+ "vwredsum.vs v10, v20, v0\n\t"
+ "vwredsum.vs v11, v22, v0\n\t"
+ "vwredsum.vs v12, v24, v0\n\t"
+ "lb %[t6], 6(%[scale])\n\t"
+ "lb %[t7], 7(%[scale])\n\t"
+ "vwredsum.vs v13, v26, v0\n\t"
+ "vwredsum.vs v14, v28, v0\n\t"
+ "vwredsum.vs v15, v30, v0\n\t"
  "vsetivli zero, 4, e32, m1\n\t"
- "vslideup.vi v10, v9, 1\n\t"
- "vslideup.vi v8, v7, 1\n\t"
- "vslideup.vi v11, v12, 1\n\t"
- "vslideup.vi v13, v14, 1\n\t"
- "vslideup.vi v10, v8, 2\n\t"
- "vslideup.vi v11, v13, 2\n\t"
- "vsetivli zero, 8, e32, m2\n\t"
- "vle8.v v15, (%[scale])\n\t"
- "vsext.vf4 v12, v15\n\t"
- "vmul.vv v10, v10, v12\n\t"
- "vredsum.vs v0, v10, v0\n\t"
+ "vmul.vx v0, v8, %[tmp]\n\t"
+ "vmul.vx v1, v9, %[t1]\n\t"
+ "vmacc.vx v0, %[t2], v10\n\t"
+ "vmacc.vx v1, %[t3], v11\n\t"
+ "vmacc.vx v0, %[t4], v12\n\t"
+ "vmacc.vx v1, %[t5], v13\n\t"
+ "vmacc.vx v0, %[t6], v14\n\t"
+ "vmacc.vx v1, %[t7], v15\n\t"
  "vmv.x.s %[tmp], v0\n\t"
- "add %[isum], %[isum], %[tmp]"
- : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
+ "vmv.x.s %[t1], v1\n\t"
+ "add %[isum], %[isum], %[tmp]\n\t"
+ "add %[isum], %[isum], %[t1]"
+ : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+ , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+ , [m] "+&r" (m), [isum] "+&r" (isum)
  : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
  , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
  : "memory"
--- /dev/null
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp
@@ -0,0 +1,50 @@
+ #include "ggml-backend-impl.h"
+
+ #if defined(__s390x__)
+ #include <sys/auxv.h>
+
+ // find hwcap bits in asm/elf.h
+ #ifndef HWCAP_VXRS_EXT2
+ #define HWCAP_VXRS_EXT2 (1 << 15)
+ #endif
+
+ #ifndef HWCAP_NNPA
+ #define HWCAP_NNPA (1 << 20)
+ #endif
+
+ struct s390x_features {
+     bool has_vxe2 = false;
+     bool has_nnpa = false;
+
+     s390x_features() {
+         uint32_t hwcap = getauxval(AT_HWCAP);
+         // NOTE: use hwcap2 with DFLT for z17 and later
+         // uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+         has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
+         has_nnpa = !!(hwcap & HWCAP_NNPA);
+     }
+ };
+
+ static int ggml_backend_cpu_s390x_score() {
+     int score = 1;
+     s390x_features sf;
+
+     // IBM z15 / LinuxONE 3
+ #ifdef GGML_USE_VXE2
+     if (!sf.has_vxe2) { return 0; }
+     score += 1 << 1;
+ #endif
+
+     // IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
+ #ifdef GGML_USE_NNPA
+     if (!sf.has_nnpa) { return 0; }
+     score += 1 << 2;
+ #endif
+
+     return score;
+ }
+
+ GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
+
+ #endif // __s390x__
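
`GGML_BACKEND_DL_SCORE_IMPL` exports the score so that, when several CPU backend variants are built (here: with and without VXE2/NNPA support), the loader can pick the best one for the host at runtime; a score of 0 marks a variant as unusable. A minimal sketch of that selection, not ggml's actual loader (names are illustrative):

    #include <stddef.h>

    typedef int (*score_fn_t)(void);

    // Keep the highest-scoring usable variant; 0 means a required ISA
    // feature is missing on this machine.
    static size_t pick_best_variant(const score_fn_t scores[], size_t n) {
        size_t best = (size_t) -1;
        int best_score = 0;
        for (size_t i = 0; i < n; ++i) {
            int s = scores[i]();
            if (s > best_score) {
                best_score = s;
                best = i;
            }
        }
        return best; // (size_t)-1 if no variant can run here
    }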
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
@@ -500,13 +500,15 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
 
  #endif
 
- #if defined(__loongarch_asx)
+ #if defined(__loongarch_sx)
  /* float type data load instructions */
  static __m128 __lsx_vreplfr2vr_s(const float val) {
      v4f32 res = {val, val, val, val};
      return (__m128)res;
  }
+ #endif
 
+ #if defined(__loongarch_asx)
  static __m256 __lasx_xvreplfr2vr_s(const float val) {
      v8f32 res = {val, val, val, val, val, val, val, val};
      return (__m256)res;
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1613,13 +1613,8 @@ static void ggml_compute_forward_mul_mat_id(
      chunk_size = 64;
  }
 
- #if defined(__aarch64__)
- // disable for ARM
- const bool disable_chunking = true;
- #else
  // disable for NUMA
  const bool disable_chunking = ggml_is_numa();
- #endif // defined(__aarch64__)
 
  int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
  int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
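
With the aarch64 carve-out removed, ARM now takes the same chunked matrix-multiply path as other architectures; chunking stays disabled only on NUMA systems. The chunk counts kept by this hunk are plain ceiling divisions, for example:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        // 1000 result rows in chunks of 64 -> ceil(1000/64) = 16 chunks,
        // with the last chunk shorter than the rest.
        int64_t nr0 = 1000, chunk_size = 64;
        int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
        assert(nchunk0 == 16);
        return 0;
    }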
@@ -1812,22 +1807,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
      {
          ggml_compute_forward_cont(params, tensor);
      } break;
- case GGML_OP_RESHAPE:
-     {
-         ggml_compute_forward_reshape(params, tensor);
-     } break;
- case GGML_OP_VIEW:
-     {
-         ggml_compute_forward_view(params, tensor);
-     } break;
- case GGML_OP_PERMUTE:
-     {
-         ggml_compute_forward_permute(params, tensor);
-     } break;
- case GGML_OP_TRANSPOSE:
-     {
-         ggml_compute_forward_transpose(params, tensor);
-     } break;
  case GGML_OP_GET_ROWS:
      {
          ggml_compute_forward_get_rows(params, tensor);
@@ -2047,6 +2026,22 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
      {
          // nop
      } break;
+ case GGML_OP_RESHAPE:
+     {
+         // nop
+     } break;
+ case GGML_OP_PERMUTE:
+     {
+         // nop
+     } break;
+ case GGML_OP_VIEW:
+     {
+         // nop
+     } break;
+ case GGML_OP_TRANSPOSE:
+     {
+         // nop
+     } break;
  case GGML_OP_COUNT:
      {
          GGML_ABORT("fatal error");
@@ -2889,6 +2884,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
      struct ggml_tensor * node = cgraph->nodes[node_n];
 
+     if (ggml_op_is_empty(node->op)) {
+         // skip NOPs
+         continue;
+     }
+
      ggml_compute_forward(&params, node);
 
      if (state->ith == 0 && cplan->abort_callback &&
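
This pairs with the dispatcher changes above: RESHAPE, VIEW, PERMUTE, and TRANSPOSE only manipulate tensor metadata, so worker threads can now skip them outright instead of dispatching into an empty case. A plausible sketch of the predicate (the real `ggml_op_is_empty` lives in ggml's sources; this just mirrors the ops treated as nops in this diff):

    #include <stdbool.h>
    #include "ggml.h"  // for enum ggml_op

    // Ops that do no work at compute time (layout/metadata only).
    static bool op_is_empty_sketch(enum ggml_op op) {
        switch (op) {
            case GGML_OP_NONE:
            case GGML_OP_RESHAPE:
            case GGML_OP_VIEW:
            case GGML_OP_PERMUTE:
            case GGML_OP_TRANSPOSE:
                return true;
            default:
                return false;
        }
    }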