@fugood/llama.node 1.3.0 → 1.3.2
This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +8 -8
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +44 -999
- package/src/llama.cpp/common/arg.h +2 -2
- package/src/llama.cpp/common/chat.cpp +17 -2
- package/src/llama.cpp/common/common.cpp +33 -0
- package/src/llama.cpp/common/common.h +15 -1
- package/src/llama.cpp/common/download.cpp +1054 -0
- package/src/llama.cpp/common/download.h +55 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/include/ggml.h +2 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +21 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -75
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +0 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
- package/src/llama.cpp/include/llama.h +7 -3
- package/src/llama.cpp/src/CMakeLists.txt +95 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -0
- package/src/llama.cpp/src/llama-arch.h +11 -0
- package/src/llama.cpp/src/llama-batch.cpp +63 -31
- package/src/llama.cpp/src/llama-batch.h +12 -1
- package/src/llama.cpp/src/llama-chat.cpp +32 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +36 -13
- package/src/llama.cpp/src/llama-context.h +5 -5
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +3 -3
- package/src/llama.cpp/src/llama-hparams.cpp +11 -1
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +33 -1
- package/src/llama.cpp/src/llama-kv-cells.h +44 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
- package/src/llama.cpp/src/llama-model.cpp +320 -13171
- package/src/llama.cpp/src/llama-model.h +8 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/apertus.cpp +125 -0
- package/src/llama.cpp/src/models/arcee.cpp +135 -0
- package/src/llama.cpp/src/models/arctic.cpp +138 -0
- package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/src/llama.cpp/src/models/baichuan.cpp +122 -0
- package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/src/llama.cpp/src/models/bert.cpp +176 -0
- package/src/llama.cpp/src/models/bitnet.cpp +160 -0
- package/src/llama.cpp/src/models/bloom.cpp +101 -0
- package/src/llama.cpp/src/models/chameleon.cpp +178 -0
- package/src/llama.cpp/src/models/chatglm.cpp +132 -0
- package/src/llama.cpp/src/models/codeshell.cpp +111 -0
- package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/command-r.cpp +122 -0
- package/src/llama.cpp/src/models/dbrx.cpp +123 -0
- package/src/llama.cpp/src/models/deci.cpp +135 -0
- package/src/llama.cpp/src/models/deepseek.cpp +144 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
- package/src/llama.cpp/src/models/dots1.cpp +134 -0
- package/src/llama.cpp/src/models/dream.cpp +105 -0
- package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/src/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/src/llama.cpp/src/models/exaone.cpp +114 -0
- package/src/llama.cpp/src/models/exaone4.cpp +123 -0
- package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/src/llama.cpp/src/models/falcon.cpp +120 -0
- package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/src/llama.cpp/src/models/gemma.cpp +112 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/src/llama.cpp/src/models/glm4.cpp +127 -0
- package/src/llama.cpp/src/models/gpt2.cpp +105 -0
- package/src/llama.cpp/src/models/gptneox.cpp +144 -0
- package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/src/llama.cpp/src/models/granite.cpp +211 -0
- package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/src/llama.cpp/src/models/grok.cpp +159 -0
- package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/src/llama.cpp/src/models/internlm2.cpp +120 -0
- package/src/llama.cpp/src/models/jais.cpp +86 -0
- package/src/llama.cpp/src/models/jamba.cpp +106 -0
- package/src/llama.cpp/src/models/lfm2.cpp +173 -0
- package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/src/llama.cpp/src/models/llada.cpp +99 -0
- package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/src/llama.cpp/src/models/llama.cpp +155 -0
- package/src/llama.cpp/src/models/mamba.cpp +55 -0
- package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/src/llama.cpp/src/models/models.h +481 -0
- package/src/llama.cpp/src/models/mpt.cpp +126 -0
- package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/src/llama.cpp/src/models/nemotron.cpp +122 -0
- package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/src/llama.cpp/src/models/olmo.cpp +121 -0
- package/src/llama.cpp/src/models/olmo2.cpp +150 -0
- package/src/llama.cpp/src/models/olmoe.cpp +124 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/src/llama.cpp/src/models/openelm.cpp +124 -0
- package/src/llama.cpp/src/models/orion.cpp +123 -0
- package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/src/llama.cpp/src/models/phi2.cpp +121 -0
- package/src/llama.cpp/src/models/phi3.cpp +152 -0
- package/src/llama.cpp/src/models/plamo.cpp +110 -0
- package/src/llama.cpp/src/models/plamo2.cpp +316 -0
- package/src/llama.cpp/src/models/plm.cpp +168 -0
- package/src/llama.cpp/src/models/qwen.cpp +108 -0
- package/src/llama.cpp/src/models/qwen2.cpp +117 -0
- package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/src/llama.cpp/src/models/refact.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/src/llama.cpp/src/models/smollm3.cpp +128 -0
- package/src/llama.cpp/src/models/stablelm.cpp +146 -0
- package/src/llama.cpp/src/models/starcoder.cpp +100 -0
- package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/src/llama.cpp/src/models/xverse.cpp +108 -0
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -580,16 +580,19 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
         uint8_t *patmp = atmp;
         int vsums;
-        int tmp;
+        int tmp, t1, t2, t3, t4, t5, t6, t7;
         __asm__ __volatile__(
             "vsetivli zero, 16, e8, m1\n\t"
             "vmv.v.x v8, zero\n\t"
+            "lb zero, 15(%[sc])\n\t"
             "vle8.v v1, (%[sc])\n\t"
+            "vle8.v v2, (%[bsums])\n\t"
+            "addi %[tmp], %[bsums], 16\n\t"
             "vand.vi v0, v1, 0xF\n\t"
             "vsrl.vi v1, v1, 4\n\t"
+            "vle8.v v3, (%[tmp])\n\t"
             "vse8.v v0, (%[scale])\n\t"
             "vsetivli zero, 16, e16, m2\n\t"
-            "vle16.v v2, (%[bsums])\n\t"
             "vzext.vf2 v0, v1\n\t"
             "vwmul.vv v4, v0, v2\n\t"
             "vsetivli zero, 16, e32, m4\n\t"
@@ -608,46 +611,89 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

         for (int j = 0; j < QK_K/128; ++j) {
             __asm__ __volatile__(
-                "
+                "lb zero, 31(%[q2])\n\t"
+                "addi %[tmp], %[q2], 16\n\t"
+                "addi %[t1], %[q8], 16\n\t"
+                "vsetivli zero, 16, e8, m1\n\t"
                 "vle8.v v0, (%[q2])\n\t"
+                "vle8.v v1, (%[tmp])\n\t"
                 "vsrl.vi v2, v0, 2\n\t"
+                "vsrl.vi v3, v1, 2\n\t"
                 "vsrl.vi v4, v0, 4\n\t"
+                "addi %[tmp], %[q8], 32\n\t"
+                "vle8.v v8, (%[q8])\n\t"
+                "vle8.v v9, (%[t1])\n\t"
+                "addi %[t1], %[t1], 32\n\t"
+                "vsrl.vi v5, v1, 4\n\t"
                 "vsrl.vi v6, v0, 6\n\t"
+                "vsrl.vi v7, v1, 6\n\t"
+                "vle8.v v10, (%[tmp])\n\t"
+                "vle8.v v11, (%[t1])\n\t"
+                "addi %[tmp], %[tmp], 32\n\t"
+                "addi %[t1], %[t1], 32\n\t"
                 "vand.vi v0, v0, 0x3\n\t"
+                "vand.vi v1, v1, 0x3\n\t"
                 "vand.vi v2, v2, 0x3\n\t"
+                "vle8.v v12, (%[tmp])\n\t"
+                "vle8.v v13, (%[t1])\n\t"
+                "addi %[tmp], %[tmp], 32\n\t"
+                "addi %[t1], %[t1], 32\n\t"
+                "vand.vi v3, v3, 0x3\n\t"
                 "vand.vi v4, v4, 0x3\n\t"
-                "
-                "vle8.v
-                "
+                "vand.vi v5, v5, 0x3\n\t"
+                "vle8.v v14, (%[tmp])\n\t"
+                "vle8.v v15, (%[t1])\n\t"
                 "vwmul.vv v16, v0, v8\n\t"
+                "vwmul.vv v18, v1, v9\n\t"
+                "vwmul.vv v20, v2, v10\n\t"
+                "vwmul.vv v22, v3, v11\n\t"
                 "vwmul.vv v24, v4, v12\n\t"
-                "
+                "vwmul.vv v26, v5, v13\n\t"
+                "vwmul.vv v28, v6, v14\n\t"
+                "vwmul.vv v30, v7, v15\n\t"
+                "vsetivli zero, 8, e16, m1\n\t"
                 "vmv.v.x v0, zero\n\t"
-                "
+                "lbu %[tmp], 0(%[scale])\n\t"
+                "vwredsum.vs v8, v16, v0\n\t"
                 "vwredsum.vs v9, v18, v0\n\t"
-                "
-                "vwredsum.vs
-                "vwredsum.vs v11,
-                "
-                "vwredsum.vs
-                "vwredsum.vs
+                "lbu %[t1], 1(%[scale])\n\t"
+                "vwredsum.vs v10, v20, v0\n\t"
+                "vwredsum.vs v11, v22, v0\n\t"
+                "lbu %[t2], 2(%[scale])\n\t"
+                "vwredsum.vs v12, v24, v0\n\t"
+                "vwredsum.vs v13, v26, v0\n\t"
+                "lbu %[t3], 3(%[scale])\n\t"
+                "vwredsum.vs v14, v28, v0\n\t"
+                "vwredsum.vs v15, v30, v0\n\t"
+                "lbu %[t4], 4(%[scale])\n\t"
+                "vwredsum.vs v8, v17, v8\n\t"
+                "vwredsum.vs v9, v19, v9\n\t"
+                "lbu %[t5], 5(%[scale])\n\t"
+                "vwredsum.vs v10, v21, v10\n\t"
+                "vwredsum.vs v11, v23, v11\n\t"
+                "lbu %[t6], 6(%[scale])\n\t"
+                "vwredsum.vs v12, v25, v12\n\t"
+                "vwredsum.vs v13, v27, v13\n\t"
+                "lbu %[t7], 7(%[scale])\n\t"
+                "vwredsum.vs v14, v29, v14\n\t"
+                "vwredsum.vs v15, v31, v15\n\t"
                 "vsetivli zero, 4, e32, m1\n\t"
-                "
-                "
-                "
-                "
-                "
-                "
-                "
-                "
-                "vzext.vf4 v12, v15\n\t"
-                "vmul.vv v10, v10, v12\n\t"
-                "vredsum.vs v0, v10, v0\n\t"
+                "vmul.vx v0, v8, %[tmp]\n\t"
+                "vmul.vx v1, v9, %[t1]\n\t"
+                "vmacc.vx v0, %[t2], v10\n\t"
+                "vmacc.vx v1, %[t3], v11\n\t"
+                "vmacc.vx v0, %[t4], v12\n\t"
+                "vmacc.vx v1, %[t5], v13\n\t"
+                "vmacc.vx v0, %[t6], v14\n\t"
+                "vmacc.vx v1, %[t7], v15\n\t"
                 "vmv.x.s %[tmp], v0\n\t"
-                "
-
+                "vmv.x.s %[t1], v1\n\t"
+                "add %[isum], %[isum], %[tmp]\n\t"
+                "add %[isum], %[isum], %[t1]"
+                : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                , [isum] "+&r" (isum)
                 : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
-                , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
                 : "memory"
                 , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                 , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
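Note on the q2_K hunk above: the rewrite unrolls the chunk loop to cover all 128 quants at once, interleaves scalar scale loads (lbu) between the vector reductions to hide latency, and applies the eight per-group scales with vmul.vx/vmacc.vx in place of the old widen-and-multiply tail. For orientation, a scalar sketch of what one chunk contributes to isum, assuming the standard q2_K layout (QK_K == 256, 16 quants per scale group); the helper name is illustrative, not from the package:

    #include <stdint.h>

    // Scalar equivalent of one 128-quant chunk of the RVV block (sketch).
    // Group g pulls 2-bit weights from bit plane 2*(g/2) of the first or
    // second 16 bytes of q2, mirroring vwmul pairs v16/v18 ... v28/v30.
    static int q2k_chunk_isum(const uint8_t *q2, const int8_t *q8,
                              const uint8_t *scale) {
        int isum = 0;
        for (int g = 0; g < 8; ++g) {
            const uint8_t *src = q2 + (g % 2) * 16;
            int shift = 2 * (g / 2);
            int sum = 0;
            for (int k = 0; k < 16; ++k) {
                sum += ((src[k] >> shift) & 0x3) * q8[g * 16 + k];
            }
            isum += scale[g] * sum;   // the vmul.vx / vmacc.vx step
        }
        return isum;
    }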
@@ -929,7 +975,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const int8_t * restrict q8 = y[i].qs;

         int8_t * scale = (int8_t *)utmp;
-        int tmp;
+        int tmp, t1, t2, t3, t4, t5, t6, t7;
         __asm__ __volatile__(
             "vsetivli zero, 12, e8, m1\n\t"
             "vle8.v v0, (%[s6b])\n\t"
@@ -967,19 +1013,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         int isum = 0;
         for (int j = 0; j < QK_K; j += 128) {
             __asm__ __volatile__(
+                "lb zero, 31(%[q3])\n\t"
                 "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
                 "vle8.v v8, (%[q3])\n\t"
                 "vsrl.vi v10, v8, 2\n\t"
                 "vsrl.vi v12, v8, 4\n\t"
                 "vsrl.vi v14, v8, 6\n\t"
+                "lb zero, 64(%[q8])\n\t"
                 "vand.vi v8, v8, 3\n\t"
                 "vand.vi v10, v10, 3\n\t"
                 "vand.vi v12, v12, 3\n\t"
                 "vle8.v v2, (%[qh])\n\t"
+                "lb zero, 127(%[q8])\n\t"
                 "vand.vx v4, v2, %[m]\n\t"
                 "slli %[m], %[m], 1\n\t"
                 "vmseq.vx v0, v4, zero\n\t"
                 "vadd.vi v8, v8, -4, v0.t\n\t"
+                "lb zero, 0(%[q8])\n\t"
                 "vand.vx v4, v2, %[m]\n\t"
                 "slli %[m], %[m], 1\n\t"
                 "vmseq.vx v0, v4, zero\n\t"
@@ -994,34 +1044,43 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                 "vadd.vi v14, v14, -4, v0.t\n\t"
                 "vsetvli zero, %[vl128], e8, m8\n\t"
                 "vle8.v v0, (%[q8])\n\t"
+                "lb %[tmp], 0(%[scale])\n\t"
+                "lb %[t1], 1(%[scale])\n\t"
+                "lb %[t2], 2(%[scale])\n\t"
+                "lb %[t3], 3(%[scale])\n\t"
                 "vsetvli zero, %[vl64], e8, m4\n\t"
                 "vwmul.vv v16, v0, v8\n\t"
                 "vwmul.vv v24, v4, v12\n\t"
                 "vsetivli zero, 16, e16, m2\n\t"
                 "vmv.v.x v0, zero\n\t"
-                "vwredsum.vs
+                "vwredsum.vs v8, v16, v0\n\t"
+                "lb %[t4], 4(%[scale])\n\t"
+                "lb %[t5], 5(%[scale])\n\t"
                 "vwredsum.vs v9, v18, v0\n\t"
-                "vwredsum.vs
-                "vwredsum.vs
-                "vwredsum.vs
-                "
-                "
-                "vwredsum.vs
+                "vwredsum.vs v10, v20, v0\n\t"
+                "vwredsum.vs v11, v22, v0\n\t"
+                "vwredsum.vs v12, v24, v0\n\t"
+                "lb %[t6], 6(%[scale])\n\t"
+                "lb %[t7], 7(%[scale])\n\t"
+                "vwredsum.vs v13, v26, v0\n\t"
+                "vwredsum.vs v14, v28, v0\n\t"
+                "vwredsum.vs v15, v30, v0\n\t"
                 "vsetivli zero, 4, e32, m1\n\t"
-                "
-                "
-                "
-                "
-                "
-                "
-                "
-                "
-                "vsext.vf4 v12, v15\n\t"
-                "vmul.vv v10, v10, v12\n\t"
-                "vredsum.vs v0, v10, v0\n\t"
+                "vmul.vx v0, v8, %[tmp]\n\t"
+                "vmul.vx v1, v9, %[t1]\n\t"
+                "vmacc.vx v0, %[t2], v10\n\t"
+                "vmacc.vx v1, %[t3], v11\n\t"
+                "vmacc.vx v0, %[t4], v12\n\t"
+                "vmacc.vx v1, %[t5], v13\n\t"
+                "vmacc.vx v0, %[t6], v14\n\t"
+                "vmacc.vx v1, %[t7], v15\n\t"
                 "vmv.x.s %[tmp], v0\n\t"
-                "
-
+                "vmv.x.s %[t1], v1\n\t"
+                "add %[isum], %[isum], %[tmp]\n\t"
+                "add %[isum], %[isum], %[t1]"
+                : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                , [m] "+&r" (m), [isum] "+&r" (isum)
                 : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
                 , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
                 : "memory"
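Note: both rewritten kernels also issue loads into the hard-wired zero register (lb zero, 31(%[q2]); lb zero, 127(%[q8]); and so on). The loaded byte is discarded, so these appear to act purely as cache-line touches ahead of the wide vector loads. A rough C analogue using a compiler builtin (illustrative only; the kernels do this directly in asm, where the touch is a real load rather than a hint):

    // Touch p so its cache line is resident before the vector loads that
    // follow; GCC/Clang builtin with (address, rw = read, high locality).
    static inline void cache_touch(const void *p) {
    #if defined(__GNUC__)
        __builtin_prefetch(p, 0, 3);
    #else
        (void)p;
    #endif
    }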
--- /dev/null
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp
@@ -0,0 +1,50 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__s390x__)
+#include <sys/auxv.h>
+
+// find hwcap bits in asm/elf.h
+#ifndef HWCAP_VXRS_EXT2
+#define HWCAP_VXRS_EXT2 (1 << 15)
+#endif
+
+#ifndef HWCAP_NNPA
+#define HWCAP_NNPA (1 << 20)
+#endif
+
+struct s390x_features {
+    bool has_vxe2 = false;
+    bool has_nnpa = false;
+
+    s390x_features() {
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        // NOTE: use hwcap2 with DFLT for z17 and later
+        // uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
+        has_nnpa = !!(hwcap & HWCAP_NNPA);
+    }
+};
+
+static int ggml_backend_cpu_s390x_score() {
+    int score = 1;
+    s390x_features sf;
+
+    // IBM z15 / LinuxONE 3
+#ifdef GGML_USE_VXE2
+    if (!sf.has_vxe2) { return 0; }
+    score += 1 << 1;
+#endif
+
+    // IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
+#ifdef GGML_USE_NNPA
+    if (!sf.has_nnpa) { return 0; }
+    score += 1 << 2;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
+
+#endif // __s390x__
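The new cpu-feats.cpp above plugs into ggml's dynamic CPU-backend selection: each architecture-specific build exports a score via GGML_BACKEND_DL_SCORE_IMPL, and the highest-scoring variant the host actually supports wins. A minimal standalone probe of the same two hwcap bits, assuming glibc's getauxval() as used in the file (this test program is illustrative, not part of the package):

    #include <stdio.h>
    #include <stdint.h>
    #if defined(__s390x__)
    #include <sys/auxv.h>
    #endif

    int main(void) {
    #if defined(__s390x__)
        // Same bits the scoring function tests: VXRS_EXT2 (z15+), NNPA (z16+).
        uint32_t hwcap = (uint32_t) getauxval(AT_HWCAP);
        printf("VXRS_EXT2: %d\n", !!(hwcap & (1u << 15)));
        printf("NNPA:      %d\n", !!(hwcap & (1u << 20)));
    #else
        puts("not an s390x machine");
    #endif
        return 0;
    }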
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
@@ -500,13 +500,15 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {

 #endif

-#if defined(
+#if defined(__loongarch_sx)
 /* float type data load instructions */
 static __m128 __lsx_vreplfr2vr_s(const float val) {
     v4f32 res = {val, val, val, val};
     return (__m128)res;
 }
+#endif

+#if defined(__loongarch_asx)
 static __m256 __lasx_xvreplfr2vr_s(const float val) {
     v8f32 res = {val, val, val, val, val, val, val, val};
     return (__m256)res;
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1613,13 +1613,8 @@ static void ggml_compute_forward_mul_mat_id(
         chunk_size = 64;
     }

-#if defined(__aarch64__)
-    // disable for ARM
-    const bool disable_chunking = true;
-#else
     // disable for NUMA
     const bool disable_chunking = ggml_is_numa();
-#endif // defined(__aarch64__)

     int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
     int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
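With the __aarch64__ special case removed, mul_mat_id chunking follows the same ceiling-division math on every architecture; for example, 4096 rows at chunk_size 64 yield (4096 + 63) / 64 = 64 chunks. Isolated as a helper (illustrative only):

    #include <stdint.h>

    // Ceiling division as used for nchunk0/nchunk1 above.
    static inline int64_t ceil_div_i64(int64_t n, int64_t d) {
        return (n + d - 1) / d;
    }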
@@ -1812,22 +1807,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_cont(params, tensor);
             } break;
-        case GGML_OP_RESHAPE:
-            {
-                ggml_compute_forward_reshape(params, tensor);
-            } break;
-        case GGML_OP_VIEW:
-            {
-                ggml_compute_forward_view(params, tensor);
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                ggml_compute_forward_permute(params, tensor);
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                ggml_compute_forward_transpose(params, tensor);
-            } break;
         case GGML_OP_GET_ROWS:
             {
                 ggml_compute_forward_get_rows(params, tensor);
@@ -2047,6 +2026,22 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 // nop
             } break;
+        case GGML_OP_RESHAPE:
+            {
+                // nop
+            } break;
+        case GGML_OP_PERMUTE:
+            {
+                // nop
+            } break;
+        case GGML_OP_VIEW:
+            {
+                // nop
+            } break;
+        case GGML_OP_TRANSPOSE:
+            {
+                // nop
+            } break;
         case GGML_OP_COUNT:
             {
                 GGML_ABORT("fatal error");
@@ -2889,6 +2884,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];

+        if (ggml_op_is_empty(node->op)) {
+            // skip NOPs
+            continue;
+        }
+
         ggml_compute_forward(&params, node);

         if (state->ith == 0 && cplan->abort_callback &&
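This pairs with the dispatch change above: RESHAPE, VIEW, PERMUTE, and TRANSPOSE only rewrite tensor metadata, so worker threads can now skip such nodes without entering the dispatch switch at all. ggml_op_is_empty() presumably comes from the two lines added to ggml/include/ggml.h; a plausible reconstruction of the predicate, inferred from the no-op cases rather than copied from the source:

    #include <stdbool.h>
    #include "ggml.h"   // for enum ggml_op and the GGML_OP_* constants

    // Hypothetical sketch of ggml_op_is_empty(); mirrors the ops the
    // dispatcher treats as nops (they change metadata, not data).
    static bool op_is_empty_sketch(enum ggml_op op) {
        switch (op) {
            case GGML_OP_NONE:
            case GGML_OP_RESHAPE:
            case GGML_OP_VIEW:
            case GGML_OP_PERMUTE:
            case GGML_OP_TRANSPOSE:
                return true;
            default:
                return false;
        }
    }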
|