@fugood/llama.node 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +80 -10
- package/src/llama.cpp/common/chat.cpp +52 -8
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +16 -6
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +9 -4
- package/src/llama.cpp/src/llama-arch.cpp +105 -0
- package/src/llama.cpp/src/llama-arch.h +12 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +33 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +19 -10
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +175 -148
- package/src/llama.cpp/src/llama-graph.h +60 -23
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +949 -75
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +40 -4
- package/src/llama.cpp/src/llama-vocab.cpp +49 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
|
@@ -412,6 +412,82 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
412
412
|
}
|
|
413
413
|
}
|
|
414
414
|
|
|
415
|
+
// Reference (scalar) GEMV kernel: multiply a Q8_K-quantized activation row by
// Q2_K weights that have been repacked 8 columns at a time (block_q2_Kx8),
// writing one float result per weight column into s.
//   n  : row length in elements (must be a multiple of QK_K)
//   s  : output vector
//   bs : output stride (not used in this vector path)
//   vx : repacked weights (block_q2_Kx8 *)
//   vy : quantized activations (block_q8_K *)
//   nc : number of weight columns (must be a multiple of 8)
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;              // elements per super-block
    const int nb = n / qk;            // super-blocks per row
    const int ncols_interleaved = 8;  // weight columns packed per block_q2_Kx8
    const int blocklen = 8;           // quant bytes consumed per inner step

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    // Silence unused warnings on configurations where this generic
    // fallback is compiled but not referenced.
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];      // per-column accumulated (scale * dot) terms
    float sum_minf[8];  // per-column accumulated min-correction terms
    int sumi1,sumi2,sumi3,sumi4;
    int sumi;

    const block_q8_K * a_ptr = (const block_q8_K *)vy;
    for(int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
            sum_minf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (4 * blocklen)); k++) {
                // Each 64-byte group of 'scales' carries packed 4-bit scales
                // (low nibble) and mins (high nibble); the four 16-byte
                // slices correspond to the four 2-bit planes unpacked below.
                const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
                const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi1 = 0;
                    sumi2 = 0;
                    sumi3 = 0;
                    sumi4 = 0;
                    sumi = 0;
                    // Selects the scale/min byte for column j within the slice.
                    int offset = ((k / 2) % 2) + j * 2;
                    for (int i = 0; i < blocklen; ++i){
                        // Unpack the four 2-bit quants stored in one weight byte.
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                        const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
                        const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
                        const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
                        // The four planes of one byte map to activation
                        // positions 0, +32, +64 and +96 within the 128-element
                        // quarter of the Q8_K block.
                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
                        sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
                        sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);

                        // Apply each sub-block's 4-bit scale (low nibble)
                        // before accumulating.
                        sumi1 = sumi1 * (scales_0[offset] & 0xF);
                        sumi2 = sumi2 * (scales_1[offset] & 0xF);
                        sumi3 = sumi3 * (scales_2[offset] & 0xF);
                        sumi4 = sumi4 * (scales_3[offset] & 0xF);
                        sumi += sumi1 + sumi2 + sumi3 + sumi4;
                    }
                    // Scale the integer dot product by the column's fp16
                    // super-block delta and the activation delta.
                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                }
            }
            // Q2_K quants are unsigned: subtract the per-sub-block mins
            // (high nibbles) weighted by the activation block sums (bsums).
            for(int sb = 0; sb < 8; sb++) {
                const uint8_t *mins = b_ptr[l].scales + sb * 16;
                for(int j = 0; j < ncols_interleaved; j++){
                    sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
        }
    }
}
|
|
490
|
+
|
|
415
491
|
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
416
492
|
const int qk = QK8_0;
|
|
417
493
|
const int nb = n / qk;
|
|
@@ -711,6 +787,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
711
787
|
}
|
|
712
788
|
}
|
|
713
789
|
|
|
790
|
+
// Reference (scalar) GEMM kernel: multiply Q8_K activations, interleaved four
// rows at a time (block_q8_Kx4), by Q2_K weights repacked 8 columns at a time
// (block_q2_Kx8). Produces a 4x8 output tile per (row-group, column-group).
//   n  : row length in elements (must be a multiple of QK_K)
//   s  : output matrix, row stride bs
//   vx : repacked weights (block_q2_Kx8 *)
//   vy : interleaved activations (block_q8_Kx4 *)
//   nr : number of activation rows (must be a multiple of 4)
//   nc : number of weight columns (must be a multiple of 8)
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;              // elements per super-block
    const int nb = n / qk;            // super-blocks per row
    const int ncols_interleaved = 8;  // weight columns per block_q2_Kx8
    const int blocklen = 8;           // quant bytes consumed per inner step

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    // Silence unused warnings on configurations where this generic
    // fallback is compiled but not referenced.
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];      // 4x8 tile of accumulated (scale * dot) terms
    float sum_minf[4][8];  // 4x8 tile of accumulated min-correction terms
    int sumi1, sumi2, sumi3, sumi4;
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (4 * blocklen)); k++) {

                    // Each 64-byte group of 'scales' carries packed 4-bit
                    // scales (low nibble) and mins (high nibble); the four
                    // 16-byte slices match the four 2-bit planes below.
                    const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
                    const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                    const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                    const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi3 = 0;
                            sumi4 = 0;
                            sumi = 0;
                            // Scale/min byte for column j within the slice.
                            int offset = ((k / 2) % 2) + j * 2;
                            for (int i = 0; i < blocklen; ++i){
                                // Unpack the four 2-bit quants in one weight byte.
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                                const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
                                const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
                                const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
                                // Activations are interleaved 4 rows wide, so
                                // the four planes sit 128 int8 apart within a
                                // 512-element quarter of the Q8_Kx4 block.
                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
                                sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
                                // Apply each sub-block's 4-bit scale (low
                                // nibble) before accumulating.
                                sumi1 = sumi1 * (scales_0[offset] & 0xF);
                                sumi2 = sumi2 * (scales_1[offset] & 0xF);
                                sumi3 = sumi3 * (scales_2[offset] & 0xF);
                                sumi4 = sumi4 * (scales_3[offset] & 0xF);
                                sumi += sumi1 + sumi2 + sumi3 + sumi4;
                            }
                            // Scale by the column's fp16 super-block delta and
                            // row m's activation delta.
                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                // Q2_K quants are unsigned: subtract per-sub-block mins (high
                // nibbles) weighted by the activation block sums.
                for(int sb = 0; sb < 8; sb++) {
                    const uint8_t *mins = b_ptr[l].scales + sb * 16;
                    for(int m = 0; m < 4; m++) {
                        // Index arithmetic selects the pair of block sums for
                        // row m / sub-block sb in the interleaved Q8_Kx4
                        // bsums layout.
                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for(int j = 0; j < ncols_interleaved; j++) {
                            int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
                            sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }

            // Write out the finished 4x8 tile.
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
}
|
|
879
|
+
|
|
880
|
+
|
|
714
881
|
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
715
882
|
const int qk = QK8_0;
|
|
716
883
|
const int nb = n / qk;
|
|
@@ -914,6 +1081,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
|
|
|
914
1081
|
return out;
|
|
915
1082
|
}
|
|
916
1083
|
|
|
1084
|
+
// Pack eight Q2_K super-blocks (one per source row) into a single interleaved
// block_q2_Kx8, the layout consumed by the q2_K 8x8 GEMV/GEMM kernels.
//   in                   : array of 8 source blocks
//   blck_size_interleave : interleave granularity in bytes (8 for this layout)
static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
    block_q2_Kx8 out;

    // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
    }

    for (int i = 0; i < 8; i++) {
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    // Total number of 8-byte chunks across the eight source qs arrays.
    const int end = QK_K * 2 / blck_size_interleave;

    // Interleave Q2_K quants by taking 8 bytes at a time
    for (int i = 0; i < end; ++i) {
        int src_id = i % 8;                              // which source block
        int src_offset = (i / 8) * blck_size_interleave; // position within it
        int dst_offset = i * blck_size_interleave;

        // memcpy avoids alignment assumptions on the 8-byte chunk.
        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
    // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
    // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
    // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures

    for(int i = 0; i < 128; i++){

        // Index for selecting which q2k super block
        int src1 = (i % 16) / 2;
        // Index for selecting scale
        int src2 = ((i / 16) * 2) + (i % 2);

        out.scales[i] = in[src1].scales[src2];
    }
    return out;

}
|
|
1127
|
+
|
|
917
1128
|
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
918
1129
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
|
919
1130
|
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
@@ -975,6 +1186,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
975
1186
|
GGML_UNUSED(data_size);
|
|
976
1187
|
}
|
|
977
1188
|
|
|
1189
|
+
static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
1190
|
+
GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
|
|
1191
|
+
GGML_ASSERT(interleave_block == 8);
|
|
1192
|
+
constexpr int nrows_interleaved = 8;
|
|
1193
|
+
|
|
1194
|
+
block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
|
|
1195
|
+
const block_q2_K * src = (const block_q2_K*) data;
|
|
1196
|
+
block_q2_K dst_tmp[8];
|
|
1197
|
+
int nrow = ggml_nrows(t);
|
|
1198
|
+
int nblocks = t->ne[0] / QK_K;
|
|
1199
|
+
|
|
1200
|
+
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
|
|
1201
|
+
|
|
1202
|
+
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
1203
|
+
return -1;
|
|
1204
|
+
}
|
|
1205
|
+
|
|
1206
|
+
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
1207
|
+
for (int64_t x = 0; x < nblocks; x++) {
|
|
1208
|
+
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
1209
|
+
dst_tmp[i] = src[x + i * nblocks];
|
|
1210
|
+
}
|
|
1211
|
+
*dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
|
|
1212
|
+
}
|
|
1213
|
+
src += nrows_interleaved * nblocks;
|
|
1214
|
+
}
|
|
1215
|
+
return 0;
|
|
1216
|
+
|
|
1217
|
+
GGML_UNUSED(data_size);
|
|
1218
|
+
}
|
|
1219
|
+
|
|
978
1220
|
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
979
1221
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
|
980
1222
|
GGML_ASSERT(interleave_block == 8);
|
|
@@ -1095,6 +1337,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
|
|
|
1095
1337
|
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
|
|
1096
1338
|
}
|
|
1097
1339
|
|
|
1340
|
+
// Repack dispatch: route block_q2_K with 8x8 interleave to the Q2_K repacker.
template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
}
|
|
1343
|
+
|
|
1098
1344
|
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
1099
1345
|
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
|
|
1100
1346
|
}
|
|
@@ -1124,6 +1370,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
|
|
1124
1370
|
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
1125
1371
|
}
|
|
1126
1372
|
|
|
1373
|
+
// GEMV dispatch: route the q2_K 8x8 / q8_K combination to its kernel.
template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
|
|
1376
|
+
|
|
1127
1377
|
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
1128
1378
|
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1129
1379
|
}
|
|
@@ -1148,6 +1398,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
|
|
1148
1398
|
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
|
1149
1399
|
}
|
|
1150
1400
|
|
|
1401
|
+
// GEMM dispatch: route the q2_K 8x8 / q8_K combination to its kernel.
template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
|
|
1404
|
+
|
|
1151
1405
|
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
1152
1406
|
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
1153
1407
|
}
|
|
@@ -1421,6 +1675,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
1421
1675
|
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
|
|
1422
1676
|
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
|
|
1423
1677
|
|
|
1678
|
+
// instance for Q2
|
|
1679
|
+
static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
|
|
1680
|
+
|
|
1424
1681
|
// instance for IQ4
|
|
1425
1682
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
|
1426
1683
|
|
|
@@ -1446,6 +1703,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
1446
1703
|
return &q4_K_8x8_q8_K;
|
|
1447
1704
|
}
|
|
1448
1705
|
}
|
|
1706
|
+
} else if (cur->type == GGML_TYPE_Q2_K) {
|
|
1707
|
+
if (ggml_cpu_has_avx512()) {
|
|
1708
|
+
if (cur->ne[1] % 8 == 0) {
|
|
1709
|
+
return &q2_K_8x8_q8_K;
|
|
1710
|
+
}
|
|
1711
|
+
}
|
|
1449
1712
|
} else if (cur->type == GGML_TYPE_IQ4_NL) {
|
|
1450
1713
|
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
1451
1714
|
if (cur->ne[1] % 4 == 0) {
|
|
@@ -44,7 +44,14 @@ struct block_q4_Kx8 {
|
|
|
44
44
|
};
|
|
45
45
|
|
|
46
46
|
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
|
|
47
|
+
// Eight Q2_K super-blocks interleaved column-wise, the unit consumed by the
// q2_K 8x8 repacked GEMV/GEMM kernels.
struct block_q2_Kx8 {
    ggml_half d[8];      // super-block scale for quantized scales
    ggml_half dmin[8];   // super-block scale for quantized mins
    uint8_t scales[128]; // scales and mins, quantized with 4 bits
    uint8_t qs[512];     // 2-bit quants
};

// 8 halves d + 8 halves dmin + 128 scale bytes (QK_K/2) + 512 quant bytes (QK_K*2).
static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
|
|
48
55
|
struct block_q8_Kx4 {
|
|
49
56
|
float d[4]; // delta
|
|
50
57
|
int8_t qs[QK_K * 4]; // quants
|
|
@@ -71,11 +78,13 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
71
78
|
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
72
79
|
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
73
80
|
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
81
|
+
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
74
82
|
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
75
83
|
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
76
84
|
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
77
85
|
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
78
86
|
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
87
|
+
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
79
88
|
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
80
89
|
|
|
81
90
|
// Native implementations
|
|
@@ -86,11 +95,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
86
95
|
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
87
96
|
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
88
97
|
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
98
|
+
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
89
99
|
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
90
100
|
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
91
101
|
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
92
102
|
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
93
103
|
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
104
|
+
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
94
105
|
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
95
106
|
|
|
96
107
|
#if defined(__cplusplus)
|
|
@@ -55,7 +55,22 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
|
|
|
55
55
|
|
|
56
56
|
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
|
57
57
|
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
|
58
|
-
|
|
58
|
+
|
|
59
|
+
// Elementwise addition: z[i] = x[i] + y[i] for i in [0, n).
// Uses an AVX2 8-wide main loop with a scalar tail where available;
// otherwise the scalar loop handles everything.
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    int idx = 0;
#if defined(__AVX2__)
    // Process eight floats per iteration while a full vector remains.
    while (idx + 8 <= n) {
        const __m256 a = _mm256_loadu_ps(x + idx);
        const __m256 b = _mm256_loadu_ps(y + idx);
        _mm256_storeu_ps(z + idx, _mm256_add_ps(a, b));
        idx += 8;
    }
#endif
    // Scalar tail (or the whole range when AVX2 is unavailable).
    while (idx < n) {
        z[idx] = x[idx] + y[idx];
        idx++;
    }
}
|
|
73
|
+
|
|
59
74
|
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
|
|
60
75
|
for (int i = 0; i < n; ++i) {
|
|
61
76
|
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
|
|
@@ -992,9 +1007,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
|
|
|
992
1007
|
|
|
993
1008
|
// SwiGLU activation on fp16 vectors: y[i] = silu(x[i]) * g[i], where
// silu(v) = v / (1 + exp(-v)). The math is done in float and the result
// is rounded back to fp16.
inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
    }
}
|
|
1000
1015
|
|
|
@@ -152,6 +152,7 @@ extern "C" {
|
|
|
152
152
|
//LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
|
|
153
153
|
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
|
154
154
|
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
|
155
|
+
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
|
155
156
|
|
|
156
157
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
|
157
158
|
};
|
|
@@ -284,10 +285,11 @@ extern "C" {
|
|
|
284
285
|
const struct llama_model_kv_override * kv_overrides;
|
|
285
286
|
|
|
286
287
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
|
287
|
-
bool vocab_only;
|
|
288
|
-
bool use_mmap;
|
|
289
|
-
bool use_mlock;
|
|
290
|
-
bool check_tensors;
|
|
288
|
+
bool vocab_only; // only load the vocabulary, no weights
|
|
289
|
+
bool use_mmap; // use mmap if possible
|
|
290
|
+
bool use_mlock; // force system to keep model in RAM
|
|
291
|
+
bool check_tensors; // validate model tensor data
|
|
292
|
+
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
|
291
293
|
};
|
|
292
294
|
|
|
293
295
|
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
|
@@ -537,6 +539,9 @@ extern "C" {
|
|
|
537
539
|
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
|
|
538
540
|
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
|
|
539
541
|
|
|
542
|
+
// Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
|
|
543
|
+
LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
|
|
544
|
+
|
|
540
545
|
// Returns 0 on success
|
|
541
546
|
LLAMA_API uint32_t llama_model_quantize(
|
|
542
547
|
const char * fname_inp,
|
|
@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
|
62
62
|
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
|
63
63
|
{ LLM_ARCH_CHATGLM, "chatglm" },
|
|
64
64
|
{ LLM_ARCH_GLM4, "glm4" },
|
|
65
|
+
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
|
|
65
66
|
{ LLM_ARCH_BITNET, "bitnet" },
|
|
66
67
|
{ LLM_ARCH_T5, "t5" },
|
|
67
68
|
{ LLM_ARCH_T5ENCODER, "t5encoder" },
|
|
@@ -85,10 +86,13 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
|
85
86
|
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
|
|
86
87
|
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
|
|
87
88
|
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
|
|
89
|
+
{ LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
|
|
88
90
|
{ LLM_ARCH_SMOLLM3, "smollm3" },
|
|
91
|
+
{ LLM_ARCH_OPENAI_MOE, "gpt-oss" },
|
|
89
92
|
{ LLM_ARCH_LFM2, "lfm2" },
|
|
90
93
|
{ LLM_ARCH_DREAM, "dream" },
|
|
91
94
|
{ LLM_ARCH_SMALLTHINKER, "smallthinker" },
|
|
95
|
+
{ LLM_ARCH_LLADA, "llada" },
|
|
92
96
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
|
93
97
|
};
|
|
94
98
|
|
|
@@ -125,6 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
|
125
129
|
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
|
|
126
130
|
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
|
|
127
131
|
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
|
|
132
|
+
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
|
|
128
133
|
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
|
129
134
|
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
|
130
135
|
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
|
@@ -1389,6 +1394,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
|
1389
1394
|
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
|
1390
1395
|
},
|
|
1391
1396
|
},
|
|
1397
|
+
{
|
|
1398
|
+
LLM_ARCH_GLM4_MOE,
|
|
1399
|
+
{
|
|
1400
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
1401
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
1402
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
1403
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
1404
|
+
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
|
1405
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
1406
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
1407
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
1408
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
1409
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
|
1410
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
|
1411
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
1412
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
1413
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
1414
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
|
1415
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
|
1416
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
|
1417
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
|
1418
|
+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
|
1419
|
+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
|
1420
|
+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
|
1421
|
+
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
|
1422
|
+
// NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
|
|
1423
|
+
{ LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
|
|
1424
|
+
{ LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
|
|
1425
|
+
{ LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
|
|
1426
|
+
{ LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
|
|
1427
|
+
{ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
|
|
1428
|
+
{ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
|
|
1429
|
+
},
|
|
1430
|
+
},
|
|
1392
1431
|
{
|
|
1393
1432
|
LLM_ARCH_BITNET,
|
|
1394
1433
|
{
|
|
@@ -1896,6 +1935,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
|
1896
1935
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
|
1897
1936
|
},
|
|
1898
1937
|
},
|
|
1938
|
+
{
|
|
1939
|
+
LLM_ARCH_HUNYUAN_DENSE,
|
|
1940
|
+
{
|
|
1941
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
1942
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
1943
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
1944
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
1945
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
1946
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
|
1947
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
1948
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
|
1949
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
1950
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
1951
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
1952
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
1953
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
1954
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
1955
|
+
|
|
1956
|
+
},
|
|
1957
|
+
},
|
|
1899
1958
|
{
|
|
1900
1959
|
LLM_ARCH_SMOLLM3,
|
|
1901
1960
|
{
|
|
@@ -1913,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
|
1913
1972
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
1914
1973
|
},
|
|
1915
1974
|
},
|
|
1975
|
+
{
|
|
1976
|
+
LLM_ARCH_OPENAI_MOE,
|
|
1977
|
+
{
|
|
1978
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
1979
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
1980
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
1981
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
1982
|
+
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
|
1983
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
1984
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
1985
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
1986
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
1987
|
+
{ LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" },
|
|
1988
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
|
1989
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
|
1990
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
|
1991
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
|
1992
|
+
},
|
|
1993
|
+
},
|
|
1916
1994
|
{
|
|
1917
1995
|
LLM_ARCH_LFM2,
|
|
1918
1996
|
{
|
|
@@ -1972,6 +2050,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
|
1972
2050
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
1973
2051
|
},
|
|
1974
2052
|
},
|
|
2053
|
+
{
|
|
2054
|
+
LLM_ARCH_LLADA,
|
|
2055
|
+
{
|
|
2056
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
2057
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
2058
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
2059
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
2060
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
2061
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
2062
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
2063
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
2064
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
2065
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
2066
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
2067
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
2068
|
+
},
|
|
2069
|
+
},
|
|
1975
2070
|
{
|
|
1976
2071
|
LLM_ARCH_UNKNOWN,
|
|
1977
2072
|
{
|
|
@@ -2011,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|
|
2011
2106
|
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
|
2012
2107
|
{LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
|
2013
2108
|
{LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
|
2109
|
+
{LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
|
|
2014
2110
|
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
|
2015
2111
|
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
|
2016
2112
|
{LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
|
@@ -2142,6 +2238,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|
|
2142
2238
|
{LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
|
|
2143
2239
|
{LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
|
2144
2240
|
{LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
|
2241
|
+
// NextN/MTP tensors are currently ignored (reserved for future MTP support)
|
|
2242
|
+
// These tensors only exist in the last layer(s) and are treated as output tensors
|
|
2243
|
+
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
|
2244
|
+
{LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
|
|
2245
|
+
{LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
|
|
2246
|
+
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
|
2247
|
+
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
|
2248
|
+
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
|
2145
2249
|
};
|
|
2146
2250
|
|
|
2147
2251
|
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
|
@@ -2224,6 +2328,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
|
|
|
2224
2328
|
bool llm_arch_is_diffusion(const llm_arch & arch) {
|
|
2225
2329
|
switch (arch) {
|
|
2226
2330
|
case LLM_ARCH_DREAM:
|
|
2331
|
+
case LLM_ARCH_LLADA:
|
|
2227
2332
|
return true;
|
|
2228
2333
|
default:
|
|
2229
2334
|
return false;
|
|
@@ -66,6 +66,7 @@ enum llm_arch {
|
|
|
66
66
|
LLM_ARCH_DEEPSEEK2,
|
|
67
67
|
LLM_ARCH_CHATGLM,
|
|
68
68
|
LLM_ARCH_GLM4,
|
|
69
|
+
LLM_ARCH_GLM4_MOE,
|
|
69
70
|
LLM_ARCH_BITNET,
|
|
70
71
|
LLM_ARCH_T5,
|
|
71
72
|
LLM_ARCH_T5ENCODER,
|
|
@@ -89,10 +90,13 @@ enum llm_arch {
|
|
|
89
90
|
LLM_ARCH_ERNIE4_5,
|
|
90
91
|
LLM_ARCH_ERNIE4_5_MOE,
|
|
91
92
|
LLM_ARCH_HUNYUAN_MOE,
|
|
93
|
+
LLM_ARCH_HUNYUAN_DENSE,
|
|
92
94
|
LLM_ARCH_SMOLLM3,
|
|
95
|
+
LLM_ARCH_OPENAI_MOE,
|
|
93
96
|
LLM_ARCH_LFM2,
|
|
94
97
|
LLM_ARCH_DREAM,
|
|
95
98
|
LLM_ARCH_SMALLTHINKER,
|
|
99
|
+
LLM_ARCH_LLADA,
|
|
96
100
|
LLM_ARCH_UNKNOWN,
|
|
97
101
|
};
|
|
98
102
|
|
|
@@ -129,6 +133,7 @@ enum llm_kv {
|
|
|
129
133
|
LLM_KV_EXPERT_WEIGHTS_NORM,
|
|
130
134
|
LLM_KV_EXPERT_GATING_FUNC,
|
|
131
135
|
LLM_KV_MOE_EVERY_N_LAYERS,
|
|
136
|
+
LLM_KV_NEXTN_PREDICT_LAYERS,
|
|
132
137
|
LLM_KV_POOLING_TYPE,
|
|
133
138
|
LLM_KV_LOGIT_SCALE,
|
|
134
139
|
LLM_KV_DECODER_START_TOKEN_ID,
|
|
@@ -261,6 +266,7 @@ enum llm_tensor {
|
|
|
261
266
|
LLM_TENSOR_ATTN_OUT_NORM,
|
|
262
267
|
LLM_TENSOR_ATTN_POST_NORM,
|
|
263
268
|
LLM_TENSOR_ATTN_ROT_EMBD,
|
|
269
|
+
LLM_TENSOR_ATTN_SINKS,
|
|
264
270
|
LLM_TENSOR_FFN_GATE_INP,
|
|
265
271
|
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
|
266
272
|
LLM_TENSOR_FFN_NORM,
|
|
@@ -407,6 +413,12 @@ enum llm_tensor {
|
|
|
407
413
|
LLM_TENSOR_SHORTCONV_CONV,
|
|
408
414
|
LLM_TENSOR_SHORTCONV_INPROJ,
|
|
409
415
|
LLM_TENSOR_SHORTCONV_OUTPROJ,
|
|
416
|
+
LLM_TENSOR_NEXTN_EH_PROJ,
|
|
417
|
+
LLM_TENSOR_NEXTN_EMBED_TOKENS,
|
|
418
|
+
LLM_TENSOR_NEXTN_ENORM,
|
|
419
|
+
LLM_TENSOR_NEXTN_HNORM,
|
|
420
|
+
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
|
|
421
|
+
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
|
|
410
422
|
};
|
|
411
423
|
|
|
412
424
|
enum llm_tensor_layer {
|