@fugood/llama.node 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +60 -7
- package/src/llama.cpp/common/chat.cpp +6 -6
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +14 -5
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/include/llama.h +8 -4
- package/src/llama.cpp/src/llama-arch.cpp +40 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +20 -1
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +11 -2
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -139
- package/src/llama.cpp/src/llama-graph.h +31 -32
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +400 -21
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h

@@ -37,17 +37,21 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@@ -72,11 +76,13 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__loongarch64)
 // quants.c
@@ -92,11 +98,13 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__riscv)
 // quants.c
@@ -119,10 +127,12 @@
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__s390x__)
 // quants.c
@@ -147,11 +157,13 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__wasm__)
 // quants.c
@@ -175,10 +187,12 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #endif
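
Aside (illustrative, not part of the package diff): the _generic defines above follow ggml's usual fallback pattern. On targets without a hand-tuned kernel, the generic name is remapped to the public symbol, so the portable implementation in repack.cpp is compiled as the kernel itself. A minimal sketch of that reading, using a hypothetical feature macro in place of the per-architecture #elif branches:

/* Sketch only; HAVE_TUNED_Q2_K_KERNEL is a hypothetical stand-in for the
 * architecture checks in arch-fallback.h. */
#if !defined(HAVE_TUNED_Q2_K_KERNEL)
// rename the generic body so it provides the public symbol on fallback targets
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#endif

// repack.cpp always defines this function under the "_generic" name; after the
// macro expands it becomes ggml_gemv_q2_K_8x8_q8_K on fallback targets, while
// architectures with their own kernel keep the two symbols distinct.
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * s, size_t bs,
                                     const void * vx, const void * vy,
                                     int nr, int nc);
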
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp

@@ -412,6 +412,82 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
         }
     }
 }
+
+void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK_K;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+    float sumf[8];
+    float sum_minf[8];
+    int sumi1,sumi2,sumi3,sumi4;
+    int sumi;
+
+    const block_q8_K * a_ptr = (const block_q8_K *)vy;
+    for(int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+            sum_minf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (4 * blocklen)); k++) {
+                const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
+                const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
+                const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
+                const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi1 = 0;
+                    sumi2 = 0;
+                    sumi3 = 0;
+                    sumi4 = 0;
+                    sumi = 0;
+                    int offset = ((k / 2) % 2) + j * 2;
+                    for (int i = 0; i < blocklen; ++i){
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
+                        const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
+                        const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
+                        const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
+                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
+                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
+                        sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
+                        sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
+
+                        sumi1 = sumi1 * (scales_0[offset] & 0xF);
+                        sumi2 = sumi2 * (scales_1[offset] & 0xF);
+                        sumi3 = sumi3 * (scales_2[offset] & 0xF);
+                        sumi4 = sumi4 * (scales_3[offset] & 0xF);
+                        sumi += sumi1 + sumi2 + sumi3 + sumi4;
+                    }
+                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+                }
+            }
+            for(int sb = 0; sb < 8; sb++) {
+                const uint8_t *mins = b_ptr[l].scales + sb * 16;
+                for(int j = 0; j < ncols_interleaved; j++){
+                    sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
+        }
+    }
+}
+
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -711,6 +787,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
         }
     }
 }
+
+void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK_K;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+    float sumf[4][8];
+    float sum_minf[4][8];
+    int sumi1, sumi2, sumi3, sumi4;
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                    sum_minf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (4 * blocklen)); k++) {
+
+                    const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
+                    const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
+                    const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
+                    const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi1 = 0;
+                            sumi2 = 0;
+                            sumi3 = 0;
+                            sumi4 = 0;
+                            sumi = 0;
+                            int offset = ((k / 2) % 2) + j * 2;
+                            for (int i = 0; i < blocklen; ++i){
+                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
+                                const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
+                                const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
+                                const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
+                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
+                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
+                                sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
+                                sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
+                                sumi1 = sumi1 * (scales_0[offset] & 0xF);
+                                sumi2 = sumi2 * (scales_1[offset] & 0xF);
+                                sumi3 = sumi3 * (scales_2[offset] & 0xF);
+                                sumi4 = sumi4 * (scales_3[offset] & 0xF);
+                                sumi += sumi1 + sumi2 + sumi3 + sumi4;
+                            }
+                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+                        }
+                    }
+                }
+                for(int sb = 0; sb < 8; sb++) {
+                    const uint8_t *mins = b_ptr[l].scales + sb * 16;
+                    for(int m = 0; m < 4; m++) {
+                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
+                        for(int j = 0; j < ncols_interleaved; j++) {
+                            int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
+                            sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+                        }
+                    }
+                }
+            }
+
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
+                }
+            }
+        }
+    }
+}
+
+
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -914,6 +1081,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
     return out;
 }
 
+static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
+    block_q2_Kx8 out;
+
+    // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
+    }
+
+    for (int i = 0; i < 8; i++) {
+        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
+    }
+
+    const int end = QK_K * 2 / blck_size_interleave;
+
+    // Interleave Q2_K quants by taking 8 bytes at a time
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 8;
+        int src_offset = (i / 8) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
+
+        uint64_t elems;
+        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+    }
+
+    // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
+    // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
+    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
+    // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
+    // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
+
+    for(int i = 0; i < 128; i++){
+
+        // Index for selecting which q2k super block
+        int src1 = (i % 16) / 2;
+        // Index for selecting scale
+        int src2 = ((i / 16) * 2) + (i % 2);
+
+        out.scales[i] = in[src1].scales[src2];
+    }
+    return out;
+
+}
+
 static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
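
Aside (illustrative, not part of the package diff): the scales/mins rearrangement above can be checked by expanding the index math. Each 16-byte group g of out.scales ends up holding scale/min bytes 2g and 2g+1 from all eight interleaved super-blocks, which is what the in-code comment describes. A small standalone program that reproduces only the index mapping:

#include <stdio.h>

// Sketch that mirrors the index arithmetic of make_block_q2_Kx8 above;
// it is not code from the package and touches no real block data.
int main(void) {
    for (int i = 0; i < 32; i++) {                // first two 16-byte groups
        int src1 = (i % 16) / 2;                  // source super-block, 0..7
        int src2 = ((i / 16) * 2) + (i % 2);      // scale/min byte within that block
        printf("out.scales[%3d] = in[%d].scales[%d]\n", i, src1, src2);
    }
    // Group g (bytes 16*g .. 16*g+15) therefore holds scale bytes 2g and 2g+1
    // of each of the eight source blocks, interleaved by source block.
    return 0;
}
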
@@ -975,6 +1186,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_UNUSED(data_size);
 }
 
+static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
+    GGML_ASSERT(interleave_block == 8);
+    constexpr int nrows_interleaved = 8;
+
+    block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
+    const block_q2_K * src = (const block_q2_K*) data;
+    block_q2_K dst_tmp[8];
+    int nrow = ggml_nrows(t);
+    int nblocks = t->ne[0] / QK_K;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
+
+    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++ ) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 8);
@@ -1095,6 +1337,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
     return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
 }
 
+template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
+}
+
 template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
 }
@@ -1124,6 +1370,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
     ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
 template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
@@ -1148,6 +1398,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
     ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
 template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
@@ -1421,6 +1675,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
     static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
 
+    // instance for Q2
+    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
+
     // instance for IQ4
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
 
@@ -1446,6 +1703,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                 return &q4_K_8x8_q8_K;
             }
         }
+    } else if (cur->type == GGML_TYPE_Q2_K) {
+        if (ggml_cpu_has_avx512()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &q2_K_8x8_q8_K;
+            }
+        }
     } else if (cur->type == GGML_TYPE_IQ4_NL) {
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
package/src/llama.cpp/ggml/src/ggml-cpu/repack.h

@@ -44,7 +44,14 @@ struct block_q4_Kx8 {
 };
 
 static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
+struct block_q2_Kx8 {
+    ggml_half d[8];      // super-block scale for quantized scales
+    ggml_half dmin[8];   // super-block scale for quantized mins
+    uint8_t scales[128]; // scales and mins, quantized with 4 bits
+    uint8_t qs[512];     // 2--bit quants
+};
 
+static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
 struct block_q8_Kx4 {
     float d[4];          // delta
     int8_t qs[QK_K * 4]; // quants
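
Aside (illustrative, not part of the package diff): the static_assert arithmetic works out to 672 bytes per interleaved block, assuming QK_K == 256 and a 2-byte ggml_half as in upstream llama.cpp: 16 halves (32 bytes) + 128 scale/min bytes (QK_K/2) + 512 quant bytes (QK_K*2). A self-contained size check using a stand-in half type:

#include <assert.h>
#include <stdint.h>

// Assumptions for this sketch: QK_K is 256 and ggml_half is a 16-bit type.
#define QK_K 256
typedef uint16_t ggml_half_t; // stand-in for ggml_half

struct block_q2_Kx8_sketch {
    ggml_half_t d[8];        // 16 bytes: super-block scales for the quantized scales
    ggml_half_t dmin[8];     // 16 bytes: super-block scales for the quantized mins
    uint8_t     scales[128]; // QK_K/2: 4-bit scales and mins of 8 interleaved blocks
    uint8_t     qs[512];     // QK_K*2: 2-bit quants of 8 interleaved blocks
};

static_assert(sizeof(struct block_q2_Kx8_sketch) ==
              sizeof(ggml_half_t) * 16 + QK_K / 2 + QK_K * 2,
              "wrong q2_Kx8 sketch size"); // 32 + 128 + 512 = 672 bytes
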
@@ -71,11 +78,13 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // Native implementations
@@ -86,11 +95,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 #if defined(__cplusplus)
package/src/llama.cpp/include/llama.h

@@ -284,10 +284,11 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;
-        bool use_mmap;
-        bool use_mlock;
-        bool check_tensors;
+        bool vocab_only;      // only load the vocabulary, no weights
+        bool use_mmap;        // use mmap if possible
+        bool use_mlock;       // force system to keep model in RAM
+        bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
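
Aside (usage sketch, not part of the package diff): use_extra_bufts is the only new field here; turning it off at load time skips the extra buffer types used for weight repacking, such as the Q2_K 8x8 CPU layout added in this release. A sketch against the public llama.h API (the flag's default value is not verified here):

#include "llama.h"

// Load a model with weight repacking disabled; use_extra_bufts is the field
// added in the hunk above, the rest is the existing public API.
static struct llama_model * load_without_repacking(const char * path) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.use_extra_bufts = false; // opt out of extra (repacked) buffer types
    return llama_model_load_from_file(path, mparams);
}
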
@@ -537,6 +538,9 @@ extern "C" {
     // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
     LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
 
+    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
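
Aside (usage sketch, not part of the package diff): the new predicate lets callers branch on diffusion architectures (LLaDA, Dream per this diff) before choosing a decoding strategy. Minimal sketch:

#include <stdio.h>
#include "llama.h"

// llama_model_is_diffusion is the predicate declared in the hunk above.
static void print_decoding_mode(const struct llama_model * model) {
    if (llama_model_is_diffusion(model)) {
        printf("diffusion model: decode iteratively over masked positions\n");
    } else {
        printf("autoregressive model: decode token by token\n");
    }
}
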
package/src/llama.cpp/src/llama-arch.cpp

@@ -85,10 +85,12 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ERNIE4_5, "ernie4_5" },
     { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+    { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
     { LLM_ARCH_SMOLLM3, "smollm3" },
     { LLM_ARCH_LFM2, "lfm2" },
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
+    { LLM_ARCH_LLADA, "llada" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -1896,6 +1898,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_HUNYUAN_DENSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+
+        },
+    },
     {
         LLM_ARCH_SMOLLM3,
         {
@@ -1972,6 +1994,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2224,6 +2263,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
 bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
             return true;
         default:
             return false;
package/src/llama.cpp/src/llama-arch.h

@@ -89,10 +89,12 @@ enum llm_arch {
     LLM_ARCH_ERNIE4_5,
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_HUNYUAN_DENSE,
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
+    LLM_ARCH_LLADA,
     LLM_ARCH_UNKNOWN,
 };
 
package/src/llama.cpp/src/llama-batch.cpp

@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
             if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                 return false;
             }
         }
package/src/llama.cpp/src/llama-chat.cpp

@@ -66,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
     { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+    { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
 };
 
@@ -193,6 +194,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_DOTS1;
     } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     }
@@ -698,11 +701,27 @@ int32_t llm_chat_apply_template(
             if (role == "system") {
                 ss << "<|startoftext|>" << message->content << "<|extra_4|>";
             } else if (role == "assistant") {
-                ss <<
+                ss << message->content << "<|eos|>";
             } else {
                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
+        // tencent/Hunyuan-4B-Instruct
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (i == 0) {
+                if (role == "system") {
+                    ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                }
+            }
+
+            if (role == "assistant") {
+                ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
+            } else if (role == "user") {
+                ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
+            }
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
         // moonshotai/Kimi-K2-Instruct
         for (auto message : chat) {
package/src/llama.cpp/src/llama-context.cpp

@@ -105,7 +105,7 @@ llama_context::llama_context(
 
     {
        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) :
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -113,6 +113,15 @@ llama_context::llama_context(
         }
     }
 
+    {
+        const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+        graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+        if (graph_reuse_disable) {
+            LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+        }
+    }
+
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
@@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
     // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
     const auto gparams = graph_params(res, ubatch, mctx, gtype);
 
-    if (res->can_reuse(gparams)) {
+    if (!graph_reuse_disable && res->can_reuse(gparams)) {
         //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
 
         n_reused++;