@fugood/llama.node 1.1.3 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/CMakeLists.txt +3 -0
  2. package/lib/binding.ts +8 -0
  3. package/package.json +14 -14
  4. package/src/LlamaCompletionWorker.cpp +45 -5
  5. package/src/LlamaContext.cpp +3 -0
  6. package/src/llama.cpp/common/arg.cpp +60 -7
  7. package/src/llama.cpp/common/chat.cpp +6 -6
  8. package/src/llama.cpp/common/common.cpp +1 -0
  9. package/src/llama.cpp/common/common.h +14 -5
  10. package/src/llama.cpp/common/speculative.cpp +135 -54
  11. package/src/llama.cpp/common/speculative.h +8 -1
  12. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  17. package/src/llama.cpp/include/llama.h +8 -4
  18. package/src/llama.cpp/src/llama-arch.cpp +40 -0
  19. package/src/llama.cpp/src/llama-arch.h +2 -0
  20. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  21. package/src/llama.cpp/src/llama-chat.cpp +20 -1
  22. package/src/llama.cpp/src/llama-chat.h +1 -0
  23. package/src/llama.cpp/src/llama-context.cpp +11 -2
  24. package/src/llama.cpp/src/llama-context.h +4 -1
  25. package/src/llama.cpp/src/llama-graph.cpp +57 -139
  26. package/src/llama.cpp/src/llama-graph.h +31 -32
  27. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +2 -2
  28. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  29. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  30. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  31. package/src/llama.cpp/src/llama-model.cpp +400 -21
  32. package/src/llama.cpp/src/llama-quant.cpp +3 -3
  33. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  34. package/src/llama.cpp/src/llama-vocab.h +1 -0
@@ -37,17 +37,21 @@
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
  #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
  // repack.cpp
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@@ -72,11 +76,13 @@
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
  #elif defined(__loongarch64)
  // quants.c
@@ -92,11 +98,13 @@
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
  #elif defined(__riscv)
  // quants.c
@@ -119,10 +127,12 @@
  #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
  #elif defined(__s390x__)
  // quants.c
@@ -147,11 +157,13 @@
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
  #elif defined(__wasm__)
  // quants.c
@@ -175,10 +187,12 @@
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+ #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+ #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
  #endif
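All of the hunks above edit the same per-architecture fallback table in arch-fallback.h: on targets that have no hand-written kernel for the new Q2_K repacked layout, the `_generic` name is aliased to the public symbol, so the portable implementation in repack.cpp is what callers end up linking against. A minimal sketch of the mechanism (illustrative, not the file's literal structure):

    // On an architecture without a specialized q2_K kernel:
    #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K

    // ...so the portable definition from repack.cpp is compiled under the public name,
    // and there is always exactly one definition of ggml_gemv_q2_K_8x8_q8_K to link against:
    void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * s, size_t bs,
                                         const void * vx, const void * vy, int nr, int nc);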
@@ -412,6 +412,82 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
      }
  }

+ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+     const int qk = QK_K;
+     const int nb = n / qk;
+     const int ncols_interleaved = 8;
+     const int blocklen = 8;
+
+     assert (n % qk == 0);
+     assert (nc % ncols_interleaved == 0);
+
+     UNUSED(s);
+     UNUSED(bs);
+     UNUSED(vx);
+     UNUSED(vy);
+     UNUSED(nr);
+     UNUSED(nc);
+     UNUSED(nb);
+     UNUSED(ncols_interleaved);
+     UNUSED(blocklen);
+
+     float sumf[8];
+     float sum_minf[8];
+     int sumi1,sumi2,sumi3,sumi4;
+     int sumi;
+
+     const block_q8_K * a_ptr = (const block_q8_K *)vy;
+     for(int x = 0; x < nc / ncols_interleaved; x++) {
+         const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
+         for (int j = 0; j < ncols_interleaved; j++) {
+             sumf[j] = 0.0;
+             sum_minf[j] = 0.0;
+         }
+         for (int l = 0; l < nb; l++) {
+             for (int k = 0; k < (qk / (4 * blocklen)); k++) {
+                 const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
+                 const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
+                 const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
+                 const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
+                 for (int j = 0; j < ncols_interleaved; j++) {
+                     sumi1 = 0;
+                     sumi2 = 0;
+                     sumi3 = 0;
+                     sumi4 = 0;
+                     sumi = 0;
+                     int offset = ((k / 2) % 2) + j * 2;
+                     for (int i = 0; i < blocklen; ++i){
+                         const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
+                         const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
+                         const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
+                         const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
+                         sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
+                         sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
+                         sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
+                         sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
+
+                         sumi1 = sumi1 * (scales_0[offset] & 0xF);
+                         sumi2 = sumi2 * (scales_1[offset] & 0xF);
+                         sumi3 = sumi3 * (scales_2[offset] & 0xF);
+                         sumi4 = sumi4 * (scales_3[offset] & 0xF);
+                         sumi += sumi1 + sumi2 + sumi3 + sumi4;
+                     }
+                     sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+                 }
+             }
+             for(int sb = 0; sb < 8; sb++) {
+                 const uint8_t *mins = b_ptr[l].scales + sb * 16;
+                 for(int j = 0; j < ncols_interleaved; j++){
+                     sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+                 }
+             }
+         }
+         for (int j = 0; j < ncols_interleaved; j++) {
+             s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
+         }
+     }
+ }
+
  void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
      const int qk = QK8_0;
      const int nb = n / qk;
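For orientation, the arithmetic in the generic Q2_K GEMV above reads as follows (a sketch, assuming QK_K == 256 as in ggml; the helper below is illustrative and not part of the package): every byte of b_ptr[l].qs packs four 2-bit weights at bit offsets 0/2/4/6, each sub-block's byte in scales[] packs a 4-bit scale (low nibble) and a 4-bit min (high nibble), and the result is accumulated as the weighted integer dot product times the super-block scale d, minus the min contribution formed from the activation block sums (bsums) times dmin.

    // Per-byte unpacking mirrored by v0..v3 above (illustrative helper).
    static inline void unpack_q2(uint8_t byte, int out[4]) {
        out[0] = (byte >> 0) & 3;   // v0
        out[1] = (byte >> 2) & 3;   // v1
        out[2] = (byte >> 4) & 3;   // v2
        out[3] = (byte >> 6) & 3;   // v3
    }
    // With blocklen == 8 and QK_K == 256, the k loop runs qk / (4 * blocklen) == 8 times,
    // i.e. each iteration consumes 8 bytes (32 two-bit weights) per interleaved column.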
@@ -711,6 +787,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
      }
  }

+ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+     const int qk = QK_K;
+     const int nb = n / qk;
+     const int ncols_interleaved = 8;
+     const int blocklen = 8;
+
+     assert (n % qk == 0);
+     assert (nr % 4 == 0);
+     assert (nc % ncols_interleaved == 0);
+
+     UNUSED(s);
+     UNUSED(bs);
+     UNUSED(vx);
+     UNUSED(vy);
+     UNUSED(nr);
+     UNUSED(nc);
+     UNUSED(nb);
+     UNUSED(ncols_interleaved);
+     UNUSED(blocklen);
+
+     float sumf[4][8];
+     float sum_minf[4][8];
+     int sumi1, sumi2, sumi3, sumi4;
+     int sumi;
+
+     for (int y = 0; y < nr / 4; y++) {
+         const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+         for (int x = 0; x < nc / ncols_interleaved; x++) {
+             const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
+             for (int m = 0; m < 4; m++) {
+                 for (int j = 0; j < ncols_interleaved; j++) {
+                     sumf[m][j] = 0.0;
+                     sum_minf[m][j] = 0.0;
+                 }
+             }
+             for (int l = 0; l < nb; l++) {
+                 for (int k = 0; k < (qk / (4 * blocklen)); k++) {
+
+                     const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
+                     const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
+                     const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
+                     const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
+                     for (int m = 0; m < 4; m++) {
+                         for (int j = 0; j < ncols_interleaved; j++) {
+                             sumi1 = 0;
+                             sumi2 = 0;
+                             sumi3 = 0;
+                             sumi4 = 0;
+                             sumi = 0;
+                             int offset = ((k / 2) % 2) + j * 2;
+                             for (int i = 0; i < blocklen; ++i){
+                                 const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
+                                 const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
+                                 const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
+                                 const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
+                                 sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
+                                 sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
+                                 sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
+                                 sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
+                                 sumi1 = sumi1 * (scales_0[offset] & 0xF);
+                                 sumi2 = sumi2 * (scales_1[offset] & 0xF);
+                                 sumi3 = sumi3 * (scales_2[offset] & 0xF);
+                                 sumi4 = sumi4 * (scales_3[offset] & 0xF);
+                                 sumi += sumi1 + sumi2 + sumi3 + sumi4;
+                             }
+                             sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+                         }
+                     }
+                 }
+                 for(int sb = 0; sb < 8; sb++) {
+                     const uint8_t *mins = b_ptr[l].scales + sb * 16;
+                     for(int m = 0; m < 4; m++) {
+                         const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
+                         for(int j = 0; j < ncols_interleaved; j++) {
+                             int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
+                             sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+                         }
+                     }
+                 }
+             }
+
+             for (int m = 0; m < 4; m++) {
+                 for (int j = 0; j < ncols_interleaved; j++) {
+                     s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
+                 }
+             }
+         }
+     }
+ }
+
+
  void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
      const int qk = QK8_0;
      const int nb = n / qk;
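The GEMM variant applies the same per-element math to a 4x8 output tile: each block_q8_Kx4 carries four rows of activations, so every (y, x) iteration fills sumf[4][8] and scatters it into the output with row stride bs. A sketch of the tile it writes (just a restatement of the store at the end of the loop above):

    // One (y, x) iteration of the GEMM covers:
    //   rows  y*4 .. y*4+3   (m = 0..3, from block_q8_Kx4)
    //   cols  x*8 .. x*8+7   (j = 0..7, from block_q2_Kx8)
    // s is row-major with leading dimension bs, hence:
    //   s[(y * 4 + m) * bs + x * 8 + j] = sumf[m][j] - sum_minf[m][j];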
@@ -914,6 +1081,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
      return out;
  }

+ static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
+     block_q2_Kx8 out;
+
+     // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
+     for (int i = 0; i < 8; i++) {
+         out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
+     }
+
+     for (int i = 0; i < 8; i++) {
+         out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
+     }
+
+     const int end = QK_K * 2 / blck_size_interleave;
+
+     // Interleave Q2_K quants by taking 8 bytes at a time
+     for (int i = 0; i < end; ++i) {
+         int src_id = i % 8;
+         int src_offset = (i / 8) * blck_size_interleave;
+         int dst_offset = i * blck_size_interleave;
+
+         uint64_t elems;
+         memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+         memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+     }
+
+     // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
+     // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
+     // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
+     // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
+     // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
+
+     for(int i = 0; i < 128; i++){
+
+         // Index for selecting which q2k super block
+         int src1 = (i % 16) / 2;
+         // Index for selecting scale
+         int src2 = ((i / 16) * 2) + (i % 2);
+
+         out.scales[i] = in[src1].scales[src2];
+     }
+     return out;
+
+ }
+
  static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
      GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
      GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
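The scale/min remapping in make_block_q2_Kx8 is easiest to see by evaluating the two index expressions for the first few output bytes. The standalone check below (not part of the package) prints the mapping and confirms the comment in the diff: the first 16 output bytes carry the scale/min bytes of sub-blocks 0 and 1 for each of the eight source super-blocks.

    #include <cstdio>

    int main() {
        for (int i = 0; i < 16; i++) {               // first 16 of the 128 output bytes
            int src1 = (i % 16) / 2;                 // source super-block: 0,0,1,1,...,7,7
            int src2 = ((i / 16) * 2) + (i % 2);     // scale/min byte within it: 0,1,0,1,...
            printf("out.scales[%2d] <- in[%d].scales[%d]\n", i, src1, src2);
        }
        return 0;
    }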
@@ -975,6 +1186,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
      GGML_UNUSED(data_size);
  }

+ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+     GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
+     GGML_ASSERT(interleave_block == 8);
+     constexpr int nrows_interleaved = 8;
+
+     block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
+     const block_q2_K * src = (const block_q2_K*) data;
+     block_q2_K dst_tmp[8];
+     int nrow = ggml_nrows(t);
+     int nblocks = t->ne[0] / QK_K;
+
+     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
+
+     if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+         return -1;
+     }
+
+     for (int b = 0; b < nrow; b += nrows_interleaved) {
+         for (int64_t x = 0; x < nblocks; x++) {
+             for (int i = 0; i < nrows_interleaved; i++ ) {
+                 dst_tmp[i] = src[x + i * nblocks];
+             }
+             *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
+         }
+         src += nrows_interleaved * nblocks;
+     }
+     return 0;
+
+     GGML_UNUSED(data_size);
+ }
+
  static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
      GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
      GGML_ASSERT(interleave_block == 8);
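Reading the repack loop above: for every group of 8 rows, the x-th super-block of each row is gathered and fused into one block_q2_Kx8, so a row-major [nrow x nblocks] array of block_q2_K becomes an [nrow/8 x nblocks] array of interleaved blocks of the same total size. The gather pattern, isolated:

    // Gather used for one output block (row i, super-block x lives at src[x + i * nblocks]):
    block_q2_K dst_tmp[8];
    for (int i = 0; i < 8; i++) {
        dst_tmp[i] = src[x + i * nblocks];   // same column x from 8 consecutive rows
    }
    // make_block_q2_Kx8(dst_tmp, 8) then interleaves them into a single block_q2_Kx8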
@@ -1095,6 +1337,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
      return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
  }

+ template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+     return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
+ }
+
  template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
      return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
  }
@@ -1124,6 +1370,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
      ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
  }

+ template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+     ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
+ }
+
  template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
      ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
  }
@@ -1148,6 +1398,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
      ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
  }

+ template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+     ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
+ }
+
  template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
      ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
  }
@@ -1421,6 +1675,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
      static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
      static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

+     // instance for Q2
+     static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
+
      // instance for IQ4
      static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;

@@ -1446,6 +1703,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                  return &q4_K_8x8_q8_K;
              }
          }
+     } else if (cur->type == GGML_TYPE_Q2_K) {
+         if (ggml_cpu_has_avx512()) {
+             if (cur->ne[1] % 8 == 0) {
+                 return &q2_K_8x8_q8_K;
+             }
+         }
      } else if (cur->type == GGML_TYPE_IQ4_NL) {
          if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
              if (cur->ne[1] % 4 == 0) {
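The net effect of the dispatch change above, stated as a single condition (a paraphrase of the code, not new behaviour): Q2_K tensors are routed to the new 8x8 repacked path only when the CPU reports AVX-512 and the tensor's row count is divisible by 8; everything else keeps the regular Q2_K code path.

    // Paraphrase of the new branch in ggml_repack_get_optimal_repack_type():
    bool use_q2_K_repack = (cur->type == GGML_TYPE_Q2_K)
                        && ggml_cpu_has_avx512()
                        && (cur->ne[1] % 8 == 0);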
@@ -44,7 +44,14 @@ struct block_q4_Kx8 {
  };

  static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
+ struct block_q2_Kx8 {
+     ggml_half d[8];      // super-block scale for quantized scales
+     ggml_half dmin[8];   // super-block scale for quantized mins
+     uint8_t scales[128]; // scales and mins, quantized with 4 bits
+     uint8_t qs[512];     // 2--bit quants
+ };

+ static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
  struct block_q8_Kx4 {
      float d[4];          // delta
      int8_t qs[QK_K * 4]; // quants
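The new static_assert is easy to sanity-check by hand, assuming QK_K == 256 (ggml's default):

    // Byte budget of one block_q2_Kx8 (8 interleaved Q2_K super-blocks):
    //   d[8] + dmin[8] : 16 * sizeof(ggml_half) = 32
    //   scales[128]    : QK_K / 2               = 128
    //   qs[512]        : QK_K * 2               = 512
    //   total                                   = 672 bytes
    // 672 / 8 = 84 bytes, i.e. exactly sizeof(block_q2_K) per source super-block:
    // the repack rearranges data without growing it.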
@@ -71,11 +78,13 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
  void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

  // Native implementations
@@ -86,11 +95,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
  void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

  #if defined(__cplusplus)
@@ -284,10 +284,11 @@ extern "C" {
          const struct llama_model_kv_override * kv_overrides;

          // Keep the booleans together to avoid misalignment during copy-by-value.
-         bool vocab_only;    // only load the vocabulary, no weights
-         bool use_mmap;      // use mmap if possible
-         bool use_mlock;     // force system to keep model in RAM
-         bool check_tensors; // validate model tensor data
+         bool vocab_only;      // only load the vocabulary, no weights
+         bool use_mmap;        // use mmap if possible
+         bool use_mlock;       // force system to keep model in RAM
+         bool check_tensors;   // validate model tensor data
+         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
      };

      // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
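A hedged usage sketch for the new llama_model_params field (whether repacking is enabled by default is not shown in this diff, so the example sets the flag explicitly):

    llama_model_params mparams = llama_model_default_params();
    mparams.use_extra_bufts = true;   // opt in to extra buffer types (weight repacking, e.g. the Q2_K 8x8 layout)
    // pass mparams to llama_model_load_from_file(...) as usual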
@@ -537,6 +538,9 @@ extern "C" {
      // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
      LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

+     // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+     LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+
      // Returns 0 on success
      LLAMA_API uint32_t llama_model_quantize(
              const char * fname_inp,
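A minimal sketch of calling the new query (the model path and error handling are placeholders; loading with vocab_only to keep the check cheap is an assumption, not something this diff prescribes):

    #include "llama.h"

    bool is_diffusion_model(const char * gguf_path) {
        llama_model_params mp = llama_model_default_params();
        mp.vocab_only = true;                                    // metadata is enough for this check
        llama_model * model = llama_model_load_from_file(gguf_path, mp);
        if (model == NULL) {
            return false;
        }
        const bool diffusion = llama_model_is_diffusion(model); // true for LLaDA / Dream style models
        llama_model_free(model);
        return diffusion;
    }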
@@ -85,10 +85,12 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_ERNIE4_5, "ernie4_5" },
      { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
      { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+     { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
      { LLM_ARCH_SMOLLM3, "smollm3" },
      { LLM_ARCH_LFM2, "lfm2" },
      { LLM_ARCH_DREAM, "dream" },
      { LLM_ARCH_SMALLTHINKER, "smallthinker" },
+     { LLM_ARCH_LLADA, "llada" },
      { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -1896,6 +1898,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
          },
      },
+     {
+         LLM_ARCH_HUNYUAN_DENSE,
+         {
+             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+             { LLM_TENSOR_OUTPUT, "output" },
+             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+             { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+
+         },
+     },
      {
          LLM_ARCH_SMOLLM3,
          {
@@ -1972,6 +1994,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
          },
      },
+     {
+         LLM_ARCH_LLADA,
+         {
+             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+             { LLM_TENSOR_OUTPUT, "output" },
+             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+         },
+     },
      {
          LLM_ARCH_UNKNOWN,
          {
@@ -2224,6 +2263,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
  bool llm_arch_is_diffusion(const llm_arch & arch) {
      switch (arch) {
          case LLM_ARCH_DREAM:
+         case LLM_ARCH_LLADA:
              return true;
          default:
              return false;
@@ -89,10 +89,12 @@ enum llm_arch {
      LLM_ARCH_ERNIE4_5,
      LLM_ARCH_ERNIE4_5_MOE,
      LLM_ARCH_HUNYUAN_MOE,
+     LLM_ARCH_HUNYUAN_DENSE,
      LLM_ARCH_SMOLLM3,
      LLM_ARCH_LFM2,
      LLM_ARCH_DREAM,
      LLM_ARCH_SMALLTHINKER,
+     LLM_ARCH_LLADA,
      LLM_ARCH_UNKNOWN,
  };

@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
      for (int32_t i = 0; i < batch.n_tokens; ++i) {
          for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
              if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
-                 LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
+                 LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                  return false;
              }
          }
@@ -66,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
      { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
      { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
      { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
      { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
  };

@@ -193,6 +194,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
          return LLM_CHAT_TEMPLATE_DOTS1;
      } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
          return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+     } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
      } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
          return LLM_CHAT_TEMPLATE_KIMI_K2;
      }
@@ -698,11 +701,27 @@ int32_t llm_chat_apply_template(
              if (role == "system") {
                  ss << "<|startoftext|>" << message->content << "<|extra_4|>";
              } else if (role == "assistant") {
-                 ss << "<|startoftext|>" << message->content << "<|eos|>";
+                 ss << message->content << "<|eos|>";
              } else {
                  ss << "<|startoftext|>" << message->content << "<|extra_0|>";
              }
          }
+     } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
+         // tencent/Hunyuan-4B-Instruct
+         for (size_t i = 0; i < chat.size(); i++) {
+             std::string role(chat[i]->role);
+             if (i == 0) {
+                 if (role == "system") {
+                     ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                 }
+             }
+
+             if (role == "assistant") {
+                 ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
+             } else if (role == "user") {
+                 ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
+             }
+         }
      } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
          // moonshotai/Kimi-K2-Instruct
          for (auto message : chat) {
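Tracing the HUNYUAN_DENSE branch above for a short conversation makes the token layout concrete; this is just the loop's output, and whether a trailing generation prompt is appended elsewhere is not shown in this hunk:

    // messages: {system: "S"}, {user: "U"}
    // rendered: S<|hy_place▁holder▁no▁3|><|hy_User|>U<|hy_Assistant|>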
@@ -46,6 +46,7 @@ enum llm_chat_template {
      LLM_CHAT_TEMPLATE_SMOLVLM,
      LLM_CHAT_TEMPLATE_DOTS1,
      LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
      LLM_CHAT_TEMPLATE_KIMI_K2,
      LLM_CHAT_TEMPLATE_UNKNOWN,
  };
@@ -105,7 +105,7 @@ llama_context::llama_context(

      {
          const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-         supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+         supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;

          if (!supports_set_rows && !cparams.kv_unified) {
              LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -113,6 +113,15 @@ llama_context::llama_context(
          }
      }

+     {
+         const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+         graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+         if (graph_reuse_disable) {
+             LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+         }
+     }
+
      const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

      LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
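Both toggles read in this constructor are plain environment variables, so they can be flipped per run without rebuilding. For example (the binary name is illustrative):

    // LLAMA_SET_ROWS=0 LLAMA_GRAPH_REUSE_DISABLE=1 ./llama-app
    // Any non-zero integer disables graph reuse; leaving the variable unset keeps the
    // compiled-in default (the ternary falls back to the member's current value).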
@@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
      // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
      const auto gparams = graph_params(res, ubatch, mctx, gtype);

-     if (res->can_reuse(gparams)) {
+     if (!graph_reuse_disable && res->can_reuse(gparams)) {
          //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);

          n_reused++;