@fugood/llama.node 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -281,35 +281,9 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
     }

 #else
-    // scalar
-    const int blck_size_interleave = 8;
-    float srcv[4][QK8_0];
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
-                amax = MAX(amax, fabsf(srcv[row_iter][j]));
-            }
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < QK8_0 * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-
-            float x0 = srcv[src_id][src_offset] * id[src_id];
-            y[i].qs[j] = roundf(x0);
-        }
-    }
+    UNUSED(nb);
+    UNUSED(y);
+    ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
 #endif
 }

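The scalar fallback removed above now lives in ggml_quantize_mat_q8_0_4x8_generic; per block it finds the absolute maximum, derives the scale d = amax / 127, and rounds each value times 1/d to an int8. Below is a minimal standalone sketch of that per-block math only (block_q8_0_ref and quantize_block_q8_0 are illustrative names; the real code stores d as fp16 and additionally interleaves four rows per output block):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

constexpr int QK8_0 = 32;                 // block size, as in ggml

struct block_q8_0_ref {                   // hypothetical reference type
    float  d;                             // per-block scale (ggml stores fp16)
    int8_t qs[QK8_0];                     // quantized values
};

// Quantize one block of 32 floats: d = amax / 127, q = round(x / d).
static block_q8_0_ref quantize_block_q8_0(const float * x) {
    float amax = 0.0f;
    for (int j = 0; j < QK8_0; ++j) {
        amax = std::max(amax, std::fabs(x[j]));
    }
    const float d  = amax / ((1 << 7) - 1);   // 127
    const float id = d ? 1.0f / d : 0.0f;     // guard all-zero blocks

    block_q8_0_ref out;
    out.d = d;
    for (int j = 0; j < QK8_0; ++j) {
        out.qs[j] = (int8_t) std::round(x[j] * id);
    }
    return out;
}

int main() {
    float x[QK8_0];
    for (int j = 0; j < QK8_0; ++j) x[j] = 0.1f * (j - 16);
    const block_q8_0_ref b = quantize_block_q8_0(x);
    std::printf("d = %f  q[0] = %d  q[31] = %d\n", b.d, b.qs[0], b.qs[31]);
}
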
@@ -531,49 +505,9 @@ void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
     }

 #else
-
-    // scalar
-    const int blck_size_interleave = 8;
-    float srcv[4][QK_K];
-    float iscale[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-            float max = 0;
-
-            for (int j = 0; j < QK_K; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
-                // Update the maximum value of the corresponding super block
-                if(amax < fabsf(srcv[row_iter][j])) {
-                    amax = fabsf(srcv[row_iter][j]);
-                    max = srcv[row_iter][j];
-                }
-            }
-
-            iscale[row_iter] = amax ? -127.f/max : 0;
-
-            y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
-        }
-
-        for (int j = 0; j < QK_K / 4; j++) {
-            y[i].bsums[j] = 0;
-        }
-
-        // Quants values are interleaved in sequence of eight bytes from corresponding super blocks
-        // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
-        // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
-        for (int j = 0; j < QK_K * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-            int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
-
-            float x0 = srcv[src_id][src_offset] * iscale[src_id];
-            y[i].qs[j] = nearest_int(x0);
-            y[i].bsums[index] += y[i].qs[j];
-        }
-    }
+    UNUSED(nb);
+    UNUSED(y);
+    ggml_quantize_mat_q8_K_4x8_generic(x, vy, k);
 #endif
 }

@@ -689,31 +623,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     return;

 #endif
-    {
-        float sumf[8];
-        int sumi;
-
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                    }
-                }
-            }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-        }
-    }
+    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }

 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -932,61 +842,10 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }

 #else
-
-    float sumf[8];
-    float sum_minf[8];
-    uint32_t utmp[32];
-    int sumi1;
-    int sumi2;
-    int sumi;
-
-    const block_q8_K * a_ptr = (const block_q8_K *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) {
-            sumf[j] = 0.0;
-            sum_minf[j] = 0.0;
-        }
-        for (int l = 0; l < nb; l++) {
-            for (int sb = 0; sb < 8; sb++) {
-                memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
-                utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
-                const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
-                utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
-                utmp[sb * 4 + 2] = uaux_0;
-                utmp[sb * 4 + 0] &= kmask1;
-            }
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
-                uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi1 = 0;
-                    sumi2 = 0;
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
-                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
-                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
-                        sumi1 = sumi1 * scales_0[j];
-                        sumi2 = sumi2 * scales_1[j];
-                        sumi += sumi1 + sumi2;
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
-                }
-            }
-            for (int sb = 0; sb < 8; sb++) {
-                uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) {
-            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
-        }
-    }
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 #endif
 }

@@ -1735,38 +1594,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }

 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-    float sumf[4][8];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
+    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }

 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -3216,70 +3044,9 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }

 #else
-
-    float sumf[4][8];
-    float sum_minf[4][8];
-    uint32_t utmp[32];
-    int sumi1;
-    int sumi2;
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumf[m][j] = 0.0;
-                    sum_minf[m][j] = 0.0;
-                }
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int sb = 0; sb < 8; sb++) {
-                    memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
-                    utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
-                    const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
-                    utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
-                    utmp[sb * 4 + 2] = uaux_0;
-                    utmp[sb * 4 + 0] &= kmask1;
-                }
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
-                    uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi1 = 0;
-                            sumi2 = 0;
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
-                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
-                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
-                                sumi1 = sumi1 * scales_0[j];
-                                sumi2 = sumi2 * scales_1[j];
-                                sumi += sumi1 + sumi2;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
-                        }
-                    }
-                }
-                for (int sb = 0; sb < 8; sb++) {
-                    uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
-                    for(int m = 0; m < 4; m++) {
-                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
-                        for(int j = 0; j < ncols_interleaved; j++) {
-                            sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
-                }
-            }
-        }
-    }
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 #endif
 }
@@ -88,6 +88,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_SMOLLM3, "smollm3" },
     { LLM_ARCH_LFM2, "lfm2" },
     { LLM_ARCH_DREAM, "dream" },
+    { LLM_ARCH_SMALLTHINKER, "smallthinker" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -1933,6 +1934,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
         }
     },
+    {
+        LLM_ARCH_SMALLTHINKER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }
+        },
+    },
     {
         LLM_ARCH_DREAM,
         {
@@ -92,6 +92,7 @@ enum llm_arch {
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_LFM2,
     LLM_ARCH_DREAM,
+    LLM_ARCH_SMALLTHINKER,
    LLM_ARCH_UNKNOWN,
 };

@@ -298,7 +298,7 @@ llama_context::llama_context(

     cross.v_embd.clear();

-    // reserve pp graph first so that buffers are only allocated once
+    // reserve pp (prompt processing) graph first so that buffers are only allocated once
     {
         auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
         if (!gf) {
@@ -309,7 +309,7 @@ llama_context::llama_context(
         n_nodes_pp = ggml_graph_n_nodes(gf);
     }

-    // reserve with tg graph to get the number of splits and nodes
+    // reserve with tg (token generation) graph to get the number of splits and nodes
     {
         auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
         if (!gf) {
@@ -938,6 +938,100 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     return moe_out;
 }

+ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
+        ggml_tensor * cur,
+        ggml_tensor * probs,
+        ggml_tensor * up_exps,
+        ggml_tensor * gate_exps,
+        ggml_tensor * down_exps,
+        ggml_tensor * exp_probs_b,
+        int64_t n_expert,
+        int64_t n_expert_used,
+        llama_expert_gating_func_type gating_op,
+        int il) const {
+    const int64_t n_embd = cur->ne[0];
+    const int64_t n_tokens = cur->ne[1];
+
+    // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
+    ggml_tensor * selection_probs = probs;
+    if (exp_probs_b != nullptr) {
+        selection_probs = ggml_add(ctx0, probs, exp_probs_b);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
+    // select experts
+    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+    cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    cb(selected_experts, "ffn_moe_topk", il);
+
+    ggml_tensor * weights = ggml_get_rows(ctx0,
+            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    cb(weights, "ffn_moe_weights", il);
+
+    weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
+        weights = ggml_soft_max(ctx0, weights);
+    } else {
+        weights = ggml_sigmoid(ctx0, weights);
+        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
+        cb(weights_sum, "ffn_moe_weights_sum", il);
+
+        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
+        cb(weights, "ffn_moe_weights_norm", il);
+    }
+
+    weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+
+    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
+
+    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(up, "ffn_moe_up", il);
+
+    ggml_tensor * experts = nullptr;
+    cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(cur, "ffn_moe_gate", il);
+
+    cur = ggml_reglu_split(ctx0, cur, up);
+    cb(cur, "ffn_moe_reglu", il);
+
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    cb(experts, "ffn_moe_down", il);
+
+    experts = ggml_mul(ctx0, experts, weights);
+    cb(cur, "ffn_moe_weighted", il);
+
+    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
+
+    assert(n_expert_used > 0);
+
+    // order the views before the adds
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+
+        ggml_build_forward_expand(gf, cur_experts[i]);
+    }
+
+    // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    // to avoid potentially a large number of add nodes during warmup
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
+    ggml_tensor * moe_out = cur_experts[0];
+
+    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
+    }
+
+    if (n_expert_used == 1) {
+        // avoid returning a non-contiguous tensor
+        moe_out = ggml_cont(ctx0, moe_out);
+    }
+
+    cb(moe_out, "ffn_moe_out", il);
+
+    return moe_out;
+}
+
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     const int64_t n_embd = hparams.n_embd;
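The new build_moe_ffn_from_probs helper selects the top n_expert_used experts per token from caller-supplied probabilities, turns the selected scores into mixing weights via either softmax or sigmoid plus renormalization, and sums the weighted expert outputs. Below is a standalone sketch of just that routing/weighting step on plain arrays (route_token is an illustrative name; the real code emits ggml graph ops such as ggml_top_k, ggml_soft_max, and ggml_sigmoid instead):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

// Pick the n_used highest-scoring experts for one token and normalize their
// scores into mixing weights, mirroring the two gating modes above.
static std::vector<std::pair<int, float>> route_token(
        const std::vector<float> & scores, int n_used, bool softmax_gating) {
    std::vector<int> idx(scores.size());
    for (size_t i = 0; i < idx.size(); ++i) idx[i] = (int) i;

    // top-k selection by score (counterpart of ggml_top_k)
    std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                      [&](int a, int b) { return scores[a] > scores[b]; });

    const float smax = scores[idx[0]];   // for a numerically stable softmax
    std::vector<std::pair<int, float>> out;
    float sum = 0.0f;
    for (int i = 0; i < n_used; ++i) {
        const float s = scores[idx[i]];
        const float w = softmax_gating ? std::exp(s - smax)
                                       : 1.0f / (1.0f + std::exp(-s));  // sigmoid
        out.push_back({idx[i], w});
        sum += w;
    }
    for (auto & e : out) e.second /= sum;  // both modes end up summing to 1
    return out;
}

int main() {
    const std::vector<float> scores = {0.1f, 2.0f, -1.0f, 0.7f};  // one token, 4 experts
    for (auto [e, w] : route_token(scores, 2, /*softmax_gating=*/true)) {
        std::printf("expert %d weight %.3f\n", e, w);
    }
}

The combined FFN output is then the weight-scaled sum of the selected experts' outputs, which the graph code above builds as per-expert views followed by adds.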
@@ -625,6 +625,18 @@ struct llm_graph_context {
            llama_expert_gating_func_type gating_op,
            int il) const;

+    ggml_tensor * build_moe_ffn_from_probs(
+            ggml_tensor * cur,
+            ggml_tensor * probs,
+            ggml_tensor * up_exps,
+            ggml_tensor * gate_exps,
+            ggml_tensor * down_exps,
+            ggml_tensor * exp_probs_b,
+            int64_t n_expert,
+            int64_t n_expert_used,
+            llama_expert_gating_func_type gating_op,
+            int il) const;
+
     //
     // inputs
     //
@@ -2,9 +2,15 @@

 #include "ggml.h"

-void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
+    if (dense_first) {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
+        }
+    } else {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+        }
     }
 }

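With dense_first set, layer 0 of every group of n_pattern layers is dense and the rest use SWA; without it, the last layer of each group is dense (n_pattern == 0 means all-SWA, n_pattern == 1 means all-dense). Below is a tiny standalone sketch (swa_pattern is an illustrative name) that reproduces the two examples given in the llama_hparams comments further down:

#include <cstdio>
#include <vector>

// Mirror the two SWA layer patterns: with dense_first, every n_pattern-th
// layer starting at 0 is dense; otherwise the last layer of each group is.
static std::vector<bool> swa_pattern(int n_layer, int n_pattern, bool dense_first) {
    std::vector<bool> swa(n_layer);
    for (int il = 0; il < n_layer; ++il) {
        swa[il] = dense_first ? (n_pattern == 0 || (il % n_pattern != 0))
                              : (n_pattern == 0 || (il % n_pattern < (n_pattern - 1)));
    }
    return swa;
}

int main() {
    // n_pattern = 3, dense_first = false -> swa swa dense swa swa dense
    for (bool b : swa_pattern(6, 3, false)) std::printf("%s ", b ? "swa" : "dense");
    std::printf("\n");
    // n_pattern = 2, dense_first = true  -> dense swa dense swa dense swa
    for (bool b : swa_pattern(6, 2, true)) std::printf("%s ", b ? "swa" : "dense");
    std::printf("\n");
}
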
@@ -98,7 +98,7 @@ struct llama_hparams {
     float rope_freq_scale_train;
     float rope_freq_scale_train_swa;
     uint32_t n_ctx_orig_yarn;
-    float rope_yarn_log_mul;
+    float rope_yarn_log_mul = 0.0f;

     std::array<int, 4> rope_sections;

@@ -140,7 +140,7 @@ struct llama_hparams {
     // for Classifiers
     uint32_t n_cls_out = 1;

-    // llama4
+    // llama4 smallthinker
     uint32_t n_moe_layer_step = 0;
     uint32_t n_no_rope_layer_step = 4;
     uint32_t n_attn_temp_floor_scale = 8192;
@@ -161,9 +161,10 @@ struct llama_hparams {
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

     // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
+    // dense_first means whether the pattern is start with a dense layer
     // note that if n_pattern == 0, all layers are SWA
     // if n_pattern == 1, all layers are dense
-    // example: n_pattern = 3
+    // example 1: n_pattern = 3, dense_first = false
     //   il == 0: swa
     //   il == 1: swa
     //   il == 2: dense
@@ -172,7 +173,13 @@ struct llama_hparams {
172
173
  // il == 5: dense
173
174
  // il == 6: swa
174
175
  // etc ...
175
- void set_swa_pattern(uint32_t n_pattern);
176
+ // example 2: n_pattern = 2, dense_first = true
177
+ // il == 0: dense
178
+ // il == 1: swa
179
+ // il == 2: dense
180
+ // il == 3: swa
181
+ // etc ...
182
+ void set_swa_pattern(uint32_t n_pattern, bool dense_first = false);
176
183
 
177
184
  // return true if one of the layers is SWA
178
185
  bool is_swa_any() const;