@fugood/llama.node 1.3.0-rc.6 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/CMakeLists.txt +12 -2
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +8 -9
  4. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  5. package/src/llama.cpp/common/arg.cpp +39 -1001
  6. package/src/llama.cpp/common/arg.h +2 -2
  7. package/src/llama.cpp/common/chat.cpp +216 -2
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +33 -0
  10. package/src/llama.cpp/common/common.h +13 -0
  11. package/src/llama.cpp/common/download.cpp +1054 -0
  12. package/src/llama.cpp/common/download.h +55 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
  14. package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
  15. package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  16. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +7 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +10 -3
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +0 -5
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -35
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  26. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  27. package/src/llama.cpp/include/llama.h +7 -3
  28. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  29. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  30. package/src/llama.cpp/src/llama-arch.h +11 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  32. package/src/llama.cpp/src/llama-batch.h +12 -1
  33. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  34. package/src/llama.cpp/src/llama-chat.h +1 -0
  35. package/src/llama.cpp/src/llama-context.cpp +44 -16
  36. package/src/llama.cpp/src/llama-context.h +5 -5
  37. package/src/llama.cpp/src/llama-cparams.h +1 -0
  38. package/src/llama.cpp/src/llama-graph.cpp +12 -7
  39. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  40. package/src/llama.cpp/src/llama-hparams.h +6 -0
  41. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  42. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -21
  43. package/src/llama.cpp/src/llama-kv-cache.h +2 -4
  44. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  45. package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
  46. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  47. package/src/llama.cpp/src/llama-model.cpp +350 -13194
  48. package/src/llama.cpp/src/llama-model.h +9 -2
  49. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  50. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  51. package/src/llama.cpp/src/llama-vocab.h +1 -0
  52. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  53. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  54. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  55. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  56. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  57. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  58. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  59. package/src/llama.cpp/src/models/bert.cpp +176 -0
  60. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  61. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  62. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  63. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  64. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  65. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  66. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  67. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  68. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  69. package/src/llama.cpp/src/models/deci.cpp +135 -0
  70. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  71. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  72. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  73. package/src/llama.cpp/src/models/dream.cpp +105 -0
  74. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  75. package/src/llama.cpp/src/models/ernie4-5.cpp +111 -0
  76. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  77. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  78. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  79. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  80. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  81. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  82. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  83. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  84. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  85. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  86. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  87. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  88. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  89. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  90. package/src/llama.cpp/src/models/granite.cpp +211 -0
  91. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  92. package/src/llama.cpp/src/models/grok.cpp +159 -0
  93. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  94. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  95. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  96. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  97. package/src/llama.cpp/src/models/jais.cpp +86 -0
  98. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  99. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  100. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  101. package/src/llama.cpp/src/models/llada.cpp +99 -0
  102. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  103. package/src/llama.cpp/src/models/llama.cpp +155 -0
  104. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  105. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  106. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  107. package/src/llama.cpp/src/models/models.h +481 -0
  108. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  109. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  110. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  111. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  112. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  113. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  114. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  115. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +123 -0
  116. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  117. package/src/llama.cpp/src/models/orion.cpp +123 -0
  118. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  119. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  120. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  121. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  122. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  123. package/src/llama.cpp/src/models/plm.cpp +168 -0
  124. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  125. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  126. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  127. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  128. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  129. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  130. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  131. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  132. package/src/llama.cpp/src/models/refact.cpp +94 -0
  133. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  134. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  135. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  136. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  137. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  138. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  139. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  140. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  141. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  142. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  143. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  144. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  145. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  146. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  147. package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp

@@ -5474,7 +5474,7 @@ static void ggml_rope_cache_init(
 }
 
 static void ggml_mrope_cache_init(
-    float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
+    float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool is_imrope, bool indep_sects,
     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
     float * cache, float sin_sign, float theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -5509,14 +5509,26 @@ static void ggml_mrope_cache_init(
         }
 
         float theta = theta_t;
-        if (sector >= sections[0] && sector < sec_w) {
-            theta = theta_h;
-        }
-        else if (sector >= sec_w && sector < sec_w + sections[2]) {
-            theta = theta_w;
-        }
-        else if (sector >= sec_w + sections[2]) {
-            theta = theta_e;
+        if (is_imrope) { // qwen3vl apply interleaved mrope
+            if (sector % 3 == 1 && sector < 3 * sections[1]) {
+                theta = theta_h;
+            } else if (sector % 3 == 2 && sector < 3 * sections[2]) {
+                theta = theta_w;
+            } else if (sector % 3 == 0 && sector < 3 * sections[0]) {
+                theta = theta_t;
+            } else {
+                theta = theta_e;
+            }
+        } else {
+            if (sector >= sections[0] && sector < sec_w) {
+                theta = theta_h;
+            }
+            else if (sector >= sec_w && sector < sec_w + sections[2]) {
+                theta = theta_w;
+            }
+            else if (sector >= sec_w + sections[2]) {
+                theta = theta_e;
+            }
         }
 
         rope_yarn(
@@ -5589,6 +5601,7 @@ static void ggml_compute_forward_rope_f32(
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
     const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
+    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
 
     if (is_mrope) {
@@ -5627,7 +5640,7 @@ static void ggml_compute_forward_rope_f32(
             const int64_t p_w = pos[i2 + ne2 * 2];
             const int64_t p_e = pos[i2 + ne2 * 3];
             ggml_mrope_cache_init(
-                p_t, p_h, p_w, p_e, sections, is_vision,
+                p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
                 freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
         }
 
@@ -5775,6 +5788,7 @@ static void ggml_compute_forward_rope_f16(
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
     const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
     const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
 
     if (is_mrope) {
@@ -5813,7 +5827,7 @@ static void ggml_compute_forward_rope_f16(
             const int64_t p_w = pos[i2 + ne2 * 2];
             const int64_t p_e = pos[i2 + ne2 * 3];
             ggml_mrope_cache_init(
-                p_t, p_h, p_w, p_e, sections, is_vision,
+                p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
                 freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
         }
 
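Editor's note: the interleaved-mrope (is_imrope) branch added above only changes which rotary section drives a given pair of dimensions ("sector"). A minimal standalone sketch of that mapping follows; it is not part of the diff, and the helper name and section values are made up for illustration.

#include <cstdio>

// Illustrative sketch of the sector -> axis mapping used by the interleaved
// mrope branch above. sections[4] = {t, h, w, e} counts per axis; a "sector"
// is one pair of rotary dimensions. Hypothetical helper, not part of ggml.
static char pick_axis_interleaved(int sector, const int sections[4]) {
    if (sector % 3 == 1 && sector < 3 * sections[1]) return 'h';
    if (sector % 3 == 2 && sector < 3 * sections[2]) return 'w';
    if (sector % 3 == 0 && sector < 3 * sections[0]) return 't';
    return 'e'; // everything past the interleaved region falls back to "extra"
}

int main() {
    const int sections[4] = {24, 20, 20, 0}; // example split, values are illustrative
    for (int sector = 0; sector < 12; ++sector) {
        std::printf("sector %2d -> %c\n", sector, pick_axis_interleaved(sector, sections));
    }
    // prints t h w t h w ... : time/height/width alternate every sector instead
    // of occupying contiguous blocks as in the non-interleaved branch
    return 0;
}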
@@ -7070,7 +7084,11 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
     const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
 
 #ifdef GGML_SIMD
-    const int64_t pkg_size = GGML_F32_EPR;
+    #if defined(__ARM_FEATURE_SVE)
+        const int64_t pkg_size = svcntw();
+    #else
+        const int64_t pkg_size = GGML_F32_EPR;
+    #endif
     const int64_t pkg_count = c / pkg_size;
     const int64_t c_pkg_end = pkg_count * pkg_size;
 #else
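The hunk above sizes the SIMD packet from the runtime SVE vector length instead of a compile-time constant. A minimal sketch of that query, assuming an AArch64 toolchain with SVE enabled; the fallback value is a stand-in, not the real GGML_F32_EPR definition:

// Sketch only: on SVE hardware the number of 32-bit lanes is not fixed at
// compile time, so it has to be queried at run time.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
static long f32_lanes(void) {
    return (long) svcntw(); // 32-bit elements per SVE vector (4 on 128-bit HW, 16 on 512-bit HW)
}
#else
static long f32_lanes(void) {
    return 8;               // stand-in for a fixed-width fallback such as GGML_F32_EPR
}
#endif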
@@ -7493,10 +7511,17 @@ static void ggml_compute_forward_upscale_f32(
     float sf1 = (float)ne1/src0->ne[1];
     float sf2 = (float)ne2/src0->ne[2];
     float sf3 = (float)ne3/src0->ne[3];
+    float pixel_offset = 0.5f;
 
     const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
     const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
 
+    if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
+        pixel_offset = 0.0f;
+        sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
+        sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
+    }
+
     if (mode == GGML_SCALE_MODE_NEAREST) {
         for (int64_t i3 = 0; i3 < ne3; i3++) {
             const int64_t i03 = i3 / sf3;
@@ -7516,13 +7541,6 @@ static void ggml_compute_forward_upscale_f32(
             }
         }
     } else if (mode == GGML_SCALE_MODE_BILINEAR) {
-        float pixel_offset = 0.5f;
-        if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
-            pixel_offset = 0.0f;
-            sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1);
-            sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1);
-        }
-
         for (int64_t i3 = 0; i3 < ne3; i3++) {
             const int64_t i03 = i3 / sf3;
             for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
@@ -7557,6 +7575,51 @@ static void ggml_compute_forward_upscale_f32(
 
                     const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
 
+                    float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                    *y_dst = val;
+                }
+            }
+        }
+    }
+    } else if (mode == GGML_SCALE_MODE_BICUBIC) {
+        // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+        const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
+        auto weight1 = [a](float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
+        auto weight2 = [a](float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
+        auto bicubic = [=](float p0, float p1, float p2, float p3, float x) {
+            const float w0 = weight2(x + 1);
+            const float w1 = weight1(x + 0);
+            const float w2 = weight1(1 - x);
+            const float w3 = weight2(2 - x);
+            return p0*w0 + p1*w1 + p2*w2 + p3*w3;
+        };
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
+                    const int64_t y0 = (int64_t)floorf(y);
+                    const float dy = y - (float)y0;
+
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
+                        const int64_t x0 = (int64_t)floorf(x);
+                        const float dx = x - (float)x0;
+
+                        auto p = [=](int64_t x_off, int64_t y_off) -> float {
+                            int64_t i00 = std::max(int64_t(0), std::min(x0 + x_off, ne00 - 1));
+                            int64_t i01 = std::max(int64_t(0), std::min(y0 + y_off, ne01 - 1));
+                            return *(const float *)((const char *)src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        };
+
+                        const float val = bicubic(
+                            bicubic(p(-1,-1), p(0,-1), p(1,-1), p(2,-1), dx),
+                            bicubic(p(-1, 0), p(0, 0), p(1, 0), p(2, 0), dx),
+                            bicubic(p(-1, 1), p(0, 1), p(1, 1), p(2, 1), dx),
+                            bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy);
+
                         float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
                         *y_dst = val;
                     }
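As a sanity check on the bicubic convolution weights introduced above (alpha = -0.75, matching PyTorch), here is a standalone sketch showing that the four taps always sum to 1, so interpolating a constant image returns that constant. It is illustrative only and independent of ggml; the hoisted align-corners logic earlier in the hunk simply switches the scale factors to (n_out - 1)/(n_in - 1) so the first and last samples land exactly on the corners.

#include <cstdio>

// Same weight polynomials as the diff above, alpha = -0.75.
static float weight1(float x) { const float a = -0.75f; return ((a + 2) * x - (a + 3)) * x * x + 1; }
static float weight2(float x) { const float a = -0.75f; return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; }

int main() {
    for (float dx = 0.0f; dx <= 1.0f; dx += 0.25f) {
        const float w0 = weight2(dx + 1);
        const float w1 = weight1(dx + 0);
        const float w2 = weight1(1 - dx);
        const float w3 = weight2(2 - dx);
        // taps straddle the sample point: p(-1), p(0), p(1), p(2); the weights sum to 1
        std::printf("dx=%.2f  w=%+.4f %+.4f %+.4f %+.4f  sum=%.4f\n",
                    dx, w0, w1, w2, w3, w0 + w1 + w2 + w3);
    }
    return 0;
}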
@@ -7909,10 +7972,10 @@ void ggml_compute_forward_argsort(
 
 // ggml_compute_forward_flash_attn_ext
 
-static void ggml_compute_forward_flash_attn_ext_f16(
+static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
         const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
+        ggml_tensor * dst,
+        int ir0, int ir1) {
     const ggml_tensor * q = dst->src[0];
     const ggml_tensor * k = dst->src[1];
     const ggml_tensor * v = dst->src[2];
@@ -7928,9 +7991,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
     GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
 
-    const int ith = params->ith;
-    const int nth = params->nth;
-
     const int64_t DK = nek0;
     const int64_t DV = nev0;
     const int64_t N  = neq1;
@@ -7964,16 +8024,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
 
     // parallelize by q rows using ggml_vec_dot_f32
 
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
     float scale         = 1.0f;
     float max_bias      = 0.0f;
     float logit_softcap = 0.0f;
@@ -8000,6 +8050,8 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_ASSERT((                            q_to_vec_dot) && "fattn: unsupported K-type");
     GGML_ASSERT((v->type == GGML_TYPE_F32 || v_to_float  ) && "fattn: unsupported V-type");
 
+    int ith = params->ith;
+
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
         // q indices
@@ -8147,6 +8199,91 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     }
 }
 
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * q = dst->src[0];
+    const ggml_tensor * k = dst->src[1];
+    const ggml_tensor * v = dst->src[2];
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int64_t DK = nek0;
+    const int64_t DV = nev0;
+    const int64_t N  = neq1;
+
+    GGML_ASSERT(ne0 == DV);
+    GGML_ASSERT(ne2 == N);
+
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
+
+    GGML_ASSERT(neq0 == DK);
+    GGML_ASSERT(nek0 == DK);
+    GGML_ASSERT(nev0 == DV);
+
+    GGML_ASSERT(neq1 == N);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int64_t nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // disable for NUMA
+    const bool disable_chunking = ggml_is_numa();
+
+    // 4x chunks per thread
+    int nth_scaled = nth * 4;
+    int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
+    int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
+
+    if (nth == 1 || nchunk < nth || disable_chunking) {
+        nchunk = nth;
+    }
+
+    if (ith == 0) {
+        // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+        ggml_threadpool_chunk_set(params->threadpool, nth);
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // The number of elements in each chunk
+    const int64_t dr = (nr + nchunk - 1) / nchunk;
+
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;
+
+    while (current_chunk < nchunk) {
+        const int64_t ir0 = dr * current_chunk;
+        const int64_t ir1 = MIN(ir0 + dr, nr);
+
+        ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
+
+        current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
+    }
+}
+
 void ggml_compute_forward_flash_attn_ext(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
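The refactor above splits flash attention into a _one_chunk worker plus an outer dispatcher that hands out row ranges through a shared counter (ggml_threadpool_chunk_set / ggml_threadpool_chunk_add). A minimal standalone sketch of the same work-stealing pattern with a plain atomic counter follows; it illustrates the scheduling idea only and does not use the ggml threadpool API.

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int     nth    = 4;        // worker threads
    const int64_t nr     = 1000;     // total rows of work (illustrative)
    const int64_t nchunk = nth * 4;  // roughly 4x chunks per thread, as in the diff
    const int64_t dr     = (nr + nchunk - 1) / nchunk;

    // The first unprocessed chunk is nth: chunks 0..nth-1 are taken implicitly by thread ids.
    std::atomic<int64_t> next_chunk{nth};

    std::vector<std::thread> workers;
    for (int ith = 0; ith < nth; ++ith) {
        workers.emplace_back([&, ith] {
            int64_t chunk = ith;                  // start with our own chunk ...
            while (chunk < nchunk) {
                const int64_t ir0 = dr * chunk;
                const int64_t ir1 = std::min(ir0 + dr, nr);
                std::printf("thread %d rows [%lld, %lld)\n", ith, (long long) ir0, (long long) ir1);
                chunk = next_chunk.fetch_add(1);  // ... then steal the next free one
            }
        });
    }
    for (auto & w : workers) w.join();
    return 0;
}

Compared with the old static split (dr rows per thread), this keeps all threads busy when some chunks finish early, which is why the per-thread row range moved out of the worker and into the dispatcher.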
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp

@@ -1600,6 +1600,32 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         return false;
     }
 
+    void forward_mul_mat_one_chunk(ggml_compute_params * params, ggml_tensor * op, int64_t src0_start, int64_t src0_end) {
+        const ggml_tensor * src0 = op->src[0];
+        const ggml_tensor * src1 = op->src[1];
+        ggml_tensor *       dst  = op;
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const void * src1_wdata      = params->wdata;
+        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
+
+        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+        if (ne11 > 3) {
+            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
+                    (float *) ((char *) dst->data) + src0_start, ne01,
+                    (const char *) src0->data + src0_start * nb01,
+                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        }
+        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
+            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
+                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
+                    (const char *) src0->data + src0_start * nb01,
+                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
+                    src0_end - src0_start);
+        }
+    }
+
     void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
         const ggml_tensor * src0 = op->src[0];
         const ggml_tensor * src1 = op->src[1];
@@ -1643,31 +1669,62 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
         }
 
-        ggml_barrier(params->threadpool);
+        // disable for NUMA
+        const bool disable_chunking = ggml_is_numa();
 
-        const void * src1_wdata      = params->wdata;
-        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
-        int64_t src0_start = (ith * ne01) / nth;
-        int64_t src0_end   = ((ith + 1) * ne01) / nth;
-        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
-        src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
-        if (src0_start >= src0_end) {
-            return;
+        // 4x chunks per thread
+        int64_t nr = ggml_nrows(op->src[0]);
+        int nth_scaled = nth * 4;
+        int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
+        int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
+
+        // Ensure minimum chunk size to avoid alignment issues with high thread counts
+        // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
+        const int64_t min_chunk_size = NB_COLS;
+        if (nchunk > 0 && (nr / nchunk) < min_chunk_size && nr >= min_chunk_size) {
+            nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
         }
 
-        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
-        if (ne11 > 3) {
-            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-                    (float *) ((char *) dst->data) + src0_start, ne01,
-                    (const char *) src0->data + src0_start * nb01,
-                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        if (nth == 1 || nchunk < nth || disable_chunking) {
+            nchunk = nth;
         }
-        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
-            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
-                    (const char *) src0->data + src0_start * nb01,
-                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
-                    src0_end - src0_start);
+
+        // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
+        // This prevents creating too many tiny chunks that could overlap after alignment
+        const int64_t max_nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
+        if (nchunk > max_nchunk) {
+            nchunk = max_nchunk;
+        }
+
+        if (ith == 0) {
+            // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+            ggml_threadpool_chunk_set(params->threadpool, nth);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        // The first chunk comes from our thread_id, the rest will get auto-assigned.
+        int current_chunk = ith;
+
+        while (current_chunk < nchunk) {
+            int64_t src0_start = (current_chunk * ne01) / nchunk;
+            int64_t src0_end   = ((current_chunk + 1) * ne01) / nchunk;
+
+            // Align boundaries to NB_COLS - round up to ensure all data is included
+            // The chunk size limiting above ensures chunks are large enough to prevent overlaps
+            src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
+            src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
+            if (src0_end > ne01) {
+                src0_end = ne01;
+            }
+
+            if (src0_start >= src0_end) {
+                break;
+            }
+
+            forward_mul_mat_one_chunk(params, dst, src0_start, src0_end);
+
+            current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
        }
     }
 
@@ -1772,8 +1829,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         int64_t src0_cur_start = (ith * ne01) / nth;
         int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
 
+        // Align boundaries to NB_COLS - round up to ensure all data is included
         src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
         src0_cur_end   = (src0_cur_end   % NB_COLS) ? src0_cur_end   + NB_COLS - (src0_cur_end   % NB_COLS) : src0_cur_end;
+        if (src0_cur_end > ne01) {
+            src0_cur_end = ne01;
+        }
 
         if (src0_cur_start >= src0_cur_end) {
             return;
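The repack changes above keep the same NB_COLS round-up of chunk boundaries but add a clamp to ne01 so the last chunk can never run past the row count. A tiny standalone sketch of that boundary arithmetic, with illustrative values only:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne01    = 100;  // rows in src0 (illustrative)
    const int64_t NB_COLS = 8;    // interleaved column block size (illustrative)
    const int64_t nchunk  = 6;

    for (int64_t chunk = 0; chunk < nchunk; ++chunk) {
        int64_t start = (chunk * ne01) / nchunk;
        int64_t end   = ((chunk + 1) * ne01) / nchunk;

        // round both boundaries up to a multiple of NB_COLS, as in the diff
        start = (start % NB_COLS) ? start + NB_COLS - (start % NB_COLS) : start;
        end   = (end   % NB_COLS) ? end   + NB_COLS - (end   % NB_COLS) : end;
        if (end > ne01) end = ne01;   // the new clamp: never run past the last row

        if (start >= end) continue;   // fully absorbed chunk, nothing left to do
        std::printf("chunk %lld -> rows [%lld, %lld)\n", (long long) chunk, (long long) start, (long long) end);
    }
    return 0;
}

With ne01 = 100 and NB_COLS = 8 the rounded chunks become [0,16) [16,40) [40,56) [56,72) [72,88) [88,100): the rows are covered exactly once, and the clamp is what keeps the final boundary at 100 instead of 104.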
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h

@@ -956,7 +956,7 @@ do { \
 
 #define GGML_F32Cx8          __m256
 #define GGML_F32Cx8_ZERO     (__m256)__lasx_xvldi(0)
-#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplgr2vr_w((x))
+#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplfr2vr_s((x))
 
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     __m256i a;
@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 
 #define GGML_F32x4         __m128
 #define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
-#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
 #define GGML_F32x4_MUL     __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x) \
-{ \
-    int offset = GGML_F32_ARR >> 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    offset >>= 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    offset >>= 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
-    tmp = __lsx_vsrli_d((__m128i) t0, 32); \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
+    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
+    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
+    __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
+    res = (ggml_float) ((v4f32)t5)[0]; \
 }
 
 #define GGML_F32_VEC        GGML_F32x4
@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 
 #define GGML_F32Cx4          __m128
 #define GGML_F32Cx4_ZERO     (__m128)__lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32Cx4_LOAD(x)  (__m128)__lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA      GGML_F32x4_FMA
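The rewritten GGML_F32x4_REDUCE above swaps a shift-based horizontal sum for two even/odd pick-and-add steps. The same reduction in scalar form, just to make the data movement explicit (illustration only, no LSX intrinsics):

#include <cstdio>

int main() {
    float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};

    // step 1: add even-indexed lanes to odd-indexed lanes -> {v0+v1, v2+v3}
    float partial[2] = {v[0] + v[1], v[2] + v[3]};
    // step 2: repeat on the partial sums -> the full horizontal sum
    float res = partial[0] + partial[1];

    std::printf("%f\n", res); // 10.0
    return 0;
}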
package/src/llama.cpp/include/llama.h

@@ -83,6 +83,7 @@ extern "C" {
         LLAMA_ROPE_TYPE_NORM   = 0,
         LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
         LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+        LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE,
         LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
     };
 
@@ -460,7 +461,11 @@ extern "C" {
     LLAMA_API bool llama_supports_gpu_offload(void);
     LLAMA_API bool llama_supports_rpc        (void);
 
+    // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
+    // In some cases the requested values via llama_context_params may differ from the actual values used by the context
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ctx_seq  (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
@@ -481,6 +486,7 @@ extern "C" {
 
     LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
@@ -584,7 +590,7 @@ extern "C" {
     LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
 
     // Manually free a LoRA adapter
-    // Note: loaded adapters will be free when the associated model is deleted
+    // NOTE: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
 
     // Get the invocation tokens if the current lora is an alora
@@ -1110,8 +1116,6 @@ extern "C" {
     //    // sample from the logits of the last token in the batch
     //    const llama_token id = llama_sampler_sample(smpl, ctx, -1);
     //
-    //    // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
-    //    llama_sampler_accept(smpl, id);
     //    ...
     // }
     //
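The header changes above add llama_n_ctx_seq and llama_model_n_embd_inp and note that the values a context actually uses may differ from what was requested in llama_context_params. A minimal usage sketch of querying the effective values after creation; the model path and the requested n_ctx are placeholders and error handling is omitted:

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 8192; // requested; the context may settle on a different value

    llama_context * ctx = llama_init_from_model(model, cparams);

    // query the values the context actually uses, as recommended by the new note
    std::printf("n_ctx     = %u\n", llama_n_ctx(ctx));
    std::printf("n_ctx_seq = %u\n", llama_n_ctx_seq(ctx));
    std::printf("n_batch   = %u\n", llama_n_batch(ctx));
    std::printf("n_ubatch  = %u\n", llama_n_ubatch(ctx));
    std::printf("n_seq_max = %u\n", llama_n_seq_max(ctx));

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}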
package/src/llama.cpp/src/CMakeLists.txt

@@ -35,6 +35,101 @@ add_library(llama
             unicode-data.cpp
             unicode.cpp
             unicode.h
+            models/apertus.cpp
+            models/arcee.cpp
+            models/arctic.cpp
+            models/arwkv7.cpp
+            models/baichuan.cpp
+            models/bailingmoe.cpp
+            models/bailingmoe2.cpp
+            models/bert.cpp
+            models/bitnet.cpp
+            models/bloom.cpp
+            models/chameleon.cpp
+            models/chatglm.cpp
+            models/codeshell.cpp
+            models/cogvlm.cpp
+            models/cohere2-iswa.cpp
+            models/command-r.cpp
+            models/dbrx.cpp
+            models/deci.cpp
+            models/deepseek.cpp
+            models/deepseek2.cpp
+            models/dots1.cpp
+            models/dream.cpp
+            models/ernie4-5-moe.cpp
+            models/ernie4-5.cpp
+            models/exaone.cpp
+            models/exaone4.cpp
+            models/falcon-h1.cpp
+            models/falcon.cpp
+            models/gemma-embedding.cpp
+            models/gemma.cpp
+            models/gemma2-iswa.cpp
+            models/gemma3-iswa.cpp
+            models/gemma3n-iswa.cpp
+            models/glm4-moe.cpp
+            models/glm4.cpp
+            models/gpt2.cpp
+            models/gptneox.cpp
+            models/granite-hybrid.cpp
+            models/granite.cpp
+            models/grok.cpp
+            models/grovemoe.cpp
+            models/hunyuan-dense.cpp
+            models/hunyuan-moe.cpp
+            models/internlm2.cpp
+            models/jais.cpp
+            models/jamba.cpp
+            models/lfm2.cpp
+            models/llada-moe.cpp
+            models/llada.cpp
+            models/llama-iswa.cpp
+            models/llama.cpp
+            models/mamba.cpp
+            models/minicpm3.cpp
+            models/minimax-m2.cpp
+            models/mpt.cpp
+            models/nemotron-h.cpp
+            models/nemotron.cpp
+            models/neo-bert.cpp
+            models/olmo.cpp
+            models/olmo2.cpp
+            models/olmoe.cpp
+            models/openai-moe-iswa.cpp
+            models/openelm.cpp
+            models/orion.cpp
+            models/pangu-embedded.cpp
+            models/phi2.cpp
+            models/phi3.cpp
+            models/plamo.cpp
+            models/plamo2.cpp
+            models/plm.cpp
+            models/qwen.cpp
+            models/qwen2.cpp
+            models/qwen2moe.cpp
+            models/qwen2vl.cpp
+            models/qwen3.cpp
+            models/qwen3vl.cpp
+            models/qwen3vl-moe.cpp
+            models/qwen3moe.cpp
+            models/refact.cpp
+            models/rwkv6-base.cpp
+            models/rwkv6.cpp
+            models/rwkv6qwen2.cpp
+            models/rwkv7-base.cpp
+            models/rwkv7.cpp
+            models/seed-oss.cpp
+            models/smallthinker.cpp
+            models/smollm3.cpp
+            models/stablelm.cpp
+            models/starcoder.cpp
+            models/starcoder2.cpp
+            models/t5-dec.cpp
+            models/t5-enc.cpp
+            models/wavtokenizer-dec.cpp
+            models/xverse.cpp
+            models/graph-context-mamba.cpp
             )
 
 target_include_directories(llama PRIVATE .)