@fugood/llama.node 1.2.0-rc.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +16 -15
  2. package/src/llama.cpp/CMakeLists.txt +7 -0
  3. package/src/llama.cpp/common/arg.cpp +141 -21
  4. package/src/llama.cpp/common/chat.cpp +139 -0
  5. package/src/llama.cpp/common/chat.h +1 -0
  6. package/src/llama.cpp/common/common.h +23 -8
  7. package/src/llama.cpp/common/json-schema-to-grammar.cpp +28 -7
  8. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  9. package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
  10. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  11. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -6
  12. package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
  13. package/src/llama.cpp/ggml/include/ggml.h +10 -5
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -1
  21. package/src/llama.cpp/src/llama-arch.cpp +44 -10
  22. package/src/llama.cpp/src/llama-arch.h +9 -0
  23. package/src/llama.cpp/src/llama-chat.cpp +17 -0
  24. package/src/llama.cpp/src/llama-chat.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +13 -11
  26. package/src/llama.cpp/src/llama-graph.cpp +6 -5
  27. package/src/llama.cpp/src/llama-hparams.h +14 -3
  28. package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
  29. package/src/llama.cpp/src/llama-kv-cache.h +8 -0
  30. package/src/llama.cpp/src/llama-model.cpp +386 -140
  31. package/src/llama.cpp/src/llama-model.h +3 -0
  32. package/src/llama.cpp/src/llama-quant.cpp +6 -4
  33. package/src/llama.cpp/src/llama-vocab.cpp +13 -1
  34. package/src/llama.cpp/src/llama-vocab.h +1 -0
  35. package/src/llama.cpp/src/llama.cpp +53 -10
@@ -776,6 +776,24 @@ static void ggml_compute_forward_dup_f32(
                     id += ne00 * (ne01 - ir1);
                 }
             }
+        } else if (dst->type == GGML_TYPE_I32) {
+            size_t id = 0;
+            int32_t * dst_ptr = (int32_t *) dst->data;
+
+            for (int i03 = 0; i03 < ne03; i03++) {
+                for (int i02 = 0; i02 < ne02; i02++) {
+                    id += ne00 * ir0;
+                    for (int i01 = ir0; i01 < ir1; i01++) {
+                        for (int i00 = 0; i00 < ne00; i00++) {
+                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                            dst_ptr[id] = *src0_ptr;
+                            id++;
+                        }
+                    }
+                    id += ne00 * (ne01 - ir1);
+                }
+            }
         } else {
             GGML_ABORT("fatal error"); // TODO: implement
         }
@@ -947,6 +965,144 @@ static void ggml_compute_forward_dup_f32(
                 }
             }
         }
+    } else if (dst->type == GGML_TYPE_I32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(int32_t *) dst_ptr = *(const float *) src0_ptr;
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
+}
+
+static void ggml_compute_forward_dup_i32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // dst counters
+
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    // TODO: not optimal, but works
+    if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(float *) dst_ptr = *(const int32_t *) src0_ptr;
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
     } else {
         GGML_ABORT("fatal error"); // TODO: implement
     }
@@ -1177,6 +1333,10 @@ void ggml_compute_forward_dup(
             {
                 ggml_compute_forward_dup_f32(params, dst);
             } break;
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_dup_i32(params, dst);
+            } break;
         default:
             {
                 if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
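
Taken together, the three hunks above make F32 -> I32 (and the new I32 -> F32) copies a supported dup path on the CPU backend instead of hitting GGML_ABORT. A minimal usage sketch, not part of the diff and with made-up names and sizes, that would exercise the new F32 -> I32 branch through ggml_cpy:

    #include "ggml.h"
    #include "ggml-cpu.h"

    static void cast_f32_to_i32_example(void) {
        struct ggml_init_params ip = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * src = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 8);

        for (int i = 0; i < 8; ++i) {
            ((float *) src->data)[i] = i + 0.75f; // truncated toward zero by the new kernel
        }

        // ggml_cpy lowers to GGML_OP_CPY, which is computed by ggml_compute_forward_dup
        struct ggml_tensor * out = ggml_cpy(ctx, src, dst);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        ggml_free(ctx);
    }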
@@ -8438,7 +8598,7 @@ static void ggml_compute_forward_timestep_embedding_f32(
             embed_data[j + half] = sinf(arg);
         }
         if (dim % 2 != 0 && ith == 0) {
-            embed_data[dim] = 0.f;
+            embed_data[2 * half] = 0.f;
         }
     }
 }
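
The index change above only matters for odd dim: with half = dim / 2, the kernel writes the cos/sin pair at j and j + half (only the sin store is visible in this hunk), filling indices 0 .. 2*half - 1, so the single leftover slot is 2*half = dim - 1. For example, dim = 5 gives half = 2; indices 0-3 hold the embedding values and the zero pad belongs at index 4, whereas the old code wrote index 5, one element past that range.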
@@ -96,6 +96,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
     { LLM_ARCH_LLADA, "llada" },
+    { LLM_ARCH_LLADA_MOE, "llada-moe" },
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -137,7 +138,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
+    { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
+    { LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
     { LLM_KV_SWIN_NORM, "%s.swin_norm" },
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
@@ -168,19 +171,25 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+    { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
+    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 
-    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
-    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
-    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
-    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
-    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
-    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
-    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
-    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
-    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
-    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
+    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
+    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+    { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
+    { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
 
     { LLM_KV_SPLIT_NO, "split.no" },
     { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -397,12 +406,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
             { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
             { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
             { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
             { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
             { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
             { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
             { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
         },
@@ -2135,6 +2148,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_SEED_OSS,
         {
@@ -2415,6 +2448,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
             return true;
         default:
             return false;
@@ -100,6 +100,7 @@ enum llm_arch {
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
+    LLM_ARCH_LLADA_MOE,
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_UNKNOWN,
 };
@@ -141,7 +142,9 @@ enum llm_kv {
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
+    LLM_KV_DECODER_BLOCK_COUNT,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+    LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_SWIN_NORM,
     LLM_KV_RESCALE_EVERY_N_LAYERS,
@@ -172,6 +175,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_OUTPUT_SCALE,
+    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
@@ -185,6 +190,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
     LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+    LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
+    LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
 
     LLM_KV_SPLIT_NO,
     LLM_KV_SPLIT_COUNT,
@@ -70,6 +70,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
+    { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -204,6 +205,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     } else if (tmpl_contains("<seed:bos>")) {
         return LLM_CHAT_TEMPLATE_SEED_OSS;
+    } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
+        return LLM_CHAT_TEMPLATE_GROK_2;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -763,6 +766,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<seed:bos>assistant\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "System: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "user") {
+                ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << "<|separator|>\n\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
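
From the branch above, the new grok-2 template renders each turn as "<Role>: <content><|separator|>" followed by a blank line (system and user content is trimmed, assistant content is not), and add_ass appends a bare "Assistant:" generation prompt. A made-up two-turn conversation would therefore render as:

    System: You are a helpful assistant.<|separator|>

    Human: Hello!<|separator|>

    Assistant: Hi, how can I help?<|separator|>

    Assistant: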
@@ -50,6 +50,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_SEED_OSS,
+    LLM_CHAT_TEMPLATE_GROK_2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
@@ -35,10 +35,10 @@ llama_context::llama_context(
 
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch;
-    cparams.yarn_ext_factor = params.yarn_ext_factor;
-    cparams.yarn_attn_factor = params.yarn_attn_factor;
-    cparams.yarn_beta_fast = params.yarn_beta_fast;
-    cparams.yarn_beta_slow = params.yarn_beta_slow;
+    cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
+    cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
+    cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
     cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
     cparams.no_perf = params.no_perf;
@@ -181,7 +181,7 @@ llama_context::llama_context(
     // graph outputs buffer
     {
         // resized during inference when a batch uses more outputs
-        if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
+        if (output_reserve(params.n_seq_max) < params.n_seq_max) {
             throw std::runtime_error("failed to reserve initial output buffer");
         }
 
@@ -285,8 +285,8 @@ llama_context::llama_context(
     const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
-    // avoid reserving graphs with zero outputs
-    n_outputs = 1;
+    // avoid reserving graphs with zero outputs - assume one output per sequence
+    n_outputs = n_seqs;
 
     LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
 
@@ -1447,7 +1447,9 @@ ggml_status llama_context::graph_compute(
     if (backend_cpu != nullptr) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
         auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
-        set_threadpool_fn(backend_cpu, tp);
+        if (set_threadpool_fn) {
+            set_threadpool_fn(backend_cpu, tp);
+        }
     }
 
     // set the number of threads for all the backends
@@ -2261,9 +2263,9 @@ llama_context_params llama_context_default_params() {
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
         /*.yarn_ext_factor =*/ -1.0f,
-        /*.yarn_attn_factor =*/ 1.0f,
-        /*.yarn_beta_fast =*/ 32.0f,
-        /*.yarn_beta_slow =*/ 1.0f,
+        /*.yarn_attn_factor =*/ -1.0f,
+        /*.yarn_beta_fast =*/ -1.0f,
+        /*.yarn_beta_slow =*/ -1.0f,
         /*.yarn_orig_ctx =*/ 0,
         /*.defrag_thold =*/ -1.0f,
         /*.cb_eval =*/ nullptr,
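
With the two llama-context.cpp changes above, a YaRN parameter left at its new default of -1.0f in llama_context_params is treated as unset and the value stored in the model's GGUF metadata (hparams) is used, while an explicit non-negative value still takes precedence. A minimal sketch, not from the diff, assuming a llama_model * loaded elsewhere (e.g. via llama_model_load_from_file) and a made-up override value:

    #include "llama.h"

    llama_context * make_context(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();

        // yarn_ext_factor / yarn_attn_factor / yarn_beta_fast / yarn_beta_slow now all
        // default to -1.0f, so whatever the GGUF provides for this model is used as-is

        cparams.yarn_beta_fast = 64.0f; // any value >= 0.0f overrides the GGUF-provided hparams

        return llama_init_from_model(model, cparams);
    }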
@@ -1273,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     // split the batch into streams if needed
     const auto n_stream = k->ne[3];
 
-    q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
+    q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);
 
     q = ggml_permute(ctx0, q, 0, 2, 1, 3);
     k = ggml_permute(ctx0, k, 0, 2, 1, 3);
@@ -1335,14 +1335,14 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 
     if (arch == LLM_ARCH_GROK) {
         // need to do the following:
-        // multiply by attn_output_multiplyer of 0.08838834764831845
+        // multiply by attn_output_multiplier
         // and then :
         // kq = 30 * tanh(kq / 30)
         // before the softmax below
 
-        kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
+        kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
         cb(kq, "kq_tanh", il);
-        kq = ggml_scale(ctx0, kq, 30);
+        kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
         cb(kq, "kq_scaled", il);
     }
 
@@ -1431,7 +1431,8 @@ ggml_tensor * llm_graph_context::build_attn(
 
     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
-    assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
+    // but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
+    //assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
 
     ggml_tensor * q = q_cur;
     ggml_tensor * k = k_cur;
@@ -82,8 +82,9 @@ struct llama_hparams {
     float f_norm_rms_eps;
     float f_norm_group_eps;
 
-    float f_attn_logit_softcapping = 50.0f;
-    float f_final_logit_softcapping = 30.0f;
+    float f_attn_logit_softcapping = 50.0f;
+    float f_router_logit_softcapping = 30.0f;
+    float f_final_logit_softcapping = 30.0f;
 
     // for RWKV
     uint32_t rescale_every_n_layers = 0;
@@ -104,6 +105,11 @@ struct llama_hparams {
     uint32_t n_ctx_orig_yarn;
     float rope_yarn_log_mul = 0.0f;
 
+    float yarn_ext_factor = -1.0f;
+    float yarn_attn_factor = 1.0f;
+    float yarn_beta_fast = 32.0f;
+    float yarn_beta_slow = 1.0f;
+
     std::array<int, 4> rope_sections;
 
     // Sliding Window Attention (SWA)
@@ -136,10 +142,14 @@ struct llama_hparams {
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;
 
+    // grok-2
+    float f_attn_out_scale = 0.0f;
+    uint32_t attn_temp_length = 0;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
-    bool use_kq_norm = true;
+    bool use_kq_norm = false;
 
     // for Classifiers
     uint32_t n_cls_out = 1;
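
For reference, the generalized Grok branch in build_attn_mha above computes kq = softcap * tanh(kq * out_scale / softcap); with the previously hard-coded constants out_scale = 0.08838834764831845 (approximately 1/sqrt(128)) and softcap = 30, this reduces exactly to the old kq = 30 * tanh(kq * 0.08838834764831845 / 30). The new hparams fields f_attn_out_scale and attn_temp_length, presumably populated from the new %s.attention.output_scale and %s.attention.temperature_length keys, let grok-2 models supply their own values instead.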
@@ -159,6 +169,7 @@ struct llama_hparams {
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+    uint32_t dec_n_layer = 0;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1018,16 +1018,33 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm
 
     const int32_t ikv = map_layer_ids.at(il);
 
-    auto * k = layers[ikv].k;
+    ggml_tensor * k = layers[ikv].k;
+
+    const int64_t n_embd_head = k_cur->ne[0];
+    const int64_t n_head = k_cur->ne[1];
+    const int64_t n_tokens = k_cur->ne[2];
+
+    const int64_t n_embd_gqa = n_embd_head*n_head;
 
-    const int64_t n_tokens = k_cur->ne[2];
+    // we can merge dims 0 and 1
+    // TODO: add ggml helper function for this?
+    GGML_ASSERT(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);
 
-    k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
+    k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_tokens, k_cur->nb[2], 0);
 
-    if (k->ne[2] > 1) {
-        k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
+    const int64_t n_stream = k->ne[2];
+
+    if (n_stream > 1) {
+        const int64_t kv_size = get_size();
+
+        assert(n_embd_gqa == k->ne[0]);
+        assert(kv_size == k->ne[1]);
+
+        // merge the buffer across all streams because the idxs are global
+        k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
     }
 
+    // store the current K values into the cache
     return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }
 
@@ -1038,28 +1055,51 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
 
     auto * v = layers[ikv].v;
 
-    const int64_t n_embd_v_gqa = v_cur->ne[0]*v_cur->ne[1];
-    const int64_t n_tokens = v_cur->ne[2];
+    const int64_t n_embd_head = v_cur->ne[0];
+    const int64_t n_head = v_cur->ne[1];
+    const int64_t n_tokens = v_cur->ne[2];
+
+    const int64_t n_embd_gqa = n_embd_head*n_head;
 
-    v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
+    // we can merge dims 0 and 1
+    GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);
 
+    const int64_t n_stream = v->ne[2];
+
+    // take this branch when FA is enabled (the V cache is not transposed)
     if (!v_trans) {
-        if (v->ne[2] > 1) {
-            v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
+        v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_tokens, v_cur->nb[2], 0);
+
+        if (n_stream > 1) {
+            const int64_t kv_size = get_size();
+
+            assert(n_embd_gqa == v->ne[0]);
+            assert(kv_size == v->ne[1]);
+
+            // merge the buffer across all streams because the idxs are global
+            v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
         }
 
         return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }
 
+    if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
+        // we can merge dims 0, 1 and 2
+        v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens);
+    } else {
+        // otherwise -> make a copy to get contiguous data
+        v_cur = ggml_cont_2d (ctx, v_cur, n_embd_gqa, n_tokens);
+    }
+
     // [TAG_V_CACHE_VARIABLE]
-    if (n_embd_v_gqa < v->ne[0]) {
-        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    if (n_embd_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
     }
 
-    // the row becomes a single element
-    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
+    // in this branch the v_idxs are constructed in such a way that each row is a single head element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));
 
-    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));
 
     return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }
@@ -317,9 +317,17 @@ public:
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
 
     // store k_cur and v_cur in the cache based on the provided head location
+    // note: the heads in k_cur and v_cur should be layed out contiguously in memory
+    //   - k_cur  [n_embd_head_k, n_head_k, n_tokens]
+    //   - k_idxs [n_tokens]
+    //   - v_cur  [n_embd_head_v, n_head_v, n_tokens]
+    //   - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
 
+    // create destination indices for each head of the current batch for where it would be written in the KV cache
+    // the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
+    // helps understand the implementation logic of cpy_k and cpy_v
     ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
     ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;