cui-llama.rn 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/android/src/main/CMakeLists.txt +5 -7
  2. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
  3. package/android/src/main/jni.cpp +9 -9
  4. package/cpp/common.cpp +21 -40
  5. package/cpp/common.h +21 -12
  6. package/cpp/ggml-backend-impl.h +38 -20
  7. package/cpp/ggml-backend-reg.cpp +216 -87
  8. package/cpp/ggml-backend.h +1 -0
  9. package/cpp/ggml-common.h +42 -48
  10. package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +591 -152
  11. package/cpp/ggml-cpu-aarch64.h +2 -26
  12. package/cpp/ggml-cpu-traits.cpp +36 -0
  13. package/cpp/ggml-cpu-traits.h +38 -0
  14. package/cpp/ggml-cpu.c +14122 -13971
  15. package/cpp/ggml-cpu.cpp +618 -715
  16. package/cpp/ggml-cpu.h +0 -17
  17. package/cpp/ggml-impl.h +6 -6
  18. package/cpp/ggml-metal.m +482 -24
  19. package/cpp/ggml-quants.c +0 -9
  20. package/cpp/ggml-threading.h +4 -2
  21. package/cpp/ggml.c +132 -43
  22. package/cpp/ggml.h +44 -13
  23. package/cpp/llama-sampling.cpp +35 -90
  24. package/cpp/llama-vocab.cpp +2 -1
  25. package/cpp/llama.cpp +737 -233
  26. package/cpp/llama.h +20 -16
  27. package/cpp/sampling.cpp +11 -16
  28. package/cpp/speculative.cpp +4 -0
  29. package/cpp/unicode.cpp +51 -51
  30. package/cpp/unicode.h +9 -10
  31. package/lib/commonjs/index.js +38 -1
  32. package/lib/commonjs/index.js.map +1 -1
  33. package/lib/module/index.js +36 -0
  34. package/lib/module/index.js.map +1 -1
  35. package/lib/typescript/NativeRNLlama.d.ts +2 -3
  36. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  37. package/lib/typescript/index.d.ts +36 -2
  38. package/lib/typescript/index.d.ts.map +1 -1
  39. package/package.json +1 -1
  40. package/src/NativeRNLlama.ts +3 -3
  41. package/src/index.ts +46 -2
  42. package/cpp/amx/amx.cpp +0 -196
  43. package/cpp/amx/amx.h +0 -20
  44. package/cpp/amx/common.h +0 -101
  45. package/cpp/amx/mmq.cpp +0 -2524
  46. package/cpp/amx/mmq.h +0 -16
  47. package/cpp/ggml-aarch64.c +0 -129
  48. package/cpp/ggml-aarch64.h +0 -19
package/cpp/llama.cpp CHANGED
@@ -174,6 +174,7 @@ enum llm_arch {
174
174
  LLM_ARCH_QWEN,
175
175
  LLM_ARCH_QWEN2,
176
176
  LLM_ARCH_QWEN2MOE,
177
+ LLM_ARCH_QWEN2VL,
177
178
  LLM_ARCH_PHI2,
178
179
  LLM_ARCH_PHI3,
179
180
  LLM_ARCH_PLAMO,
@@ -194,6 +195,7 @@ enum llm_arch {
194
195
  LLM_ARCH_OLMOE,
195
196
  LLM_ARCH_OPENELM,
196
197
  LLM_ARCH_ARCTIC,
198
+ LLM_ARCH_DEEPSEEK,
197
199
  LLM_ARCH_DEEPSEEK2,
198
200
  LLM_ARCH_CHATGLM,
199
201
  LLM_ARCH_BITNET,
@@ -228,6 +230,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
228
230
  { LLM_ARCH_QWEN, "qwen" },
229
231
  { LLM_ARCH_QWEN2, "qwen2" },
230
232
  { LLM_ARCH_QWEN2MOE, "qwen2moe" },
233
+ { LLM_ARCH_QWEN2VL, "qwen2vl" },
231
234
  { LLM_ARCH_PHI2, "phi2" },
232
235
  { LLM_ARCH_PHI3, "phi3" },
233
236
  { LLM_ARCH_PLAMO, "plamo" },
@@ -248,6 +251,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
248
251
  { LLM_ARCH_OLMOE, "olmoe" },
249
252
  { LLM_ARCH_OPENELM, "openelm" },
250
253
  { LLM_ARCH_ARCTIC, "arctic" },
254
+ { LLM_ARCH_DEEPSEEK, "deepseek" },
251
255
  { LLM_ARCH_DEEPSEEK2, "deepseek2" },
252
256
  { LLM_ARCH_CHATGLM, "chatglm" },
253
257
  { LLM_ARCH_BITNET, "bitnet" },
@@ -319,6 +323,7 @@ enum llm_kv {
319
323
  LLM_KV_ATTENTION_SCALE,
320
324
 
321
325
  LLM_KV_ROPE_DIMENSION_COUNT,
326
+ LLM_KV_ROPE_DIMENSION_SECTIONS,
322
327
  LLM_KV_ROPE_FREQ_BASE,
323
328
  LLM_KV_ROPE_SCALE_LINEAR,
324
329
  LLM_KV_ROPE_SCALING_TYPE,
@@ -435,6 +440,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
435
440
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
436
441
 
437
442
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
443
+ { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
438
444
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
439
445
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
440
446
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -909,6 +915,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
909
915
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
910
916
  },
911
917
  },
918
+ {
919
+ LLM_ARCH_QWEN2VL,
920
+ {
921
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
922
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
923
+ { LLM_TENSOR_OUTPUT, "output" },
924
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
925
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
926
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
927
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
928
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
929
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
930
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
931
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
932
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
933
+ },
934
+ },
912
935
  {
913
936
  LLM_ARCH_QWEN2MOE,
914
937
  {
@@ -1047,6 +1070,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1047
1070
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1048
1071
  { LLM_TENSOR_OUTPUT, "output" },
1049
1072
  { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
1073
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
1074
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
1050
1075
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1051
1076
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1052
1077
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
@@ -1297,6 +1322,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1297
1322
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1298
1323
  },
1299
1324
  },
1325
+ {
1326
+ LLM_ARCH_DEEPSEEK,
1327
+ {
1328
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1329
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1330
+ { LLM_TENSOR_OUTPUT, "output" },
1331
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
1332
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1333
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1334
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1335
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1336
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1337
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
1338
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1339
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1340
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1341
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1342
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1343
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1344
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1345
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1346
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
1347
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
1348
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
1349
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
1350
+ },
1351
+ },
1300
1352
  {
1301
1353
  LLM_ARCH_DEEPSEEK2,
1302
1354
  {
@@ -1560,6 +1612,69 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1560
1612
  },
1561
1613
  };
1562
1614
 
1615
+ enum llm_chat_template {
1616
+ LLM_CHAT_TEMPLATE_CHATML,
1617
+ LLM_CHAT_TEMPLATE_LLAMA_2,
1618
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
1619
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
1620
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
1621
+ LLM_CHAT_TEMPLATE_MISTRAL_V1,
1622
+ LLM_CHAT_TEMPLATE_MISTRAL_V3,
1623
+ LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
1624
+ LLM_CHAT_TEMPLATE_MISTRAL_V7,
1625
+ LLM_CHAT_TEMPLATE_PHI_3,
1626
+ LLM_CHAT_TEMPLATE_ZEPHYR,
1627
+ LLM_CHAT_TEMPLATE_MONARCH,
1628
+ LLM_CHAT_TEMPLATE_GEMMA,
1629
+ LLM_CHAT_TEMPLATE_ORION,
1630
+ LLM_CHAT_TEMPLATE_OPENCHAT,
1631
+ LLM_CHAT_TEMPLATE_VICUNA,
1632
+ LLM_CHAT_TEMPLATE_VICUNA_ORCA,
1633
+ LLM_CHAT_TEMPLATE_DEEPSEEK,
1634
+ LLM_CHAT_TEMPLATE_DEEPSEEK_2,
1635
+ LLM_CHAT_TEMPLATE_COMMAND_R,
1636
+ LLM_CHAT_TEMPLATE_LLAMA_3,
1637
+ LLM_CHAT_TEMPLATE_CHATGML_3,
1638
+ LLM_CHAT_TEMPLATE_CHATGML_4,
1639
+ LLM_CHAT_TEMPLATE_MINICPM,
1640
+ LLM_CHAT_TEMPLATE_EXAONE_3,
1641
+ LLM_CHAT_TEMPLATE_RWKV_WORLD,
1642
+ LLM_CHAT_TEMPLATE_GRANITE,
1643
+ LLM_CHAT_TEMPLATE_GIGACHAT,
1644
+ LLM_CHAT_TEMPLATE_UNKNOWN,
1645
+ };
1646
+
1647
+ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
1648
+ { "chatml", LLM_CHAT_TEMPLATE_CHATML },
1649
+ { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
1650
+ { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
1651
+ { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
1652
+ { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
1653
+ { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
1654
+ { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
1655
+ { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
1656
+ { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
1657
+ { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
1658
+ { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
1659
+ { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
1660
+ { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
1661
+ { "orion", LLM_CHAT_TEMPLATE_ORION },
1662
+ { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
1663
+ { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
1664
+ { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
1665
+ { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
1666
+ { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
1667
+ { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
1668
+ { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
1669
+ { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
1670
+ { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
1671
+ { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
1672
+ { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
1673
+ { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
1674
+ { "granite", LLM_CHAT_TEMPLATE_GRANITE },
1675
+ { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
1676
+ };
1677
+
1563
1678
  static llm_arch llm_arch_from_string(const std::string & name) {
1564
1679
  for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
1565
1680
  if (kv.second == name) {
@@ -1633,9 +1748,10 @@ struct LLM_TN {
1633
1748
  //
1634
1749
 
1635
1750
  static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
1636
- { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
1637
- { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
1638
- { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
1751
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
1752
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
1753
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
1754
+ { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
1639
1755
  };
1640
1756
 
1641
1757
  static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
@@ -1741,7 +1857,7 @@ private:
1741
1857
  DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
1742
1858
  NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
1743
1859
  if (!bufLen) {
1744
- ret = format("Win32 error code: %s", error_code);
1860
+ ret = format("Win32 error code: %lx", error_code);
1745
1861
  } else {
1746
1862
  ret = lpMsgBuf;
1747
1863
  LocalFree(lpMsgBuf);
@@ -2079,7 +2195,7 @@ struct llama_mmap {
2079
2195
  HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
2080
2196
 
2081
2197
  // may fail on pre-Windows 8 systems
2082
- pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
2198
+ pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
2083
2199
 
2084
2200
  if (pPrefetchVirtualMemory) {
2085
2201
  // advise the kernel to preload the mapped memory
@@ -2421,11 +2537,12 @@ struct llama_hparams {
2421
2537
  uint32_t time_decay_extra_dim = 0;
2422
2538
  uint32_t wkv_head_size = 0;
2423
2539
 
2424
- float rope_attn_factor = 1.0f;
2425
- float rope_freq_base_train;
2426
- float rope_freq_scale_train;
2427
- uint32_t n_ctx_orig_yarn;
2428
- float rope_yarn_log_mul;
2540
+ float rope_attn_factor = 1.0f;
2541
+ float rope_freq_base_train;
2542
+ float rope_freq_scale_train;
2543
+ uint32_t n_ctx_orig_yarn;
2544
+ float rope_yarn_log_mul;
2545
+ int rope_sections[4];
2429
2546
 
2430
2547
  // for State Space Models
2431
2548
  uint32_t ssm_d_conv = 0;
@@ -2482,6 +2599,9 @@ struct llama_hparams {
2482
2599
 
2483
2600
  if (this->rope_finetuned != other.rope_finetuned) return true;
2484
2601
  if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
2602
+ if (std::equal(std::begin(this->rope_sections),
2603
+ std::end(this->rope_sections),
2604
+ std::begin(other.rope_sections))) return true;
2485
2605
 
2486
2606
  if (this->ssm_d_conv != other.ssm_d_conv) return true;
2487
2607
  if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -3325,6 +3445,11 @@ struct llama_context {
3325
3445
  // whether we are computing encoder output or decoder output
3326
3446
  bool is_encoding = false;
3327
3447
 
3448
+ // TODO: find a better way to accommodate mutli-dimension position encoding methods
3449
+ // number of position id each token get, 1 for each token in most cases.
3450
+ // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
3451
+ int n_pos_per_token = 1;
3452
+
3328
3453
  // output of the encoder part of the encoder-decoder models
3329
3454
  std::vector<float> embd_enc;
3330
3455
  std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -4525,9 +4650,6 @@ struct llama_model_loader {
4525
4650
  case LM_GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
4526
4651
  case LM_GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
4527
4652
  case LM_GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
4528
- case LM_GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
4529
- case LM_GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
4530
- case LM_GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
4531
4653
  default:
4532
4654
  {
4533
4655
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max));
@@ -5291,9 +5413,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
5291
5413
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
5292
5414
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
5293
5415
  case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
5294
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
5295
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
5296
- case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
5297
5416
 
5298
5417
  default: return "unknown, may not work";
5299
5418
  }
@@ -5530,8 +5649,12 @@ static void llm_load_hparams(
5530
5649
  case LLM_ARCH_MINICPM:
5531
5650
  {
5532
5651
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5652
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
5653
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
5654
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
5533
5655
 
5534
5656
  switch (hparams.n_layer) {
5657
+ case 52: model.type = e_model::MODEL_1B; break;
5535
5658
  case 40: model.type = e_model::MODEL_2B; break;
5536
5659
  default: model.type = e_model::MODEL_UNKNOWN;
5537
5660
  }
@@ -5696,6 +5819,13 @@ static void llm_load_hparams(
5696
5819
  default: model.type = e_model::MODEL_UNKNOWN;
5697
5820
  }
5698
5821
  } break;
5822
+ case LLM_ARCH_QWEN2VL:
5823
+ {
5824
+ std::array<int, 4> section_dims;
5825
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
5826
+ std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
5827
+ }
5828
+ // fall through
5699
5829
  case LLM_ARCH_QWEN2:
5700
5830
  {
5701
5831
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6006,6 +6136,19 @@ static void llm_load_hparams(
6006
6136
  model.type = e_model::MODEL_UNKNOWN;
6007
6137
  }
6008
6138
  } break;
6139
+ case LLM_ARCH_DEEPSEEK:
6140
+ {
6141
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
6142
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
6143
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
6144
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
6145
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
6146
+
6147
+ switch (hparams.n_layer) {
6148
+ case 28: model.type = e_model::MODEL_20B; break;
6149
+ default: model.type = e_model::MODEL_UNKNOWN;
6150
+ }
6151
+ } break;
6009
6152
  case LLM_ARCH_DEEPSEEK2:
6010
6153
  {
6011
6154
  bool is_lite = (hparams.n_layer == 27);
@@ -6352,6 +6495,7 @@ static void llm_load_vocab(
6352
6495
  tokenizer_pre == "phi-2" ||
6353
6496
  tokenizer_pre == "jina-es" ||
6354
6497
  tokenizer_pre == "jina-de" ||
6498
+ tokenizer_pre == "gigachat" ||
6355
6499
  tokenizer_pre == "jina-v1-en" ||
6356
6500
  tokenizer_pre == "jina-v2-es" ||
6357
6501
  tokenizer_pre == "jina-v2-de" ||
@@ -6422,6 +6566,9 @@ static void llm_load_vocab(
6422
6566
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
6423
6567
  vocab.tokenizer_add_bos = true;
6424
6568
  vocab.tokenizer_clean_spaces = false;
6569
+ } else if (
6570
+ tokenizer_pre == "minerva-7b") {
6571
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
6425
6572
  } else {
6426
6573
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
6427
6574
  }
@@ -7000,6 +7147,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
7000
7147
 
7001
7148
  LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
7002
7149
 
7150
+ if (model.arch == LLM_ARCH_DEEPSEEK) {
7151
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7152
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7153
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7154
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7155
+ }
7156
+
7003
7157
  if (model.arch == LLM_ARCH_DEEPSEEK2) {
7004
7158
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7005
7159
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -7015,7 +7169,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
7015
7169
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7016
7170
  }
7017
7171
 
7018
- if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
7172
+ if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
7019
7173
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7020
7174
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7021
7175
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7640,7 +7794,13 @@ static bool llm_load_tensors(
7640
7794
 
7641
7795
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7642
7796
 
7643
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7797
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
7798
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7799
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7800
+ }
7801
+ else {
7802
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7803
+ }
7644
7804
 
7645
7805
  if (n_expert == 0) {
7646
7806
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
@@ -8107,6 +8267,7 @@ static bool llm_load_tensors(
8107
8267
  }
8108
8268
  } break;
8109
8269
  case LLM_ARCH_QWEN2:
8270
+ case LLM_ARCH_QWEN2VL:
8110
8271
  {
8111
8272
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8112
8273
 
@@ -8767,6 +8928,55 @@ static bool llm_load_tensors(
8767
8928
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
8768
8929
  }
8769
8930
  } break;
8931
+ case LLM_ARCH_DEEPSEEK:
8932
+ {
8933
+
8934
+ const int64_t n_ff_exp = hparams.n_ff_exp;
8935
+ const int64_t n_expert_shared = hparams.n_expert_shared;
8936
+
8937
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8938
+
8939
+ // output
8940
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
8941
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
8942
+
8943
+ for (int i = 0; i < n_layer; ++i) {
8944
+ auto & layer = model.layers[i];
8945
+
8946
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
8947
+
8948
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
8949
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
8950
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
8951
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
8952
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
8953
+
8954
+ if (i < (int) hparams.n_layer_dense_lead) {
8955
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
8956
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
8957
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
8958
+ } else {
8959
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
8960
+
8961
+ if (n_expert == 0) {
8962
+ throw std::runtime_error("n_expert must be > 0");
8963
+ }
8964
+ if (n_expert_used == 0) {
8965
+ throw std::runtime_error("n_expert_used must be > 0");
8966
+ }
8967
+
8968
+ // MoE branch
8969
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
8970
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
8971
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
8972
+
8973
+ // Shared expert branch
8974
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
8975
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
8976
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
8977
+ }
8978
+ }
8979
+ } break;
8770
8980
  case LLM_ARCH_DEEPSEEK2:
8771
8981
  {
8772
8982
  const bool is_lite = (hparams.n_layer == 27);
@@ -12496,6 +12706,124 @@ struct llm_build_context {
12496
12706
  return gf;
12497
12707
  }
12498
12708
 
12709
+ struct lm_ggml_cgraph * build_qwen2vl() {
12710
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12711
+ const int64_t n_embd_head = hparams.n_embd_head_v;
12712
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
12713
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
12714
+
12715
+ struct lm_ggml_tensor * cur;
12716
+ struct lm_ggml_tensor * inpL;
12717
+
12718
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
12719
+
12720
+ // inp_pos - contains the positions
12721
+ lctx.inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens * 4);
12722
+ cb(lctx.inp_pos, "inp_pos", -1);
12723
+ lm_ggml_set_input(lctx.inp_pos);
12724
+ struct lm_ggml_tensor * inp_pos = lctx.inp_pos;
12725
+
12726
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
12727
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
12728
+ int sections[4];
12729
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
12730
+
12731
+ for (int il = 0; il < n_layer; ++il) {
12732
+ struct lm_ggml_tensor * inpSA = inpL;
12733
+
12734
+ // norm
12735
+ cur = llm_build_norm(ctx0, inpL, hparams,
12736
+ model.layers[il].attn_norm, NULL,
12737
+ LLM_NORM_RMS, cb, il);
12738
+ cb(cur, "attn_norm", il);
12739
+
12740
+ // self-attention
12741
+ {
12742
+ // compute Q and K and RoPE them
12743
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
12744
+ cb(Qcur, "Qcur", il);
12745
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
12746
+ cb(Qcur, "Qcur", il);
12747
+
12748
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
12749
+ cb(Kcur, "Kcur", il);
12750
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
12751
+ cb(Kcur, "Kcur", il);
12752
+
12753
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
12754
+ cb(Vcur, "Vcur", il);
12755
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
12756
+ cb(Vcur, "Vcur", il);
12757
+
12758
+ Qcur = lm_ggml_rope_multi(
12759
+ ctx0,
12760
+ lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
12761
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
12762
+ ext_factor, attn_factor, beta_fast, beta_slow
12763
+ );
12764
+ cb(Qcur, "Qcur", il);
12765
+
12766
+ Kcur = lm_ggml_rope_multi(
12767
+ ctx0,
12768
+ lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
12769
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
12770
+ ext_factor, attn_factor, beta_fast, beta_slow
12771
+ );
12772
+ cb(Kcur, "Kcur", il);
12773
+
12774
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12775
+ model.layers[il].wo, model.layers[il].bo,
12776
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
12777
+ }
12778
+
12779
+ if (il == n_layer - 1) {
12780
+ // skip computing output for unused tokens
12781
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
12782
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
12783
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
12784
+ }
12785
+
12786
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
12787
+ cb(ffn_inp, "ffn_inp", il);
12788
+
12789
+ // feed-forward network
12790
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
12791
+ model.layers[il].ffn_norm, NULL,
12792
+ LLM_NORM_RMS, cb, il);
12793
+ cb(cur, "ffn_norm", il);
12794
+
12795
+ cur = llm_build_ffn(ctx0, lctx, cur,
12796
+ model.layers[il].ffn_up, NULL, NULL,
12797
+ model.layers[il].ffn_gate, NULL, NULL,
12798
+ model.layers[il].ffn_down, NULL, NULL,
12799
+ NULL,
12800
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
12801
+ cb(cur, "ffn_out", il);
12802
+
12803
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
12804
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
12805
+ cb(cur, "l_out", il);
12806
+
12807
+ // input for next layer
12808
+ inpL = cur;
12809
+ }
12810
+
12811
+ cur = inpL;
12812
+
12813
+ cur = llm_build_norm(ctx0, cur, hparams,
12814
+ model.output_norm, NULL,
12815
+ LLM_NORM_RMS, cb, -1);
12816
+ cb(cur, "result_norm", -1);
12817
+
12818
+ // lm_head
12819
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
12820
+ cb(cur, "result_output", -1);
12821
+
12822
+ lm_ggml_build_forward_expand(gf, cur);
12823
+
12824
+ return gf;
12825
+ }
12826
+
12499
12827
  struct lm_ggml_cgraph * build_qwen2moe() {
12500
12828
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12501
12829
 
@@ -13447,21 +13775,18 @@ struct llm_build_context {
13447
13775
  return gf;
13448
13776
  }
13449
13777
 
13450
- // ref: https://arxiv.org/abs/2203.03466
13451
- // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
13452
- // based on the original build_llama() function
13453
- struct lm_ggml_cgraph * build_minicpm() {
13778
+ struct lm_ggml_cgraph * build_minicpm3() {
13454
13779
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13455
13780
 
13456
- const int64_t n_embd_head = hparams.n_embd_head_v;
13457
- LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13458
- LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
13459
-
13460
- const int64_t n_embd = hparams.n_embd;
13461
13781
  //TODO: if the model varies, these parameters need to be read from the model
13462
13782
  const int64_t n_embd_base = 256;
13463
13783
  const float scale_embd = 12.0f;
13464
13784
  const float scale_depth = 1.4f;
13785
+ const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
13786
+
13787
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
13788
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
13789
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
13465
13790
 
13466
13791
  struct lm_ggml_tensor * cur;
13467
13792
  struct lm_ggml_tensor * inpL;
@@ -13481,163 +13806,19 @@ struct llm_build_context {
13481
13806
  for (int il = 0; il < n_layer; ++il) {
13482
13807
  struct lm_ggml_tensor * inpSA = inpL;
13483
13808
 
13809
+ struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
13484
13810
  // norm
13485
13811
  cur = llm_build_norm(ctx0, inpL, hparams,
13486
13812
  model.layers[il].attn_norm, NULL,
13487
13813
  LLM_NORM_RMS, cb, il);
13488
13814
  cb(cur, "attn_norm", il);
13489
13815
 
13490
- // self-attention
13816
+ // self_attention
13491
13817
  {
13492
- // compute Q and K and RoPE them
13493
- struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
13494
- cb(Qcur, "Qcur", il);
13495
- if (model.layers[il].bq) {
13496
- Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
13497
- cb(Qcur, "Qcur", il);
13498
- }
13499
-
13500
- struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
13501
- cb(Kcur, "Kcur", il);
13502
- if (model.layers[il].bk) {
13503
- Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
13504
- cb(Kcur, "Kcur", il);
13505
- }
13506
-
13507
- struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13508
- cb(Vcur, "Vcur", il);
13509
- if (model.layers[il].bv) {
13510
- Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
13511
- cb(Vcur, "Vcur", il);
13512
- }
13513
-
13514
- Qcur = lm_ggml_rope_ext(
13515
- ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
13516
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13517
- ext_factor, attn_factor, beta_fast, beta_slow
13518
- );
13519
- cb(Qcur, "Qcur", il);
13520
-
13521
- Kcur = lm_ggml_rope_ext(
13522
- ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
13523
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13524
- ext_factor, attn_factor, beta_fast, beta_slow
13525
- );
13526
- cb(Kcur, "Kcur", il);
13527
-
13528
- cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13529
- model.layers[il].wo, model.layers[il].bo,
13530
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13531
- }
13532
-
13533
- if (il == n_layer - 1) {
13534
- // skip computing output for unused tokens
13535
- struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13536
- cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13537
- inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13538
- }
13539
-
13540
- // scale_res - scale the hidden states for residual connection
13541
- const float scale_res = scale_depth/sqrtf(float(n_layer));
13542
- cur = lm_ggml_scale(ctx0, cur, scale_res);
13543
- cb(cur, "hidden_scaled", -1);
13544
-
13545
- struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
13546
- cb(ffn_inp, "ffn_inp", il);
13547
-
13548
- // feed-forward network
13549
- {
13550
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
13551
- model.layers[il].ffn_norm, NULL,
13552
- LLM_NORM_RMS, cb, il);
13553
- cb(cur, "ffn_norm", il);
13554
-
13555
- cur = llm_build_ffn(ctx0, lctx, cur,
13556
- model.layers[il].ffn_up, NULL, NULL,
13557
- model.layers[il].ffn_gate, NULL, NULL,
13558
- model.layers[il].ffn_down, NULL, NULL,
13559
- NULL,
13560
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
13561
- cb(cur, "ffn_out", il);
13562
- }
13563
-
13564
- // scale the hidden states for residual connection
13565
- cur = lm_ggml_scale(ctx0, cur, scale_res);
13566
- cb(cur, "hidden_scaled_ffn", -1);
13567
-
13568
- cur = lm_ggml_add(ctx0, cur, ffn_inp);
13569
- cur = lctx.cvec.apply_to(ctx0, cur, il);
13570
- cb(cur, "l_out", il);
13571
-
13572
- // input for next layer
13573
- inpL = cur;
13574
- }
13575
-
13576
- cur = inpL;
13577
-
13578
- cur = llm_build_norm(ctx0, cur, hparams,
13579
- model.output_norm, NULL,
13580
- LLM_NORM_RMS, cb, -1);
13581
- cb(cur, "result_norm", -1);
13582
-
13583
- // lm_head scaling
13584
- const float scale_lmhead = float(n_embd_base)/float(n_embd);
13585
- cur = lm_ggml_scale(ctx0, cur, scale_lmhead);
13586
- cb(cur, "lmhead_scaling", -1);
13587
-
13588
- // lm_head
13589
- cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13590
- cb(cur, "result_output", -1);
13591
-
13592
- lm_ggml_build_forward_expand(gf, cur);
13593
-
13594
- return gf;
13595
- }
13596
-
13597
- struct lm_ggml_cgraph * build_minicpm3() {
13598
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13599
-
13600
- //TODO: if the model varies, these parameters need to be read from the model
13601
- const int64_t n_embd_base = 256;
13602
- const float scale_embd = 12.0f;
13603
- const float scale_depth = 1.4f;
13604
- const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
13605
-
13606
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
13607
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
13608
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
13609
-
13610
- struct lm_ggml_tensor * cur;
13611
- struct lm_ggml_tensor * inpL;
13612
-
13613
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
13614
-
13615
- // scale the input embeddings
13616
- inpL = lm_ggml_scale(ctx0, inpL, scale_embd);
13617
- cb(inpL, "inp_scaled", -1);
13618
-
13619
- // inp_pos - contains the positions
13620
- struct lm_ggml_tensor * inp_pos = build_inp_pos();
13621
-
13622
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13623
- struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
13624
-
13625
- for (int il = 0; il < n_layer; ++il) {
13626
- struct lm_ggml_tensor * inpSA = inpL;
13627
-
13628
- struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
13629
- // norm
13630
- cur = llm_build_norm(ctx0, inpL, hparams,
13631
- model.layers[il].attn_norm, NULL,
13632
- LLM_NORM_RMS, cb, il);
13633
- cb(cur, "attn_norm", il);
13634
-
13635
- // self_attention
13636
- {
13637
- struct lm_ggml_tensor * q = NULL;
13638
- // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
13639
- q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
13640
- cb(q, "q", il);
13818
+ struct lm_ggml_tensor * q = NULL;
13819
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
13820
+ q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
13821
+ cb(q, "q", il);
13641
13822
 
13642
13823
  q = llm_build_norm(ctx0, q, hparams,
13643
13824
  model.layers[il].attn_q_a_norm, NULL,
@@ -15150,6 +15331,161 @@ struct llm_build_context {
15150
15331
  return gf;
15151
15332
  }
15152
15333
 
15334
+ struct lm_ggml_cgraph * build_deepseek() {
15335
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15336
+
15337
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
15338
+ int32_t n_tokens = this->n_tokens;
15339
+
15340
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15341
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15342
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
15343
+
15344
+ struct lm_ggml_tensor * cur;
15345
+ struct lm_ggml_tensor * inpL;
15346
+
15347
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
15348
+
15349
+ // inp_pos - contains the positions
15350
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
15351
+
15352
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
15353
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
15354
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15355
+ for (int il = 0; il < n_layer; ++il) {
15356
+ struct lm_ggml_tensor * inpSA = inpL;
15357
+
15358
+ // norm
15359
+ cur = llm_build_norm(ctx0, inpL, hparams,
15360
+ model.layers[il].attn_norm, NULL,
15361
+ LLM_NORM_RMS, cb, il);
15362
+ cb(cur, "attn_norm", il);
15363
+
15364
+ // self-attention
15365
+ {
15366
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
15367
+ struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
15368
+
15369
+ // compute Q and K and RoPE them
15370
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
15371
+ cb(Qcur, "Qcur", il);
15372
+ if (model.layers[il].bq) {
15373
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
15374
+ cb(Qcur, "Qcur", il);
15375
+ }
15376
+
15377
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
15378
+ cb(Kcur, "Kcur", il);
15379
+ if (model.layers[il].bk) {
15380
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
15381
+ cb(Kcur, "Kcur", il);
15382
+ }
15383
+
15384
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
15385
+ cb(Vcur, "Vcur", il);
15386
+ if (model.layers[il].bv) {
15387
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
15388
+ cb(Vcur, "Vcur", il);
15389
+ }
15390
+
15391
+ Qcur = lm_ggml_rope_ext(
15392
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
15393
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15394
+ ext_factor, attn_factor, beta_fast, beta_slow
15395
+ );
15396
+ cb(Qcur, "Qcur", il);
15397
+
15398
+ Kcur = lm_ggml_rope_ext(
15399
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
15400
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15401
+ ext_factor, attn_factor, beta_fast, beta_slow
15402
+ );
15403
+ cb(Kcur, "Kcur", il);
15404
+
15405
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
15406
+ model.layers[il].wo, model.layers[il].bo,
15407
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
15408
+ }
15409
+
15410
+ if (il == n_layer - 1) {
15411
+ // skip computing output for unused tokens
15412
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
15413
+ n_tokens = n_outputs;
15414
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
15415
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
15416
+ }
15417
+
15418
+
15419
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
15420
+ cb(ffn_inp, "ffn_inp", il);
15421
+
15422
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
15423
+ model.layers[il].ffn_norm, NULL,
15424
+ LLM_NORM_RMS, cb, il);
15425
+ cb(cur, "ffn_norm", il);
15426
+
15427
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
15428
+ cur = llm_build_ffn(ctx0, lctx, cur,
15429
+ model.layers[il].ffn_up, NULL, NULL,
15430
+ model.layers[il].ffn_gate, NULL, NULL,
15431
+ model.layers[il].ffn_down, NULL, NULL,
15432
+ NULL,
15433
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15434
+ cb(cur, "ffn_out", il);
15435
+ } else {
15436
+ // MoE branch
15437
+ lm_ggml_tensor * moe_out =
15438
+ llm_build_moe_ffn(ctx0, lctx, cur,
15439
+ model.layers[il].ffn_gate_inp,
15440
+ model.layers[il].ffn_up_exps,
15441
+ model.layers[il].ffn_gate_exps,
15442
+ model.layers[il].ffn_down_exps,
15443
+ n_expert, n_expert_used,
15444
+ LLM_FFN_SILU, false,
15445
+ false, hparams.expert_weights_scale,
15446
+ cb, il);
15447
+ cb(moe_out, "ffn_moe_out", il);
15448
+
15449
+ // FFN shared expert
15450
+ {
15451
+ lm_ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
15452
+ model.layers[il].ffn_up_shexp, NULL, NULL,
15453
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
15454
+ model.layers[il].ffn_down_shexp, NULL, NULL,
15455
+ NULL,
15456
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15457
+ cb(ffn_shexp, "ffn_shexp", il);
15458
+
15459
+ cur = lm_ggml_add(ctx0, moe_out, ffn_shexp);
15460
+ cb(cur, "ffn_out", il);
15461
+ }
15462
+ }
15463
+
15464
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
15465
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
15466
+ cb(cur, "l_out", il);
15467
+
15468
+ // input for next layer
15469
+ inpL = cur;
15470
+ }
15471
+
15472
+ cur = inpL;
15473
+
15474
+ cur = llm_build_norm(ctx0, cur, hparams,
15475
+ model.output_norm, NULL,
15476
+ LLM_NORM_RMS, cb, -1);
15477
+ cb(cur, "result_norm", -1);
15478
+
15479
+ // lm_head
15480
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
15481
+
15482
+ cb(cur, "result_output", -1);
15483
+
15484
+ lm_ggml_build_forward_expand(gf, cur);
15485
+
15486
+ return gf;
15487
+ }
15488
+
15153
15489
  struct lm_ggml_cgraph * build_deepseek2() {
15154
15490
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15155
15491
 
@@ -16692,6 +17028,7 @@ static struct lm_ggml_cgraph * llama_build_graph(
16692
17028
 
16693
17029
  switch (model.arch) {
16694
17030
  case LLM_ARCH_LLAMA:
17031
+ case LLM_ARCH_MINICPM:
16695
17032
  case LLM_ARCH_GRANITE:
16696
17033
  case LLM_ARCH_GRANITE_MOE:
16697
17034
  {
@@ -16743,6 +17080,11 @@ static struct lm_ggml_cgraph * llama_build_graph(
16743
17080
  {
16744
17081
  result = llm.build_qwen2();
16745
17082
  } break;
17083
+ case LLM_ARCH_QWEN2VL:
17084
+ {
17085
+ lctx.n_pos_per_token = 4;
17086
+ result = llm.build_qwen2vl();
17087
+ } break;
16746
17088
  case LLM_ARCH_QWEN2MOE:
16747
17089
  {
16748
17090
  result = llm.build_qwen2moe();
@@ -16775,10 +17117,6 @@ static struct lm_ggml_cgraph * llama_build_graph(
16775
17117
  {
16776
17118
  result = llm.build_internlm2();
16777
17119
  } break;
16778
- case LLM_ARCH_MINICPM:
16779
- {
16780
- result = llm.build_minicpm();
16781
- } break;
16782
17120
  case LLM_ARCH_MINICPM3:
16783
17121
  {
16784
17122
  result = llm.build_minicpm3();
@@ -16835,6 +17173,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
16835
17173
  {
16836
17174
  result = llm.build_arctic();
16837
17175
  } break;
17176
+ case LLM_ARCH_DEEPSEEK:
17177
+ {
17178
+ result = llm.build_deepseek();
17179
+ } break;
16838
17180
  case LLM_ARCH_DEEPSEEK2:
16839
17181
  {
16840
17182
  result = llm.build_deepseek2();
@@ -16965,8 +17307,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
16965
17307
 
16966
17308
  if (ubatch.pos && lctx.inp_pos) {
16967
17309
  const int64_t n_tokens = ubatch.n_tokens;
16968
-
16969
- lm_ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*lm_ggml_element_size(lctx.inp_pos));
17310
+ auto n_pos = lctx.n_pos_per_token;
17311
+ lm_ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*lm_ggml_element_size(lctx.inp_pos));
16970
17312
  }
16971
17313
 
16972
17314
  if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -18451,10 +18793,6 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
18451
18793
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
18452
18794
  new_type = LM_GGML_TYPE_IQ3_S;
18453
18795
  }
18454
- else if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8 ||
18455
- new_type == LM_GGML_TYPE_Q4_0_8_8) {
18456
- new_type = LM_GGML_TYPE_Q4_0;
18457
- }
18458
18796
  else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
18459
18797
  new_type = LM_GGML_TYPE_Q4_K;
18460
18798
  }
@@ -18777,9 +19115,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18777
19115
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = LM_GGML_TYPE_IQ4_XS; break;
18778
19116
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = LM_GGML_TYPE_IQ3_S; break;
18779
19117
  case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = LM_GGML_TYPE_IQ3_S; break;
18780
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = LM_GGML_TYPE_Q4_0_4_4; break;
18781
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = LM_GGML_TYPE_Q4_0_4_8; break;
18782
- case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = LM_GGML_TYPE_Q4_0_8_8; break;
18783
19118
 
18784
19119
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
18785
19120
  }
@@ -19118,14 +19453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
19118
19453
  f32_data = (float *) f32_conv_buf.data();
19119
19454
  }
19120
19455
 
19121
- int chunk_size_multiplier = 1;
19122
- if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8 || new_type == LM_GGML_TYPE_Q4_0_8_8) {
19123
- if ((new_type == LM_GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = LM_GGML_TYPE_Q4_0;
19124
- else if (tensor->ne[1] % 4 != 0) new_type = LM_GGML_TYPE_Q4_0;
19125
- if (new_type == LM_GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
19126
- else if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
19127
- }
19128
-
19129
19456
  LLAMA_LOG_INFO("converting to %s .. ", lm_ggml_type_name(new_type));
19130
19457
  fflush(stdout);
19131
19458
 
@@ -19138,8 +19465,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
19138
19465
  const int64_t nrows = tensor->ne[1];
19139
19466
 
19140
19467
  static const int64_t min_chunk_size = 32 * 512;
19141
- const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
19142
- chunk_size_multiplier;
19468
+ const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
19143
19469
 
19144
19470
  const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
19145
19471
  const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -20082,6 +20408,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
20082
20408
  case LLM_ARCH_COMMAND_R:
20083
20409
  case LLM_ARCH_OLMO:
20084
20410
  case LLM_ARCH_ARCTIC:
20411
+ case LLM_ARCH_DEEPSEEK:
20085
20412
  case LLM_ARCH_DEEPSEEK2:
20086
20413
  case LLM_ARCH_CHATGLM:
20087
20414
  case LLM_ARCH_GRANITE:
@@ -20115,6 +20442,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
20115
20442
  case LLM_ARCH_MINICPM3:
20116
20443
  return LLAMA_ROPE_TYPE_NEOX;
20117
20444
 
20445
+ case LLM_ARCH_QWEN2VL:
20446
+ return LLAMA_ROPE_TYPE_MROPE;
20447
+
20118
20448
  // all model arches should be listed explicitly here
20119
20449
  case LLM_ARCH_UNKNOWN:
20120
20450
  LM_GGML_ABORT("unknown architecture");
@@ -21683,7 +22013,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
21683
22013
  throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
21684
22014
  }
21685
22015
  } else if ((size_t) i >= ctx->output_ids.size()) {
21686
- throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
22016
+ throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
21687
22017
  } else {
21688
22018
  j = ctx->output_ids[i];
21689
22019
  }
@@ -21854,18 +22184,111 @@ int32_t llama_detokenize(
21854
22184
  // chat templates
21855
22185
  //
21856
22186
 
22187
+ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
22188
+ if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
22189
+ return LLM_CHAT_TEMPLATES.at(tmpl);
22190
+ }
22191
+ auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
22192
+ return tmpl.find(haystack) != std::string::npos;
22193
+ };
22194
+ if (tmpl_contains("<|im_start|>")) {
22195
+ return LLM_CHAT_TEMPLATE_CHATML;
22196
+ } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
22197
+ if (tmpl_contains("[SYSTEM_PROMPT]")) {
22198
+ return LLM_CHAT_TEMPLATE_MISTRAL_V7;
22199
+ } else if (
22200
+ // catches official 'v1' template
22201
+ tmpl_contains("' [INST] ' + system_message")
22202
+ // catches official 'v3' and 'v3-tekken' templates
22203
+ || tmpl_contains("[AVAILABLE_TOOLS]")
22204
+ ) {
22205
+ // Official mistral 'v1', 'v3' and 'v3-tekken' templates
22206
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
22207
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
22208
+ if (tmpl_contains(" [INST]")) {
22209
+ return LLM_CHAT_TEMPLATE_MISTRAL_V1;
22210
+ } else if (tmpl_contains("\"[INST]\"")) {
22211
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
22212
+ }
22213
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3;
22214
+ } else {
22215
+ // llama2 template and its variants
22216
+ // [variant] support system message
22217
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
22218
+ bool support_system_message = tmpl_contains("<<SYS>>");
22219
+ bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
22220
+ bool strip_message = tmpl_contains("content.strip()");
22221
+ if (strip_message) {
22222
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
22223
+ } else if (add_bos_inside_history) {
22224
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
22225
+ } else if (support_system_message) {
22226
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
22227
+ } else {
22228
+ return LLM_CHAT_TEMPLATE_LLAMA_2;
22229
+ }
22230
+ }
22231
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
22232
+ return LLM_CHAT_TEMPLATE_PHI_3;
22233
+ } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
22234
+ return LLM_CHAT_TEMPLATE_ZEPHYR;
22235
+ } else if (tmpl_contains("bos_token + message['role']")) {
22236
+ return LLM_CHAT_TEMPLATE_MONARCH;
22237
+ } else if (tmpl_contains("<start_of_turn>")) {
22238
+ return LLM_CHAT_TEMPLATE_GEMMA;
22239
+ } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
22240
+ // OrionStarAI/Orion-14B-Chat
22241
+ return LLM_CHAT_TEMPLATE_ORION;
22242
+ } else if (tmpl_contains("GPT4 Correct ")) {
22243
+ // openchat/openchat-3.5-0106
22244
+ return LLM_CHAT_TEMPLATE_OPENCHAT;
22245
+ } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
22246
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
22247
+ if (tmpl_contains("SYSTEM: ")) {
22248
+ return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
22249
+ }
22250
+ return LLM_CHAT_TEMPLATE_VICUNA;
22251
+ } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
22252
+ // deepseek-ai/deepseek-coder-33b-instruct
22253
+ return LLM_CHAT_TEMPLATE_DEEPSEEK;
22254
+ } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
22255
+ // CohereForAI/c4ai-command-r-plus
22256
+ return LLM_CHAT_TEMPLATE_COMMAND_R;
22257
+ } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
22258
+ return LLM_CHAT_TEMPLATE_LLAMA_3;
22259
+ } else if (tmpl_contains("[gMASK]sop")) {
22260
+ // chatglm3-6b
22261
+ return LLM_CHAT_TEMPLATE_CHATGML_3;
22262
+ } else if (tmpl_contains("[gMASK]<sop>")) {
22263
+ return LLM_CHAT_TEMPLATE_CHATGML_4;
22264
+ } else if (tmpl_contains(LU8("<用户>"))) {
22265
+ // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
22266
+ return LLM_CHAT_TEMPLATE_MINICPM;
22267
+ } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
22268
+ return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
22269
+ } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
22270
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
22271
+ // EXAONE-3.0-7.8B-Instruct
22272
+ return LLM_CHAT_TEMPLATE_EXAONE_3;
22273
+ } else if (tmpl_contains("rwkv-world")) {
22274
+ return LLM_CHAT_TEMPLATE_RWKV_WORLD;
22275
+ } else if (tmpl_contains("<|start_of_role|>")) {
22276
+ return LLM_CHAT_TEMPLATE_GRANITE;
22277
+ } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
22278
+ return LLM_CHAT_TEMPLATE_GIGACHAT;
22279
+ }
22280
+ return LLM_CHAT_TEMPLATE_UNKNOWN;
22281
+ }
22282
+
21857
22283
  // Simple version of "llama_apply_chat_template" that only works with strings
21858
22284
  // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
21859
22285
  static int32_t llama_chat_apply_template_internal(
21860
- const std::string & tmpl,
22286
+ const llm_chat_template tmpl,
21861
22287
  const std::vector<const llama_chat_message *> & chat,
21862
22288
  std::string & dest, bool add_ass) {
21863
22289
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
21864
22290
  std::stringstream ss;
21865
- auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
21866
- return tmpl.find(haystack) != std::string::npos;
21867
- };
21868
- if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
22291
+ if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
21869
22292
  // chatml template
21870
22293
  for (auto message : chat) {
21871
22294
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -21873,16 +22296,59 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|im_start|>assistant\n";
  }
- } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+ // Official mistral 'v7' template
+ // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+ for (auto message : chat) {
+ std::string role(message->role);
+ std::string content(message->content);
+ if (role == "system") {
+ ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+ } else if (role == "user") {
+ ss << "[INST] " << content << "[/INST]";
+ }
+ else {
+ ss << " " << content << "</s>";
+ }
+ }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+ std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
+ std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
+ bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
+ bool is_inside_turn = false;
+ for (auto message : chat) {
+ if (!is_inside_turn) {
+ ss << leading_space << "[INST]" << trailing_space;
+ is_inside_turn = true;
+ }
+ std::string role(message->role);
+ std::string content(message->content);
+ if (role == "system") {
+ ss << content << "\n\n";
+ } else if (role == "user") {
+ ss << content << leading_space << "[/INST]";
+ } else {
+ ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
+ is_inside_turn = false;
+ }
+ }
+ } else if (
+ tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
  // llama2 template and its variants
  // [variant] support system message
- bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
- // [variant] space before + after response
- bool space_around_response = tmpl_contains("' ' + eos_token");
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+ bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
  // [variant] add BOS inside history
- bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
+ bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
  // [variant] trim spaces from the input message
- bool strip_message = tmpl_contains("content.strip()");
+ bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
  // construct the prompt
  bool is_inside_turn = true; // skip BOS at the beginning
  ss << "[INST] ";
@@ -21903,12 +22369,11 @@ static int32_t llama_chat_apply_template_internal(
  } else if (role == "user") {
  ss << content << " [/INST]";
  } else {
- ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+ ss << content << "</s>";
  is_inside_turn = false;
  }
  }
- // llama2 templates seem to not care about "add_generation_prompt"
- } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
  // Phi 3
  for (auto message : chat) {
  std::string role(message->role);
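Taken together, the LLAMA_2_* cases introduced in this and the previous hunk replace the old substring probes one-for-one. A hand-derived summary of what each variant adds (not text from the release):

// LLM_CHAT_TEMPLATE_LLAMA_2           - no system message support
// LLM_CHAT_TEMPLATE_LLAMA_2_SYS       - system message wrapped in <<SYS>> ... <</SYS>>
// LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS   - additionally re-inserts BOS inside the history
// LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP - additionally strips whitespace from each message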
@@ -21917,7 +22382,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|assistant|>\n";
  }
- } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
  // zephyr template
  for (auto message : chat) {
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -21925,7 +22390,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|assistant|>\n";
  }
- } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
  for (auto message : chat) {
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -21934,7 +22399,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<s>assistant\n";
  }
- } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
  // google/gemma-7b-it
  std::string system_prompt = "";
  for (auto message : chat) {
@@ -21956,7 +22421,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<start_of_turn>model\n";
  }
- } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
  // OrionStarAI/Orion-14B-Chat
  std::string system_prompt = "";
  for (auto message : chat) {
@@ -21976,7 +22441,7 @@ static int32_t llama_chat_apply_template_internal(
  ss << message->content << "</s>";
  }
  }
- } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
  // openchat/openchat-3.5-0106,
  for (auto message : chat) {
  std::string role(message->role);
@@ -21990,13 +22455,13 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "GPT4 Correct Assistant:";
  }
- } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
  // eachadea/vicuna-13b-1.1 (and Orca variant)
  for (auto message : chat) {
  std::string role(message->role);
  if (role == "system") {
  // Orca-Vicuna variant uses a system prefix
- if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
+ if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
  ss << "SYSTEM: " << message->content << "\n";
  } else {
  ss << message->content << "\n\n";
@@ -22010,7 +22475,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "ASSISTANT:";
  }
- } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
  // deepseek-ai/deepseek-coder-33b-instruct
  for (auto message : chat) {
  std::string role(message->role);
@@ -22025,7 +22490,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "### Response:\n";
  }
- } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
  // CohereForAI/c4ai-command-r-plus
  for (auto message : chat) {
  std::string role(message->role);
@@ -22040,7 +22505,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
  }
- } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
  // Llama 3
  for (auto message : chat) {
  std::string role(message->role);
@@ -22049,7 +22514,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
  }
- } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
  // chatglm3-6b
  ss << "[gMASK]" << "sop";
  for (auto message : chat) {
@@ -22059,7 +22524,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|assistant|>";
  }
- } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
  ss << "[gMASK]" << "<sop>";
  for (auto message : chat) {
  std::string role(message->role);
@@ -22068,7 +22533,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|assistant|>";
  }
- } else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
  // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
  for (auto message : chat) {
  std::string role(message->role);
@@ -22080,7 +22545,7 @@ static int32_t llama_chat_apply_template_internal(
  ss << trim(message->content);
  }
  }
- } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
  // DeepSeek-V2
  for (auto message : chat) {
  std::string role(message->role);
@@ -22095,7 +22560,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "Assistant:";
  }
- } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
  // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
  // EXAONE-3.0-7.8B-Instruct
  for (auto message : chat) {
@@ -22111,7 +22576,7 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "[|assistant|]";
  }
- } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
  // this template requires the model to have "\n\n" as EOT token
  for (auto message : chat) {
  std::string role(message->role);
@@ -22121,7 +22586,7 @@ static int32_t llama_chat_apply_template_internal(
  ss << message->content << "\n\n";
  }
  }
- } else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) {
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
  // IBM Granite template
  for (const auto & message : chat) {
  std::string role(message->role);
@@ -22134,6 +22599,32 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|start_of_role|>assistant<|end_of_role|>\n";
  }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+ // GigaChat template
+ bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+ // Handle system message if present
+ if (has_system) {
+ ss << "<s>" << chat[0]->content << "<|message_sep|>";
+ } else {
+ ss << "<s>";
+ }
+
+ // Process remaining messages
+ for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+ std::string role(chat[i]->role);
+ if (role == "user") {
+ ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+ << "available functions<|role_sep|>[]<|message_sep|>";
+ } else if (role == "assistant") {
+ ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+ }
+ }
+
+ // Add generation prompt if needed
+ if (add_ass) {
+ ss << "assistant<|role_sep|>";
+ }
  } else {
  // template not supported
  return -1;
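A worked example of the GigaChat branch above, derived by hand for a chat of one system and one user message with add_ass enabled (illustrative, not captured output):

// chat = [{ "system", "S" }, { "user", "U" }], add_ass == true produces, concatenated:
//
//   "<s>S<|message_sep|>"
//   "user<|role_sep|>U<|message_sep|>"
//   "available functions<|role_sep|>[]<|message_sep|>"
//   "assistant<|role_sep|>"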
@@ -22173,7 +22664,11 @@ int32_t llama_chat_apply_template(
  }
 
  std::string formatted_chat;
- int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+ llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
+ if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
+ return -1;
+ }
+ int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
  if (res < 0) {
  return res;
  }
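With this change, a template that the heuristics cannot classify makes the public call fail fast with -1 instead of being passed through as a raw string, so callers that want the old permissiveness need an explicit fallback. Below is a minimal caller-side sketch, assuming the llama_chat_apply_template signature declared in this release's llama.h (which still takes a model pointer); the chatml fallback is a caller policy, not something the library enforces:

#include <algorithm>
#include <string>
#include <vector>
#include "llama.h"

// Format a chat with the model's own template, falling back to chatml when the
// template is not recognized (llama_chat_apply_template returns a negative value).
std::string format_chat_or_chatml(const llama_model * model,
                                  const std::vector<llama_chat_message> & msgs,
                                  bool add_ass) {
    std::vector<char> buf(8192); // assumed large enough for this sketch
    // nullptr -> use the chat template stored in the model's GGUF metadata
    int32_t n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                          add_ass, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        // unknown/unsupported template: retry with the plain chatml renderer
        n = llama_chat_apply_template(model, "chatml", msgs.data(), msgs.size(),
                                      add_ass, buf.data(), (int32_t) buf.size());
    }
    if (n < 0) {
        return std::string();
    }
    return std::string(buf.data(), std::min<size_t>((size_t) n, buf.size()));
}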
@@ -22183,6 +22678,15 @@ int32_t llama_chat_apply_template(
  return res;
  }
 
+ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
+ auto it = LLM_CHAT_TEMPLATES.begin();
+ for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
+ output[i] = it->first.c_str();
+ std::advance(it, 1);
+ }
+ return (int32_t) LLM_CHAT_TEMPLATES.size();
+ }
+
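A hedged usage sketch for the new llama_chat_builtin_templates() added above: a first call with len == 0 only returns the number of built-in template names, and a second call fills a caller-provided array (assumes the matching declaration added to llama.h in this release):

#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    // First call: with len == 0 the output pointer is never dereferenced,
    // so nullptr is fine; the return value is the total number of templates.
    int32_t count = llama_chat_builtin_templates(nullptr, 0);

    // Second call: fill the array with pointers to the built-in template names.
    std::vector<const char *> names((size_t) count);
    llama_chat_builtin_templates(names.data(), names.size());

    for (const char * name : names) {
        std::printf("%s\n", name); // e.g. "chatml", "llama2", "llama3", ...
    }
    return 0;
}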
  //
  // sampling
  //