bigdl-core-cpp 2.6.0b20250320__py3-none-win_amd64.whl → 2.6.0b20250321__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. bigdl/cpp/convert_hf_to_gguf.py +687 -60
  2. bigdl/cpp/convert_hf_to_gguf_update.py +46 -41
  3. bigdl/cpp/convert_lora_to_gguf.py +33 -5
  4. bigdl/cpp/gguf-py/gguf/constants.py +306 -123
  5. bigdl/cpp/gguf-py/gguf/gguf_writer.py +31 -3
  6. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +122 -25
  7. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  8. bigdl/cpp/gguf-py/gguf/vocab.py +1 -1
  9. bigdl/cpp/libs/common.lib +0 -0
  10. bigdl/cpp/libs/ggml-base.dll +0 -0
  11. bigdl/cpp/libs/ggml-cpu.dll +0 -0
  12. bigdl/cpp/libs/ggml-sycl.dll +0 -0
  13. bigdl/cpp/libs/ggml.dll +0 -0
  14. bigdl/cpp/libs/llama-batched.exe +0 -0
  15. bigdl/cpp/libs/llama-bench.exe +0 -0
  16. bigdl/cpp/libs/llama-cli.exe +0 -0
  17. bigdl/cpp/libs/llama-embedding.exe +0 -0
  18. bigdl/cpp/libs/llama-gemma3-cli.exe +0 -0
  19. bigdl/cpp/libs/llama-gguf.exe +0 -0
  20. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  21. bigdl/cpp/libs/llama-lookup.exe +0 -0
  22. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  23. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  24. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  25. bigdl/cpp/libs/llama-quantize.exe +0 -0
  26. bigdl/cpp/libs/llama-server.exe +0 -0
  27. bigdl/cpp/libs/llama-simple.exe +0 -0
  28. bigdl/cpp/libs/llama-speculative.exe +0 -0
  29. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  30. bigdl/cpp/libs/llama.dll +0 -0
  31. bigdl/cpp/libs/llava_shared.dll +0 -0
  32. bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
  33. bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
  34. bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
  35. bigdl/cpp/libs/ollama-lib.exe +0 -0
  36. bigdl/cpp/libs/ollama.exe +0 -0
  37. bigdl/cpp/libs/ollama_ggml.dll +0 -0
  38. bigdl/cpp/libs/ollama_llama.dll +0 -0
  39. bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
  40. {bigdl_core_cpp-2.6.0b20250320.dist-info → bigdl_core_cpp-2.6.0b20250321.dist-info}/METADATA +1 -1
  41. bigdl_core_cpp-2.6.0b20250321.dist-info/RECORD +57 -0
  42. {bigdl_core_cpp-2.6.0b20250320.dist-info → bigdl_core_cpp-2.6.0b20250321.dist-info}/WHEEL +1 -1
  43. bigdl_core_cpp-2.6.0b20250320.dist-info/RECORD +0 -57
  44. {bigdl_core_cpp-2.6.0b20250320.data → bigdl_core_cpp-2.6.0b20250321.data}/scripts/init-llama-cpp.bat +0 -0
  45. {bigdl_core_cpp-2.6.0b20250320.data → bigdl_core_cpp-2.6.0b20250321.data}/scripts/init-llama-cpp.ps1 +0 -0
  46. {bigdl_core_cpp-2.6.0b20250320.data → bigdl_core_cpp-2.6.0b20250321.data}/scripts/init-ollama.bat +0 -0
  47. {bigdl_core_cpp-2.6.0b20250320.dist-info → bigdl_core_cpp-2.6.0b20250321.dist-info}/top_level.txt +0 -0
@@ -221,17 +221,17 @@ class Model:
  self.gguf_writer.add_context_length(n_ctx)
  logger.info(f"gguf: context length = {n_ctx}")

- n_embd = self.find_hparam(["hidden_size", "n_embd"])
- self.gguf_writer.add_embedding_length(n_embd)
- logger.info(f"gguf: embedding length = {n_embd}")
+ if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+ self.gguf_writer.add_embedding_length(n_embd)
+ logger.info(f"gguf: embedding length = {n_embd}")

  if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
  self.gguf_writer.add_feed_forward_length(n_ff)
  logger.info(f"gguf: feed forward length = {n_ff}")

- n_head = self.find_hparam(["num_attention_heads", "n_head"])
- self.gguf_writer.add_head_count(n_head)
- logger.info(f"gguf: head count = {n_head}")
+ if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+ self.gguf_writer.add_head_count(n_head)
+ logger.info(f"gguf: head count = {n_head}")

  if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
  self.gguf_writer.add_head_count_kv(n_head_kv)
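For reference, the optional-hparam pattern introduced in the hunk above can be sketched standalone as follows (illustrative only; find_hparam here is a simplified stand-in for the Model method, not the package's implementation):

def find_hparam(hparams: dict, keys: list[str], optional: bool = False):
    # return the first matching hyperparameter, or None when optional
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")

hparams = {"n_embd": 2048}  # hypothetical config with no attention-head entry
if (n_embd := find_hparam(hparams, ["hidden_size", "n_embd"], optional=True)) is not None:
    print(f"gguf: embedding length = {n_embd}")
if (n_head := find_hparam(hparams, ["num_attention_heads", "n_head"], optional=True)) is not None:
    print(f"gguf: head count = {n_head}")  # skipped for this config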
@@ -296,7 +296,9 @@ class Model:
  break

  for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
- data = data_torch.squeeze().numpy()
+ # TODO: why do we squeeze here?
+ # data = data_torch.squeeze().numpy()
+ data = data_torch.numpy()

  # if data ends up empty, it means data_torch was a scalar tensor -> restore
  if len(data.shape) == 0:
@@ -324,6 +326,9 @@ class Model:
  gguf.MODEL_TENSOR.TIME_MIX_W2,
  gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
  gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+ gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+ gguf.MODEL_TENSOR.POSNET_NORM1,
+ gguf.MODEL_TENSOR.POSNET_NORM2,
  )
  )
  or not new_name.endswith(".weight")
@@ -473,6 +478,11 @@ class Model:
  return modelcls
  return func

+ @classmethod
+ def print_registered_models(cls):
+ for name in sorted(cls._model_classes.keys()):
+ logger.error(f"- {name}")
+
  @classmethod
  def from_model_architecture(cls, arch: str) -> type[Model]:
  try:
@@ -525,9 +535,19 @@ class Model:
  else:
  token: str = reverse_vocab[i]
  if token in added_vocab:
+ # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+ # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+ if not tokenizer.added_tokens_decoder[i].normalized:
+ previous_token = token
+ token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+ if previous_token != token:
+ logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
  if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
  toktypes.append(gguf.TokenType.CONTROL)
  else:
+ # NOTE: this was added for Gemma.
+ # Encoding and decoding the tokens above isn't sufficient for this case.
  token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
  toktypes.append(gguf.TokenType.USER_DEFINED)
  else:
@@ -538,7 +558,7 @@ class Model:

  # NOTE: this function is generated by convert_hf_to_gguf_update.py
  # do not modify it manually!
- # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+ # ref: https://github.com/ggml-org/llama.cpp/pull/6920
  # Marker: Start get_vocab_base_pre
  def get_vocab_base_pre(self, tokenizer) -> str:
  # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -571,6 +591,9 @@ class Model:
  if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
  # ref: https://huggingface.co/tiiuae/falcon-7b
  res = "falcon"
+ if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+ # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+ res = "falcon3"
  if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
  # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
  res = "bert-bge"
@@ -625,7 +648,7 @@ class Model:
  if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
  # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
  res = "jina-v2-code"
- if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+ if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
  # ref: https://huggingface.co/THUDM/glm-4-9b-chat
  res = "chatglm-bpe"
  if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -664,6 +687,18 @@ class Model:
  if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
  # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
  res = "roberta-bpe"
+ if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+ # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+ res = "gigachat"
+ if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+ # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+ res = "megrez"
+ if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+ # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+ res = "deepseek-v3"
+ if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+ # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+ res = "deepseek-r1-qwen"

  if res is None:
  logger.warning("\n")
@@ -673,7 +708,7 @@ class Model:
  logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
  logger.warning("** - the pre-tokenization config has changed upstream")
  logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
- logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+ logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
  logger.warning("**")
  logger.warning(f"** chkhsh: {chkhsh}")
  logger.warning("**************************************************************************************")
@@ -686,6 +721,9 @@ class Model:
  return res
  # Marker: End get_vocab_base_pre

+ def _set_vocab_none(self) -> None:
+ self.gguf_writer.add_tokenizer_model("none")
+
  def _set_vocab_gpt2(self) -> None:
  tokens, toktypes, tokpre = self.get_vocab_base()
  self.gguf_writer.add_tokenizer_model("gpt2")
@@ -1669,6 +1707,178 @@ class LlamaModel(Model):
  raise ValueError(f"Unprocessed experts: {experts}")


+ @Model.register("DeciLMForCausalLM")
+ class DeciModel(Model):
+ model_arch = gguf.MODEL_ARCH.DECI
+
+ @staticmethod
+ def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+ # DeciLM-specific code
+ intermediate_size = int(2 * ffn_mult * n_embd / 3)
+ return DeciModel._find_multiple(intermediate_size, 256)
+
+ @staticmethod
+ def _find_multiple(n: int, k: int) -> int:
+ # DeciLM-specific code
+ if n % k == 0:
+ return n
+ return n + k - (n % k)
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+ _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+ assert self.block_count == len(_block_configs)
+ self._num_kv_heads = list()
+ self._num_heads = list()
+ _ffn_multipliers = list()
+ # ***linear attention layer***
+ # if n_heads_in_group is None and replace_with_linear is True
+ # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+ # ***attention-free layer***
+ # if n_heads_in_group is None and replace_with_linear is False
+ # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+ # ***normal attention-layer***
+ # if n_heads_in_group is not None, then
+ # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+ # _num_heads[il] is num_attention_head
+ for il in range(len(_block_configs)):
+ if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+ if _block_configs[il]["attention"]["replace_with_linear"] is True:
+ self._num_kv_heads.append(0)
+ self._num_heads.append(self.hparams["num_attention_heads"])
+ else:
+ self._num_kv_heads.append(0)
+ self._num_heads.append(0)
+ else:
+ self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+ self._num_heads.append(self.hparams["num_attention_heads"])
+ _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+ assert self.block_count == len(self._num_kv_heads)
+ assert self.block_count == len(self._num_heads)
+ assert self.block_count == len(_ffn_multipliers)
+ assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+ assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+ assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+ self._ffn_dims: list[int] = [
+ DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+ for multiplier in _ffn_multipliers
+ ]
+
+ def set_vocab(self):
+ # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+ # eos_token from '|eot_id|' to '|end_of_text|'
+ if self.hparams.get("vocab_size", 128256) == 128256:
+ tokens, toktypes, tokpre = self.get_vocab_base()
+ self.gguf_writer.add_tokenizer_model("gpt2")
+ self.gguf_writer.add_tokenizer_pre(tokpre)
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_types(toktypes)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+ special_vocab.add_to_gguf(self.gguf_writer)
+ else:
+ # DeciLM-7B
+ self._set_vocab_llama_hf()
+
+ def set_gguf_parameters(self):
+ if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+ assert self.block_count == len(self._num_kv_heads)
+ assert self.block_count == len(self._num_heads)
+ assert self.block_count == len(self._ffn_dims)
+ if (rope_theta := self.hparams.get("rope_theta")) is not None:
+ self.gguf_writer.add_rope_freq_base(rope_theta)
+ self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+ self.gguf_writer.add_head_count(self._num_heads)
+ self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+ self.gguf_writer.add_block_count(self.block_count)
+ self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+ self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+ self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+ self.gguf_writer.add_file_type(self.ftype)
+ else: # DeciLM-7B
+ super().set_gguf_parameters()
+ if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
+ self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+ assert self.block_count == len(self._num_kv_heads)
+ self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+ hparams = self.hparams
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+ if "head_dim" in hparams:
+ rope_dim = hparams["head_dim"]
+ else:
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "linear":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+ @staticmethod
+ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+ if n_head_kv is not None and n_head != n_head_kv:
+ n_head = n_head_kv
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+ .swapaxes(1, 2)
+ .reshape(weights.shape))
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ n_head = self.hparams["num_attention_heads"]
+ if bid is not None:
+ if "num_key_value_heads_per_layer" in self.hparams:
+ n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+ elif "block_configs" in self.hparams:
+ n_kv_head = self._num_kv_heads[bid]
+ n_head = self._num_heads[bid]
+ else:
+ n_kv_head = self.hparams.get("num_key_value_heads")
+ else:
+ n_kv_head = self.hparams.get("num_key_value_heads")
+
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
+ data_torch = DeciModel.permute(data_torch, n_head, n_head)
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
+ data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+ if rope_scaling.get("rope_type", '').lower() == "llama3":
+ base = self.hparams.get("rope_theta", 10000.0)
+ dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+ freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+ factor = rope_scaling.get("factor", 8.0)
+ low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+ high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+ old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+ low_freq_wavelen = old_context_len / low_freq_factor
+ high_freq_wavelen = old_context_len / high_freq_factor
+ assert low_freq_wavelen != high_freq_wavelen
+
+ rope_factors = []
+ for freq in freqs:
+ wavelen = 2 * math.pi / freq
+ if wavelen < high_freq_wavelen:
+ rope_factors.append(1)
+ elif wavelen > low_freq_wavelen:
+ rope_factors.append(factor)
+ else:
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+ rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+ def prepare_tensors(self):
+ super().prepare_tensors()
+
+
  @Model.register("BitnetForCausalLM")
  class BitnetModel(Model):
  model_arch = gguf.MODEL_ARCH.BITNET
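As a sanity check on the DeciLM helper added in the hunk above, the intermediate size follows int(2 * ffn_mult * n_embd / 3) rounded up to the next multiple of 256; a small standalone sketch with made-up values (not taken from any released checkpoint):

def ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
    # same arithmetic as DeciModel._ffn_mult_to_intermediate_size / _find_multiple
    intermediate_size = int(2 * ffn_mult * n_embd / 3)
    if intermediate_size % 256 == 0:
        return intermediate_size
    return intermediate_size + 256 - (intermediate_size % 256)

# e.g. ffn_mult=1.3, n_embd=8192: int(2 * 1.3 * 8192 / 3) = 7099, rounded up to 7168
print(ffn_mult_to_intermediate_size(1.3, 8192))  # 7168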
@@ -2024,6 +2234,44 @@ class Qwen2VLModel(Model):
  yield name, data


+ @Model.register("WavTokenizerDec")
+ class WavTokenizerDecModel(Model):
+ model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ if \
+ name.endswith("codebook.cluster_size") or \
+ name.endswith("codebook.embed_avg") or \
+ name.endswith("codebook.inited"):
+ logger.debug(f"Skipping {name!r}")
+ return []
+
+ logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def set_vocab(self):
+ self._set_vocab_none()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_vocab_size (self.hparams["vocab_size"])
+ self.gguf_writer.add_features_length (self.hparams["n_embd_features"])
+ self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
+ self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"])
+ self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"])
+
+ self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+ self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"])
+
+ self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+ self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"])
+
+ self.gguf_writer.add_causal_attention(False)
+
+
  @Model.register("Qwen2MoeForCausalLM")
  class Qwen2MoeModel(Model):
  model_arch = gguf.MODEL_ARCH.QWEN2MOE
@@ -2152,6 +2400,15 @@ class Phi3MiniModel(Model):
  model_arch = gguf.MODEL_ARCH.PHI3

  def set_vocab(self):
+ # Phi-4 model uses GPT2Tokenizer
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ tokenizer_class = tokenizer_config_json['tokenizer_class']
+ if tokenizer_class == 'GPT2Tokenizer':
+ return self._set_vocab_gpt2()
+
  from sentencepiece import SentencePieceProcessor

  tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2268,7 +2525,11 @@ class Phi3MiniModel(Model):
  self.gguf_writer.add_rope_dimension_count(rope_dims)
  self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
  self.gguf_writer.add_file_type(self.ftype)
- self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
+ sliding_window = self.hparams.get("sliding_window")
+ # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+ if sliding_window is None:
+ sliding_window = 0
+ self.gguf_writer.add_sliding_window(sliding_window)

  def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
  n_embd = self.find_hparam(["hidden_size", "n_embd"])
@@ -2310,6 +2571,63 @@ class Phi3MiniModel(Model):
  yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))


+ @Model.register("PhiMoEForCausalLM")
+ class PhiMoeModel(Phi3MiniModel):
+ model_arch = gguf.MODEL_ARCH.PHIMOE
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+ self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # process the experts separately
+ if name.find("block_sparse_moe.experts") != -1:
+ n_experts = self.hparams["num_local_experts"]
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for w_name in ["w1", "w2", "w3"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def prepare_tensors(self):
+ super().prepare_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
  @Model.register("PlamoForCausalLM")
  class PlamoModel(Model):
  model_arch = gguf.MODEL_ARCH.PLAMO
@@ -2517,7 +2835,7 @@ class InternLM2Model(Model):
  if chat_eos_token_id is not None:
  # For the chat model, we replace the eos with '<|im_end|>'.
  # TODO: this is a hack, should be fixed
- # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
+ # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
  special_vocab.special_token_ids["eos"] = chat_eos_token_id
  logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
  " in chat mode so that the conversation can end normally.")
@@ -2567,7 +2885,67 @@ class InternLM2Model(Model):
  return [(self.map_tensor_name(name), data_torch)]


- @Model.register("BertModel", "CamembertModel", "RobertaModel")
+ @Model.register("InternLM3ForCausalLM")
+ class InternLM3Model(Model):
+ model_arch = gguf.MODEL_ARCH.LLAMA
+
+ def set_vocab(self):
+ tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+ self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ if "add_prefix_space" in tokenizer_config_json:
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+ if "added_tokens_decoder" in tokenizer_config_json:
+ for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
+ if token_data.get("special"):
+ token_id = int(token_id)
+ token = token_data["content"]
+ special_vocab._set_special_token(token, token_id)
+ # update eos token
+ if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
+ special_vocab.special_token_ids["eos"] = token_id
+
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ hparams = self.hparams
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+ if "head_dim" in hparams:
+ rope_dim = hparams["head_dim"]
+ else:
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ n_head = self.hparams["num_attention_heads"]
+ n_kv_head = self.hparams.get("num_key_value_heads")
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+ return [(self.map_tensor_name(name), data_torch)]
+
+
+ @Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
  class BertModel(Model):
  model_arch = gguf.MODEL_ARCH.BERT

@@ -2633,13 +3011,73 @@ class BertModel(Model):
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  del bid # unused

+ if name.startswith("bert."):
+ name = name[5:]
+
+ if name.endswith(".gamma"):
+ name = name[:-6] + ".weight"
+
+ if name.endswith(".beta"):
+ name = name[:-5] + ".bias"
+
  # we are only using BERT for embeddings so we don't need the pooling layer
  if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
  return [] # we don't need these

+ if name.startswith("cls.predictions"):
+ return []
+
+ if name.startswith("cls.seq_relationship"):
+ return []
+
  return [(self.map_tensor_name(name), data_torch)]


+ @Model.register("RobertaModel")
+ class RobertaModel(BertModel):
+ model_arch = gguf.MODEL_ARCH.BERT
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # we need the pad_token_id to know how to chop down position_embd matrix
+ if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+ self._position_offset = 1 + pad_token_id
+ if "max_position_embeddings" in self.hparams:
+ self.hparams["max_position_embeddings"] -= self._position_offset
+ else:
+ self._position_offset = None
+
+ def set_vocab(self):
+ """Support BPE tokenizers for roberta models"""
+ bpe_tok_path = self.dir_model / "tokenizer.json"
+ if bpe_tok_path.exists():
+ self._set_vocab_gpt2()
+ self.gguf_writer.add_add_bos_token(True)
+ self.gguf_writer.add_add_eos_token(True)
+
+ # we need this to validate the size of the token_type embeddings
+ # though currently we are passing all zeros to the token_type embeddings
+ # "Sequence A" or "Sequence B"
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+ else:
+ return super().set_vocab()
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # if name starts with "roberta.", remove the prefix
+ # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+ if name.startswith("roberta."):
+ name = name[8:]
+
+ # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+ if name == "embeddings.position_embeddings.weight":
+ if self._position_offset is not None:
+ data_torch = data_torch[self._position_offset:,:]
+
+ return super().modify_tensors(data_torch, name, bid)
+
+
  @Model.register("NomicBertModel")
  class NomicBertModel(BertModel):
  model_arch = gguf.MODEL_ARCH.NOMIC_BERT
@@ -2947,6 +3385,8 @@ class Rwkv6Model(Model):
  # required by llama.cpp, unused
  self.gguf_writer.add_head_count(0)

+ lerp_weights: dict[int, dict[str, Tensor]] = {}
+
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  new_name = self.map_tensor_name(name)

@@ -2959,14 +3399,87 @@ class Rwkv6Model(Model):
  if new_name.endswith("time_mix_w2.weight"):
  data_torch = data_torch.permute(0, 2, 1)

- rescale_every_n_layers = self.hparams["rescale_every"]
- if rescale_every_n_layers > 0:
- if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
- data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+ if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
+ data_torch = data_torch.squeeze()
+
+ try:
+ rescale_every_n_layers = self.hparams["rescale_every"]
+ if rescale_every_n_layers > 0:
+ if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+ data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+ except KeyError:
+ pass
+
+ # concat time_mix_lerp weights to reduce some cpu overhead
+ # also reduces the number of tensors in the model
+ if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
+ try:
+ self.lerp_weights[bid][new_name] = data_torch
+ except KeyError:
+ self.lerp_weights[bid] = {new_name: data_torch}
+ if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
+ new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+ data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
+ yield (new_name, data)
+ return

  yield (new_name, data_torch)


+ @Model.register("RWKV6Qwen2ForCausalLM")
+ class RWKV6Qwen2Model(Rwkv6Model):
+ model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
+
+ def set_vocab(self):
+ try:
+ self._set_vocab_sentencepiece()
+ except FileNotFoundError:
+ self._set_vocab_gpt2()
+
+ def set_gguf_parameters(self):
+ block_count = self.hparams["num_hidden_layers"]
+ num_attention_heads = self.hparams["num_attention_heads"]
+ num_key_value_heads = self.hparams["num_key_value_heads"]
+ hidden_size = self.hparams["hidden_size"]
+ head_size = hidden_size // num_attention_heads
+ rms_norm_eps = self.hparams["rms_norm_eps"]
+ intermediate_size = self.hparams["intermediate_size"]
+ time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
+ time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+
+ # RWKV isn't context limited
+ self.gguf_writer.add_context_length(1048576)
+ self.gguf_writer.add_embedding_length(hidden_size)
+ self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_wkv_head_size(head_size)
+ self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+ self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
+ self.gguf_writer.add_file_type(self.ftype)
+
+ # special parameters for time_mixing in RWKV6QWEN2
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+ self.gguf_writer.add_token_shift_count(1)
+ # RWKV6QWEN2 use grouped key/value like GQA
+ self.gguf_writer.add_head_count_kv(num_key_value_heads)
+
+ # required by llama.cpp, unused
+ self.gguf_writer.add_head_count(0)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ for new_name, data in super().modify_tensors(data_torch, name, bid):
+ if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
+ data = data.view(5, -1, data.shape[-1])
+ # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
+ # permute them here to avoid code changes
+ data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
+ if "w2" in new_name:
+ data = data.view(5, -1, data.shape[-1])
+ yield (new_name, data)
+ continue
+ yield (new_name, data)
+
+
  @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
  class MambaModel(Model):
  model_arch = gguf.MODEL_ARCH.MAMBA
@@ -3061,6 +3574,24 @@ class CommandR2Model(Model):
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)


+ @Model.register("Cohere2ForCausalLM")
+ class Cohere2Model(Model):
+ model_arch = gguf.MODEL_ARCH.COHERE2
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+ rotary_pct = self.hparams["rotary_pct"]
+ hidden_size = self.hparams["hidden_size"]
+ num_attention_heads = self.hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
  @Model.register("OlmoForCausalLM")
  @Model.register("OLMoForCausalLM")
  class OlmoModel(Model):
@@ -3427,7 +3958,99 @@ class ArcticModel(Model):
  raise ValueError(f"Unprocessed experts: {experts}")


+ @Model.register("DeepseekForCausalLM")
+ class DeepseekModel(Model):
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK
+
+ def set_vocab(self):
+ try:
+ self._set_vocab_sentencepiece()
+ except FileNotFoundError:
+ self._set_vocab_gpt2()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ hparams = self.hparams
+ if "head_dim" in hparams:
+ rope_dim = hparams["head_dim"]
+ else:
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+ self.gguf_writer.add_expert_weights_scale(1.0)
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ @staticmethod
+ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+ if n_head_kv is not None and n_head != n_head_kv:
+ n_head = n_head_kv
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+ .swapaxes(1, 2)
+ .reshape(weights.shape))
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ n_head = self.hparams["num_attention_heads"]
+ n_kv_head = self.hparams.get("num_key_value_heads")
+
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
+ data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
+ data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
+
+ # process the experts separately
+ if name.find("mlp.experts") != -1:
+ n_experts = self.hparams["n_routed_experts"]
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def prepare_tensors(self):
+ super().prepare_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
  @Model.register("DeepseekV2ForCausalLM")
+ @Model.register("DeepseekV3ForCausalLM")
  class DeepseekV2Model(Model):
  model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -3449,6 +4072,15 @@ class DeepseekV2Model(Model):
  self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
  self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
  self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+ self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+ if hparams["scoring_func"] == "sigmoid":
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+ elif hparams["scoring_func"] == "softmax":
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+ else:
+ raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
  self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

  if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -3461,6 +4093,16 @@ class DeepseekV2Model(Model):
  _experts: list[dict[str, Tensor]] | None = None

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # rename e_score_correction_bias tensors
+ if name.endswith("e_score_correction_bias"):
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+ # skip Multi-Token Prediction (MTP) layers
+ block_count = self.hparams["num_hidden_layers"]
+ match = re.match(r"model.layers.(\d+)", name)
+ if match and int(match.group(1)) >= block_count:
+ return []
+
  # process the experts separately
  if name.find("mlp.experts") != -1:
  n_experts = self.hparams["n_routed_experts"]
@@ -3871,7 +4513,7 @@ class JaisModel(Model):
  self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


- @Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+ @Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
  class ChatGLMModel(Model):
  model_arch = gguf.MODEL_ARCH.CHATGLM

@@ -3977,47 +4619,15 @@ class ChatGLMModel(Model):

  from transformers import AutoTokenizer
  tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
- vocab_size = hparams["padded_vocab_size"]
+ vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
  assert max(tokenizer.get_vocab().values()) < vocab_size

- tokpre = self.get_vocab_base_pre(tokenizer)
-
- merges = []
- vocab = {}
- mergeable_ranks = tokenizer.mergeable_ranks
- for token, rank in mergeable_ranks.items():
- vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
- if len(token) == 1:
- continue
- merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
- assert len(merged) >= 2 and len(merged) <= 7
- merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
-
- # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
- added_vocab = tokenizer.get_added_vocab()
- reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
-
- for i in range(vocab_size):
- if i not in reverse_vocab:
- tokens.append(f"[PAD{i}]")
- toktypes.append(gguf.TokenType.UNUSED)
- elif reverse_vocab[i] in added_vocab:
- tokens.append(reverse_vocab[i])
- if tokenizer.added_tokens_decoder[i].special:
- toktypes.append(gguf.TokenType.CONTROL)
- else:
- toktypes.append(gguf.TokenType.USER_DEFINED)
- else:
- tokens.append(reverse_vocab[i])
- toktypes.append(gguf.TokenType.NORMAL)
-
+ tokens, toktypes, tokpre = self.get_vocab_base()
  self.gguf_writer.add_tokenizer_model("gpt2")
  self.gguf_writer.add_tokenizer_pre(tokpre)
  self.gguf_writer.add_token_list(tokens)
  self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
- special_vocab.merges = merges
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
  # only add special tokens when they were not already loaded from config.json
  special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
  special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
@@ -4028,16 +4638,20 @@ class ChatGLMModel(Model):
  def set_gguf_parameters(self):
  n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
  n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
- n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+ n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
  self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
  self.gguf_writer.add_embedding_length(n_embed)
- self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
- self.gguf_writer.add_block_count(self.hparams["num_layers"])
+ self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
+ self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
  self.gguf_writer.add_head_count(n_head)
  self.gguf_writer.add_head_count_kv(n_head_kv)
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
  self.gguf_writer.add_file_type(self.ftype)
- self.gguf_writer.add_rope_dimension_count(64)
+ if "attention_dim" in self.hparams:
+ rope_dim = self.hparams["attention_dim"]
+ else:
+ rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
  self.gguf_writer.add_add_bos_token(False)
  rope_freq = 10000
  if "rope_ratio" in self.hparams:
@@ -4047,7 +4661,7 @@ class ChatGLMModel(Model):
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  del bid # unused

- if name.endswith(".rotary_pos_emb.inv_freq"):
+ if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
  return []

  name = name.removeprefix("transformer.")
@@ -4354,6 +4968,7 @@ def parse_args() -> argparse.Namespace:
  parser.add_argument(
  "model", type=Path,
  help="directory containing model file",
+ nargs="?",
  )
  parser.add_argument(
  "--use-temp-file", action="store_true",
@@ -4391,8 +5006,15 @@ def parse_args() -> argparse.Namespace:
  "--metadata", type=Path,
  help="Specify the path for an authorship metadata override file"
  )
+ parser.add_argument(
+ "--print-supported-models", action="store_true",
+ help="Print the supported models"
+ )

- return parser.parse_args()
+ args = parser.parse_args()
+ if not args.print_supported_models and args.model is None:
+ parser.error("the following arguments are required: model")
+ return args


  def split_str_to_n_bytes(split_str: str) -> int:
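A standalone sketch (illustrative, not the package's code) of the argument handling introduced in the hunk above: "model" becomes optional at the argparse level via nargs="?", and parse_args() re-imposes the requirement unless --print-supported-models was given:

import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("model", type=Path, nargs="?", help="directory containing model file")
parser.add_argument("--print-supported-models", action="store_true")

args = parser.parse_args(["--print-supported-models"])  # accepted without a model path
if not args.print_supported_models and args.model is None:
    parser.error("the following arguments are required: model")  # exits when model is omitted otherwise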
@@ -4416,6 +5038,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
  def main() -> None:
  args = parse_args()

+ if args.print_supported_models:
+ logger.error("Supported models:")
+ Model.print_registered_models()
+ sys.exit(0)
+
  if args.verbose:
  logging.basicConfig(level=logging.DEBUG)
  else: