bigdl-core-cpp 2.7.0b20250413__py3-none-manylinux2010_x86_64.whl → 2.7.0b20250414__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. bigdl/cpp/convert_hf_to_gguf.py +697 -60
  2. bigdl/cpp/convert_hf_to_gguf_update.py +46 -41
  3. bigdl/cpp/convert_lora_to_gguf.py +33 -5
  4. bigdl/cpp/gguf-py/gguf/constants.py +344 -123
  5. bigdl/cpp/gguf-py/gguf/gguf_writer.py +31 -3
  6. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +122 -25
  7. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  8. bigdl/cpp/gguf-py/gguf/vocab.py +1 -1
  9. bigdl/cpp/libs/libggml-base.so +0 -0
  10. bigdl/cpp/libs/libggml-cpu-alderlake.so +0 -0
  11. bigdl/cpp/libs/{libggml-cpu.so → libggml-cpu-haswell.so} +0 -0
  12. bigdl/cpp/libs/libggml-cpu-skylakex.so +0 -0
  13. bigdl/cpp/libs/libggml-sycl.so +0 -0
  14. bigdl/cpp/libs/libggml.so +0 -0
  15. bigdl/cpp/libs/libllama.so +0 -0
  16. bigdl/cpp/libs/libllava_shared.so +0 -0
  17. bigdl/cpp/libs/libsample.so +0 -0
  18. bigdl/cpp/libs/ollama-lib +0 -0
  19. {bigdl_core_cpp-2.7.0b20250413.dist-info → bigdl_core_cpp-2.7.0b20250414.dist-info}/METADATA +1 -1
  20. bigdl_core_cpp-2.7.0b20250414.dist-info/RECORD +37 -0
  21. bigdl/cpp/libs/libmllama.so +0 -0
  22. bigdl/cpp/libs/libollama-ggml-base.so +0 -0
  23. bigdl/cpp/libs/libollama-ggml-cpu.so +0 -0
  24. bigdl/cpp/libs/libollama-ggml-sycl.so +0 -0
  25. bigdl/cpp/libs/libollama_ggml.so +0 -0
  26. bigdl/cpp/libs/libollama_llama.so +0 -0
  27. bigdl/cpp/libs/libollama_llava_shared.so +0 -0
  28. bigdl/cpp/libs/llama-batched +0 -0
  29. bigdl/cpp/libs/llama-bench +0 -0
  30. bigdl/cpp/libs/llama-cli +0 -0
  31. bigdl/cpp/libs/llama-embedding +0 -0
  32. bigdl/cpp/libs/llama-gemma3-cli +0 -0
  33. bigdl/cpp/libs/llama-gguf +0 -0
  34. bigdl/cpp/libs/llama-llava-cli +0 -0
  35. bigdl/cpp/libs/llama-lookup +0 -0
  36. bigdl/cpp/libs/llama-ls-sycl-device +0 -0
  37. bigdl/cpp/libs/llama-minicpmv-cli +0 -0
  38. bigdl/cpp/libs/llama-perplexity +0 -0
  39. bigdl/cpp/libs/llama-quantize +0 -0
  40. bigdl/cpp/libs/llama-server +0 -0
  41. bigdl/cpp/libs/llama-simple +0 -0
  42. bigdl/cpp/libs/llama-speculative +0 -0
  43. bigdl/cpp/libs/llama-tokenize +0 -0
  44. bigdl_core_cpp-2.7.0b20250413.dist-info/RECORD +0 -58
  45. {bigdl_core_cpp-2.7.0b20250413.data → bigdl_core_cpp-2.7.0b20250414.data}/scripts/init-llama-cpp +0 -0
  46. {bigdl_core_cpp-2.7.0b20250413.data → bigdl_core_cpp-2.7.0b20250414.data}/scripts/init-ollama +0 -0
  47. {bigdl_core_cpp-2.7.0b20250413.dist-info → bigdl_core_cpp-2.7.0b20250414.dist-info}/WHEEL +0 -0
  48. {bigdl_core_cpp-2.7.0b20250413.dist-info → bigdl_core_cpp-2.7.0b20250414.dist-info}/top_level.txt +0 -0
@@ -221,17 +221,17 @@ class Model:
  self.gguf_writer.add_context_length(n_ctx)
  logger.info(f"gguf: context length = {n_ctx}")

- n_embd = self.find_hparam(["hidden_size", "n_embd"])
- self.gguf_writer.add_embedding_length(n_embd)
- logger.info(f"gguf: embedding length = {n_embd}")
+ if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+ self.gguf_writer.add_embedding_length(n_embd)
+ logger.info(f"gguf: embedding length = {n_embd}")

  if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
  self.gguf_writer.add_feed_forward_length(n_ff)
  logger.info(f"gguf: feed forward length = {n_ff}")

- n_head = self.find_hparam(["num_attention_heads", "n_head"])
- self.gguf_writer.add_head_count(n_head)
- logger.info(f"gguf: head count = {n_head}")
+ if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+ self.gguf_writer.add_head_count(n_head)
+ logger.info(f"gguf: head count = {n_head}")

  if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
  self.gguf_writer.add_head_count_kv(n_head_kv)
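Editor's note: this hunk relaxes Model.set_gguf_parameters so that embedding length and head count are written only when the corresponding hyperparameter exists, which matters for the non-transformer architectures added later in this diff (e.g. the WavTokenizer decoder). A minimal sketch of the pattern, with a stand-in find_hparam helper (the real method lives on Model):

    # sketch only: find_hparam here is a stand-in for Model.find_hparam
    def find_hparam(hparams: dict, keys: list[str], optional: bool = False):
        for key in keys:
            if key in hparams:
                return hparams[key]
        if optional:
            return None
        raise KeyError(f"none of {keys} found")

    hparams = {"n_ctx": 4096}  # note: no hidden_size / num_attention_heads
    if (n_embd := find_hparam(hparams, ["hidden_size", "n_embd"], optional=True)) is not None:
        print("embedding length =", n_embd)  # skipped for this model instead of raising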
@@ -296,7 +296,9 @@ class Model:
  break

  for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
- data = data_torch.squeeze().numpy()
+ # TODO: why do we squeeze here?
+ # data = data_torch.squeeze().numpy()
+ data = data_torch.numpy()

  # if data ends up empty, it means data_torch was a scalar tensor -> restore
  if len(data.shape) == 0:
@@ -324,6 +326,9 @@ class Model:
  gguf.MODEL_TENSOR.TIME_MIX_W2,
  gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
  gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+ gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+ gguf.MODEL_TENSOR.POSNET_NORM1,
+ gguf.MODEL_TENSOR.POSNET_NORM2,
  )
  )
  or not new_name.endswith(".weight")
@@ -473,6 +478,11 @@ class Model:
  return modelcls
  return func

+ @classmethod
+ def print_registered_models(cls):
+ for name in sorted(cls._model_classes.keys()):
+ logger.error(f"- {name}")
+
  @classmethod
  def from_model_architecture(cls, arch: str) -> type[Model]:
  try:
@@ -525,9 +535,19 @@ class Model:
  else:
  token: str = reverse_vocab[i]
  if token in added_vocab:
+ # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+ # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+ if not tokenizer.added_tokens_decoder[i].normalized:
+ previous_token = token
+ token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+ if previous_token != token:
+ logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
  if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
  toktypes.append(gguf.TokenType.CONTROL)
  else:
+ # NOTE: this was added for Gemma.
+ # Encoding and decoding the tokens above isn't sufficient for this case.
  token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
  toktypes.append(gguf.TokenType.USER_DEFINED)
@@ -538,7 +558,7 @@ class Model:

  # NOTE: this function is generated by convert_hf_to_gguf_update.py
  # do not modify it manually!
- # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+ # ref: https://github.com/ggml-org/llama.cpp/pull/6920
  # Marker: Start get_vocab_base_pre
  def get_vocab_base_pre(self, tokenizer) -> str:
  # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -571,6 +591,9 @@ class Model:
  if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
  # ref: https://huggingface.co/tiiuae/falcon-7b
  res = "falcon"
+ if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+ # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+ res = "falcon3"
  if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
  # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
  res = "bert-bge"
@@ -625,7 +648,7 @@ class Model:
  if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
  # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
  res = "jina-v2-code"
- if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+ if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
  # ref: https://huggingface.co/THUDM/glm-4-9b-chat
  res = "chatglm-bpe"
  if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -664,6 +687,18 @@ class Model:
  if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
  # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
  res = "roberta-bpe"
+ if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+ # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+ res = "gigachat"
+ if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+ # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+ res = "megrez"
+ if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+ # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+ res = "deepseek-v3"
+ if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+ # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+ res = "deepseek-r1-qwen"

  if res is None:
  logger.warning("\n")
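Editor's note: per the generated comment above, get_vocab_base_pre fingerprints the pre-tokenizer by encoding a fixed probe string and hashing the resulting tokens, then matches the digest against the chkhsh table; the new entries register Falcon3, GigaChat, Megrez, DeepSeek-V3 and the DeepSeek-R1 Qwen distill. One way such a digest can be produced (illustrative sketch; the canonical probe string and hashing live in convert_hf_to_gguf_update.py):

    from hashlib import sha256

    def tokenizer_fingerprint(tokenizer, probe_text: str) -> str:
        # hash the token ids produced for a fixed probe string; tokenizers that share
        # pre-tokenization rules produce the same digest
        ids = tokenizer.encode(probe_text)
        return sha256(str(ids).encode()).hexdigest()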
@@ -673,7 +708,7 @@ class Model:
  logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
  logger.warning("** - the pre-tokenization config has changed upstream")
  logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
- logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+ logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
  logger.warning("**")
  logger.warning(f"** chkhsh: {chkhsh}")
  logger.warning("**************************************************************************************")
@@ -686,6 +721,9 @@ class Model:
  return res
  # Marker: End get_vocab_base_pre

+ def _set_vocab_none(self) -> None:
+ self.gguf_writer.add_tokenizer_model("none")
+
  def _set_vocab_gpt2(self) -> None:
  tokens, toktypes, tokpre = self.get_vocab_base()
  self.gguf_writer.add_tokenizer_model("gpt2")
@@ -1669,6 +1707,178 @@ class LlamaModel(Model):
1669
1707
  raise ValueError(f"Unprocessed experts: {experts}")
1670
1708
 
1671
1709
 
1710
+ @Model.register("DeciLMForCausalLM")
1711
+ class DeciModel(Model):
1712
+ model_arch = gguf.MODEL_ARCH.DECI
1713
+
1714
+ @staticmethod
1715
+ def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
1716
+ # DeciLM-specific code
1717
+ intermediate_size = int(2 * ffn_mult * n_embd / 3)
1718
+ return DeciModel._find_multiple(intermediate_size, 256)
1719
+
1720
+ @staticmethod
1721
+ def _find_multiple(n: int, k: int) -> int:
1722
+ # DeciLM-specific code
1723
+ if n % k == 0:
1724
+ return n
1725
+ return n + k - (n % k)
1726
+
1727
+ def __init__(self, *args, **kwargs):
1728
+ super().__init__(*args, **kwargs)
1729
+
1730
+ if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
1731
+ _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
1732
+ assert self.block_count == len(_block_configs)
1733
+ self._num_kv_heads = list()
1734
+ self._num_heads = list()
1735
+ _ffn_multipliers = list()
1736
+ # ***linear attention layer***
1737
+ # if n_heads_in_group is None and replace_with_linear is True
1738
+ # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
1739
+ # ***attention-free layer***
1740
+ # if n_heads_in_group is None and replace_with_linear is False
1741
+ # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
1742
+ # ***normal attention-layer***
1743
+ # if n_heads_in_group is not None, then
1744
+ # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
1745
+ # _num_heads[il] is num_attention_head
1746
+ for il in range(len(_block_configs)):
1747
+ if _block_configs[il]["attention"]["n_heads_in_group"] is None:
1748
+ if _block_configs[il]["attention"]["replace_with_linear"] is True:
1749
+ self._num_kv_heads.append(0)
1750
+ self._num_heads.append(self.hparams["num_attention_heads"])
1751
+ else:
1752
+ self._num_kv_heads.append(0)
1753
+ self._num_heads.append(0)
1754
+ else:
1755
+ self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
1756
+ self._num_heads.append(self.hparams["num_attention_heads"])
1757
+ _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
1758
+ assert self.block_count == len(self._num_kv_heads)
1759
+ assert self.block_count == len(self._num_heads)
1760
+ assert self.block_count == len(_ffn_multipliers)
1761
+ assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
1762
+ assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
1763
+ assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
1764
+ self._ffn_dims: list[int] = [
1765
+ DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
1766
+ for multiplier in _ffn_multipliers
1767
+ ]
1768
+
1769
+ def set_vocab(self):
1770
+ # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
1771
+ # eos_token from '|eot_id|' to '|end_of_text|'
1772
+ if self.hparams.get("vocab_size", 128256) == 128256:
1773
+ tokens, toktypes, tokpre = self.get_vocab_base()
1774
+ self.gguf_writer.add_tokenizer_model("gpt2")
1775
+ self.gguf_writer.add_tokenizer_pre(tokpre)
1776
+ self.gguf_writer.add_token_list(tokens)
1777
+ self.gguf_writer.add_token_types(toktypes)
1778
+
1779
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
1780
+ special_vocab.add_to_gguf(self.gguf_writer)
1781
+ else:
1782
+ # DeciLM-7B
1783
+ self._set_vocab_llama_hf()
1784
+
1785
+ def set_gguf_parameters(self):
1786
+ if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
1787
+ assert self.block_count == len(self._num_kv_heads)
1788
+ assert self.block_count == len(self._num_heads)
1789
+ assert self.block_count == len(self._ffn_dims)
1790
+ if (rope_theta := self.hparams.get("rope_theta")) is not None:
1791
+ self.gguf_writer.add_rope_freq_base(rope_theta)
1792
+ self.gguf_writer.add_head_count_kv(self._num_kv_heads)
1793
+ self.gguf_writer.add_head_count(self._num_heads)
1794
+ self.gguf_writer.add_feed_forward_length(self._ffn_dims)
1795
+ self.gguf_writer.add_block_count(self.block_count)
1796
+ self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1797
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
1798
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
1799
+ self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
1800
+ self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
1801
+ self.gguf_writer.add_file_type(self.ftype)
1802
+ else: # DeciLM-7B
1803
+ super().set_gguf_parameters()
1804
+ if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
1805
+ self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
1806
+ assert self.block_count == len(self._num_kv_heads)
1807
+ self.gguf_writer.add_head_count_kv(self._num_kv_heads)
1808
+ hparams = self.hparams
1809
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
1810
+
1811
+ if "head_dim" in hparams:
1812
+ rope_dim = hparams["head_dim"]
1813
+ else:
1814
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
1815
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
1816
+
1817
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
1818
+ if self.hparams["rope_scaling"].get("type") == "linear":
1819
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1820
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
1821
+
1822
+ @staticmethod
1823
+ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
1824
+ if n_head_kv is not None and n_head != n_head_kv:
1825
+ n_head = n_head_kv
1826
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
1827
+ .swapaxes(1, 2)
1828
+ .reshape(weights.shape))
1829
+
1830
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
1831
+ n_head = self.hparams["num_attention_heads"]
1832
+ if bid is not None:
1833
+ if "num_key_value_heads_per_layer" in self.hparams:
1834
+ n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
1835
+ elif "block_configs" in self.hparams:
1836
+ n_kv_head = self._num_kv_heads[bid]
1837
+ n_head = self._num_heads[bid]
1838
+ else:
1839
+ n_kv_head = self.hparams.get("num_key_value_heads")
1840
+ else:
1841
+ n_kv_head = self.hparams.get("num_key_value_heads")
1842
+
1843
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
1844
+ data_torch = DeciModel.permute(data_torch, n_head, n_head)
1845
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
1846
+ data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
1847
+ return [(self.map_tensor_name(name), data_torch)]
1848
+
1849
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
1850
+ if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
1851
+ if rope_scaling.get("rope_type", '').lower() == "llama3":
1852
+ base = self.hparams.get("rope_theta", 10000.0)
1853
+ dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
1854
+ freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
1855
+
1856
+ factor = rope_scaling.get("factor", 8.0)
1857
+ low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
1858
+ high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
1859
+ old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
1860
+
1861
+ low_freq_wavelen = old_context_len / low_freq_factor
1862
+ high_freq_wavelen = old_context_len / high_freq_factor
1863
+ assert low_freq_wavelen != high_freq_wavelen
1864
+
1865
+ rope_factors = []
1866
+ for freq in freqs:
1867
+ wavelen = 2 * math.pi / freq
1868
+ if wavelen < high_freq_wavelen:
1869
+ rope_factors.append(1)
1870
+ elif wavelen > low_freq_wavelen:
1871
+ rope_factors.append(factor)
1872
+ else:
1873
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
1874
+ rope_factors.append(1 / ((1 - smooth) / factor + smooth))
1875
+
1876
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
1877
+
1878
+ def prepare_tensors(self):
1879
+ super().prepare_tensors()
1880
+
1881
+
1672
1882
  @Model.register("BitnetForCausalLM")
1673
1883
  class BitnetModel(Model):
1674
1884
  model_arch = gguf.MODEL_ARCH.BITNET
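Editor's note: the new DeciModel class mirrors LlamaModel's "llama3" RoPE handling: when rope_scaling.rope_type is "llama3", generate_extra_tensors emits a ROPE_FREQS tensor of per-frequency factors. Pulled out as a standalone sketch (defaults mirror the fallbacks used in the hunk):

    import math
    import torch

    def llama3_rope_factors(dim: int, base: float = 10000.0, factor: float = 8.0,
                            low_freq_factor: float = 1.0, high_freq_factor: float = 4.0,
                            old_context_len: int = 8192) -> torch.Tensor:
        # per-frequency scaling factors, as in DeciModel/LlamaModel.generate_extra_tensors
        freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        low_wl = old_context_len / low_freq_factor
        high_wl = old_context_len / high_freq_factor
        out = []
        for freq in freqs.tolist():
            wavelen = 2 * math.pi / freq
            if wavelen < high_wl:
                out.append(1.0)        # high-frequency dims: left untouched
            elif wavelen > low_wl:
                out.append(factor)     # low-frequency dims: fully scaled
            else:
                smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                out.append(1.0 / ((1 - smooth) / factor + smooth))  # smooth interpolation
        return torch.tensor(out, dtype=torch.float32)

    # e.g. llama3_rope_factors(dim=128) yields one factor per rotary frequency pair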
@@ -2024,6 +2234,44 @@ class Qwen2VLModel(Model):
  yield name, data


+ @Model.register("WavTokenizerDec")
+ class WavTokenizerDecModel(Model):
+ model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ if \
+ name.endswith("codebook.cluster_size") or \
+ name.endswith("codebook.embed_avg") or \
+ name.endswith("codebook.inited"):
+ logger.debug(f"Skipping {name!r}")
+ return []
+
+ logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def set_vocab(self):
+ self._set_vocab_none()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_vocab_size (self.hparams["vocab_size"])
+ self.gguf_writer.add_features_length (self.hparams["n_embd_features"])
+ self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
+ self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"])
+ self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"])
+
+ self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+ self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"])
+
+ self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+ self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"])
+
+ self.gguf_writer.add_causal_attention(False)
+
+
  @Model.register("Qwen2MoeForCausalLM")
  class Qwen2MoeModel(Model):
  model_arch = gguf.MODEL_ARCH.QWEN2MOE
@@ -2087,6 +2335,16 @@ class Qwen2MoeModel(Model):
  raise ValueError(f"Unprocessed experts: {experts}")


+ @Model.register("Qwen3ForCausalLM")
+ class Qwen3Model(Qwen2Model):
+ model_arch = gguf.MODEL_ARCH.QWEN3
+
+
+ @Model.register("Qwen3MoeForCausalLM")
+ class Qwen3MoeModel(Qwen2MoeModel):
+ model_arch = gguf.MODEL_ARCH.QWEN3MOE
+
+
  @Model.register("GPT2LMHeadModel")
  class GPT2Model(Model):
  model_arch = gguf.MODEL_ARCH.GPT2
@@ -2152,6 +2410,15 @@ class Phi3MiniModel(Model):
  model_arch = gguf.MODEL_ARCH.PHI3

  def set_vocab(self):
+ # Phi-4 model uses GPT2Tokenizer
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ tokenizer_class = tokenizer_config_json['tokenizer_class']
+ if tokenizer_class == 'GPT2Tokenizer':
+ return self._set_vocab_gpt2()
+
  from sentencepiece import SentencePieceProcessor

  tokenizer_path = self.dir_model / 'tokenizer.model'
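Editor's note: the Phi-3 converter now branches on the declared tokenizer class, so Phi-4 checkpoints (which ship a GPT2Tokenizer rather than a SentencePiece model) take the BPE vocab path. A small self-contained check along the same lines:

    import json
    from pathlib import Path

    def uses_gpt2_tokenizer(model_dir: Path) -> bool:
        # tokenizer_config.json tells Phi-4 (GPT2Tokenizer) apart from older Phi-3
        # variants (SentencePiece) before any tokenizer is actually loaded
        cfg = model_dir / "tokenizer_config.json"
        if not cfg.is_file():
            return False
        with open(cfg, "r", encoding="utf-8") as f:
            return json.load(f).get("tokenizer_class") == "GPT2Tokenizer"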
@@ -2268,7 +2535,11 @@ class Phi3MiniModel(Model):
  self.gguf_writer.add_rope_dimension_count(rope_dims)
  self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
  self.gguf_writer.add_file_type(self.ftype)
- self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
+ sliding_window = self.hparams.get("sliding_window")
+ # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+ if sliding_window is None:
+ sliding_window = 0
+ self.gguf_writer.add_sliding_window(sliding_window)

  def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
  n_embd = self.find_hparam(["hidden_size", "n_embd"])
@@ -2310,6 +2581,63 @@ class Phi3MiniModel(Model):
2310
2581
  yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
2311
2582
 
2312
2583
 
2584
+ @Model.register("PhiMoEForCausalLM")
2585
+ class PhiMoeModel(Phi3MiniModel):
2586
+ model_arch = gguf.MODEL_ARCH.PHIMOE
2587
+
2588
+ _experts: list[dict[str, Tensor]] | None = None
2589
+
2590
+ def set_gguf_parameters(self):
2591
+ super().set_gguf_parameters()
2592
+ self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
2593
+ self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
2594
+
2595
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2596
+ # process the experts separately
2597
+ if name.find("block_sparse_moe.experts") != -1:
2598
+ n_experts = self.hparams["num_local_experts"]
2599
+ assert bid is not None
2600
+
2601
+ if self._experts is None:
2602
+ self._experts = [{} for _ in range(self.block_count)]
2603
+
2604
+ self._experts[bid][name] = data_torch
2605
+
2606
+ if len(self._experts[bid]) >= n_experts * 3:
2607
+ tensors: list[tuple[str, Tensor]] = []
2608
+
2609
+ # merge the experts into a single 3d tensor
2610
+ for w_name in ["w1", "w2", "w3"]:
2611
+ datas: list[Tensor] = []
2612
+
2613
+ for xid in range(n_experts):
2614
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
2615
+ datas.append(self._experts[bid][ename])
2616
+ del self._experts[bid][ename]
2617
+
2618
+ data_torch = torch.stack(datas, dim=0)
2619
+
2620
+ merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
2621
+
2622
+ new_name = self.map_tensor_name(merged_name)
2623
+
2624
+ tensors.append((new_name, data_torch))
2625
+ return tensors
2626
+ else:
2627
+ return []
2628
+
2629
+ return [(self.map_tensor_name(name), data_torch)]
2630
+
2631
+ def prepare_tensors(self):
2632
+ super().prepare_tensors()
2633
+
2634
+ if self._experts is not None:
2635
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
2636
+ experts = [k for d in self._experts for k in d.keys()]
2637
+ if len(experts) > 0:
2638
+ raise ValueError(f"Unprocessed experts: {experts}")
2639
+
2640
+
2313
2641
  @Model.register("PlamoForCausalLM")
2314
2642
  class PlamoModel(Model):
2315
2643
  model_arch = gguf.MODEL_ARCH.PLAMO
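Editor's note: the new PhiMoeModel reuses the converter's usual MoE trick: expert weights arrive one tensor per expert, are buffered per block, and are emitted only once all of them can be stacked into a single 3-D tensor (one per w1/w2/w3 projection). A toy illustration of the stacking step (shapes are made up):

    import torch

    # 4 experts, each with a (ffn, hidden) weight; the converter buffers them per block
    # and emits one stacked (n_expert, ffn, hidden) tensor per projection
    n_experts, ffn, hidden = 4, 8, 16
    per_expert = {f"model.layers.0.block_sparse_moe.experts.{i}.w1.weight": torch.randn(ffn, hidden)
                  for i in range(n_experts)}
    stacked = torch.stack([per_expert[f"model.layers.0.block_sparse_moe.experts.{i}.w1.weight"]
                           for i in range(n_experts)], dim=0)
    assert stacked.shape == (n_experts, ffn, hidden)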
@@ -2517,7 +2845,7 @@ class InternLM2Model(Model):
  if chat_eos_token_id is not None:
  # For the chat model, we replace the eos with '<|im_end|>'.
  # TODO: this is a hack, should be fixed
- # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
+ # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
  special_vocab.special_token_ids["eos"] = chat_eos_token_id
  logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
  " in chat mode so that the conversation can end normally.")
@@ -2567,7 +2895,67 @@ class InternLM2Model(Model):
2567
2895
  return [(self.map_tensor_name(name), data_torch)]
2568
2896
 
2569
2897
 
2570
- @Model.register("BertModel", "CamembertModel", "RobertaModel")
2898
+ @Model.register("InternLM3ForCausalLM")
2899
+ class InternLM3Model(Model):
2900
+ model_arch = gguf.MODEL_ARCH.LLAMA
2901
+
2902
+ def set_vocab(self):
2903
+ tokens, scores, toktypes = self._create_vocab_sentencepiece()
2904
+
2905
+ self.gguf_writer.add_tokenizer_model("llama")
2906
+ self.gguf_writer.add_tokenizer_pre("default")
2907
+ self.gguf_writer.add_token_list(tokens)
2908
+ self.gguf_writer.add_token_scores(scores)
2909
+ self.gguf_writer.add_token_types(toktypes)
2910
+
2911
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2912
+
2913
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2914
+ if tokenizer_config_file.is_file():
2915
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2916
+ tokenizer_config_json = json.load(f)
2917
+ if "add_prefix_space" in tokenizer_config_json:
2918
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
2919
+
2920
+ if "added_tokens_decoder" in tokenizer_config_json:
2921
+ for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
2922
+ if token_data.get("special"):
2923
+ token_id = int(token_id)
2924
+ token = token_data["content"]
2925
+ special_vocab._set_special_token(token, token_id)
2926
+ # update eos token
2927
+ if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
2928
+ special_vocab.special_token_ids["eos"] = token_id
2929
+
2930
+ special_vocab.add_to_gguf(self.gguf_writer)
2931
+
2932
+ def set_gguf_parameters(self):
2933
+ super().set_gguf_parameters()
2934
+ hparams = self.hparams
2935
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
2936
+
2937
+ if "head_dim" in hparams:
2938
+ rope_dim = hparams["head_dim"]
2939
+ else:
2940
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
2941
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
2942
+
2943
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
2944
+ if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
2945
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2946
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
2947
+
2948
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2949
+ n_head = self.hparams["num_attention_heads"]
2950
+ n_kv_head = self.hparams.get("num_key_value_heads")
2951
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
2952
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
2953
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
2954
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
2955
+ return [(self.map_tensor_name(name), data_torch)]
2956
+
2957
+
2958
+ @Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
2571
2959
  class BertModel(Model):
2572
2960
  model_arch = gguf.MODEL_ARCH.BERT
2573
2961
 
@@ -2633,13 +3021,73 @@ class BertModel(Model):
2633
3021
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2634
3022
  del bid # unused
2635
3023
 
3024
+ if name.startswith("bert."):
3025
+ name = name[5:]
3026
+
3027
+ if name.endswith(".gamma"):
3028
+ name = name[:-6] + ".weight"
3029
+
3030
+ if name.endswith(".beta"):
3031
+ name = name[:-5] + ".bias"
3032
+
2636
3033
  # we are only using BERT for embeddings so we don't need the pooling layer
2637
3034
  if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
2638
3035
  return [] # we don't need these
2639
3036
 
3037
+ if name.startswith("cls.predictions"):
3038
+ return []
3039
+
3040
+ if name.startswith("cls.seq_relationship"):
3041
+ return []
3042
+
2640
3043
  return [(self.map_tensor_name(name), data_torch)]
2641
3044
 
2642
3045
 
3046
+ @Model.register("RobertaModel")
3047
+ class RobertaModel(BertModel):
3048
+ model_arch = gguf.MODEL_ARCH.BERT
3049
+
3050
+ def __init__(self, *args, **kwargs):
3051
+ super().__init__(*args, **kwargs)
3052
+
3053
+ # we need the pad_token_id to know how to chop down position_embd matrix
3054
+ if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
3055
+ self._position_offset = 1 + pad_token_id
3056
+ if "max_position_embeddings" in self.hparams:
3057
+ self.hparams["max_position_embeddings"] -= self._position_offset
3058
+ else:
3059
+ self._position_offset = None
3060
+
3061
+ def set_vocab(self):
3062
+ """Support BPE tokenizers for roberta models"""
3063
+ bpe_tok_path = self.dir_model / "tokenizer.json"
3064
+ if bpe_tok_path.exists():
3065
+ self._set_vocab_gpt2()
3066
+ self.gguf_writer.add_add_bos_token(True)
3067
+ self.gguf_writer.add_add_eos_token(True)
3068
+
3069
+ # we need this to validate the size of the token_type embeddings
3070
+ # though currently we are passing all zeros to the token_type embeddings
3071
+ # "Sequence A" or "Sequence B"
3072
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
3073
+
3074
+ else:
3075
+ return super().set_vocab()
3076
+
3077
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3078
+ # if name starts with "roberta.", remove the prefix
3079
+ # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
3080
+ if name.startswith("roberta."):
3081
+ name = name[8:]
3082
+
3083
+ # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
3084
+ if name == "embeddings.position_embeddings.weight":
3085
+ if self._position_offset is not None:
3086
+ data_torch = data_torch[self._position_offset:,:]
3087
+
3088
+ return super().modify_tensors(data_torch, name, bid)
3089
+
3090
+
2643
3091
  @Model.register("NomicBertModel")
2644
3092
  class NomicBertModel(BertModel):
2645
3093
  model_arch = gguf.MODEL_ARCH.NOMIC_BERT
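Editor's note: the RobertaModel subclass exists mostly to handle RoBERTa's position-id convention: usable positions start at pad_token_id + 1, so both max_position_embeddings and the position-embedding matrix are shifted by that offset. Numerically (sketch with made-up sizes):

    import torch

    # RoBERTa reserves positions 0..pad_token_id, so real positions start at
    # pad_token_id + 1; the converter chops the leading rows off the embedding matrix
    pad_token_id = 1
    position_offset = 1 + pad_token_id
    position_embd = torch.randn(514, 768)          # max_position_embeddings x hidden
    trimmed = position_embd[position_offset:, :]   # 512 x 768
    assert trimmed.shape[0] == 514 - position_offset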
@@ -2947,6 +3395,8 @@ class Rwkv6Model(Model):
  # required by llama.cpp, unused
  self.gguf_writer.add_head_count(0)

+ lerp_weights: dict[int, dict[str, Tensor]] = {}
+
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  new_name = self.map_tensor_name(name)

@@ -2959,14 +3409,87 @@ class Rwkv6Model(Model):
2959
3409
  if new_name.endswith("time_mix_w2.weight"):
2960
3410
  data_torch = data_torch.permute(0, 2, 1)
2961
3411
 
2962
- rescale_every_n_layers = self.hparams["rescale_every"]
2963
- if rescale_every_n_layers > 0:
2964
- if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
2965
- data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
3412
+ if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
3413
+ data_torch = data_torch.squeeze()
3414
+
3415
+ try:
3416
+ rescale_every_n_layers = self.hparams["rescale_every"]
3417
+ if rescale_every_n_layers > 0:
3418
+ if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
3419
+ data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
3420
+ except KeyError:
3421
+ pass
3422
+
3423
+ # concat time_mix_lerp weights to reduce some cpu overhead
3424
+ # also reduces the number of tensors in the model
3425
+ if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
3426
+ try:
3427
+ self.lerp_weights[bid][new_name] = data_torch
3428
+ except KeyError:
3429
+ self.lerp_weights[bid] = {new_name: data_torch}
3430
+ if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
3431
+ new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
3432
+ data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
3433
+ yield (new_name, data)
3434
+ return
2966
3435
 
2967
3436
  yield (new_name, data_torch)
2968
3437
 
2969
3438
 
3439
+ @Model.register("RWKV6Qwen2ForCausalLM")
3440
+ class RWKV6Qwen2Model(Rwkv6Model):
3441
+ model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
3442
+
3443
+ def set_vocab(self):
3444
+ try:
3445
+ self._set_vocab_sentencepiece()
3446
+ except FileNotFoundError:
3447
+ self._set_vocab_gpt2()
3448
+
3449
+ def set_gguf_parameters(self):
3450
+ block_count = self.hparams["num_hidden_layers"]
3451
+ num_attention_heads = self.hparams["num_attention_heads"]
3452
+ num_key_value_heads = self.hparams["num_key_value_heads"]
3453
+ hidden_size = self.hparams["hidden_size"]
3454
+ head_size = hidden_size // num_attention_heads
3455
+ rms_norm_eps = self.hparams["rms_norm_eps"]
3456
+ intermediate_size = self.hparams["intermediate_size"]
3457
+ time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
3458
+ time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
3459
+
3460
+ # RWKV isn't context limited
3461
+ self.gguf_writer.add_context_length(1048576)
3462
+ self.gguf_writer.add_embedding_length(hidden_size)
3463
+ self.gguf_writer.add_block_count(block_count)
3464
+ self.gguf_writer.add_wkv_head_size(head_size)
3465
+ self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
3466
+ self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
3467
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
3468
+ self.gguf_writer.add_file_type(self.ftype)
3469
+
3470
+ # special parameters for time_mixing in RWKV6QWEN2
3471
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
3472
+ self.gguf_writer.add_token_shift_count(1)
3473
+ # RWKV6QWEN2 use grouped key/value like GQA
3474
+ self.gguf_writer.add_head_count_kv(num_key_value_heads)
3475
+
3476
+ # required by llama.cpp, unused
3477
+ self.gguf_writer.add_head_count(0)
3478
+
3479
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3480
+ for new_name, data in super().modify_tensors(data_torch, name, bid):
3481
+ if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
3482
+ data = data.view(5, -1, data.shape[-1])
3483
+ # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
3484
+ # permute them here to avoid code changes
3485
+ data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
3486
+ if "w2" in new_name:
3487
+ data = data.view(5, -1, data.shape[-1])
3488
+ yield (new_name, data)
3489
+ continue
3490
+ yield (new_name, data)
3491
+
3492
+
2970
3493
  @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
2971
3494
  class MambaModel(Model):
2972
3495
  model_arch = gguf.MODEL_ARCH.MAMBA
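Editor's note: the Rwkv6Model change buffers the five time_mix_lerp_{w,k,v,r,g} vectors per block and emits them as one fused tensor (the TIME_MIX_LERP_FUSED tensor type added earlier in this diff), reducing per-token CPU overhead and tensor count. Shape-wise the fusion looks like this (sketch with a tiny n_embd):

    import torch

    # five per-channel mixing vectors (w, k, v, r, g), each of shape (n_embd,),
    # stacked into a single fused blk.N tensor
    n_embd = 8
    lerp = {name: torch.randn(n_embd) for name in ("w", "k", "v", "r", "g")}
    fused = torch.stack([lerp[n].unsqueeze(0) for n in ("w", "k", "v", "r", "g")], dim=0).unsqueeze(1)
    print(fused.shape)  # torch.Size([5, 1, 1, 8])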
@@ -3061,6 +3584,24 @@ class CommandR2Model(Model):
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)


+ @Model.register("Cohere2ForCausalLM")
+ class Cohere2Model(Model):
+ model_arch = gguf.MODEL_ARCH.COHERE2
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+ rotary_pct = self.hparams["rotary_pct"]
+ hidden_size = self.hparams["hidden_size"]
+ num_attention_heads = self.hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
  @Model.register("OlmoForCausalLM")
  @Model.register("OLMoForCausalLM")
  class OlmoModel(Model):
@@ -3427,7 +3968,99 @@ class ArcticModel(Model):
3427
3968
  raise ValueError(f"Unprocessed experts: {experts}")
3428
3969
 
3429
3970
 
3971
+ @Model.register("DeepseekForCausalLM")
3972
+ class DeepseekModel(Model):
3973
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK
3974
+
3975
+ def set_vocab(self):
3976
+ try:
3977
+ self._set_vocab_sentencepiece()
3978
+ except FileNotFoundError:
3979
+ self._set_vocab_gpt2()
3980
+
3981
+ def set_gguf_parameters(self):
3982
+ super().set_gguf_parameters()
3983
+ hparams = self.hparams
3984
+ if "head_dim" in hparams:
3985
+ rope_dim = hparams["head_dim"]
3986
+ else:
3987
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
3988
+
3989
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
3990
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
3991
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
3992
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3993
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
3994
+ self.gguf_writer.add_expert_weights_scale(1.0)
3995
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
3996
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
3997
+
3998
+ _experts: list[dict[str, Tensor]] | None = None
3999
+
4000
+ @staticmethod
4001
+ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
4002
+ if n_head_kv is not None and n_head != n_head_kv:
4003
+ n_head = n_head_kv
4004
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
4005
+ .swapaxes(1, 2)
4006
+ .reshape(weights.shape))
4007
+
4008
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4009
+ n_head = self.hparams["num_attention_heads"]
4010
+ n_kv_head = self.hparams.get("num_key_value_heads")
4011
+
4012
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
4013
+ data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
4014
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
4015
+ data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
4016
+
4017
+ # process the experts separately
4018
+ if name.find("mlp.experts") != -1:
4019
+ n_experts = self.hparams["n_routed_experts"]
4020
+ assert bid is not None
4021
+
4022
+ if self._experts is None:
4023
+ self._experts = [{} for _ in range(self.block_count)]
4024
+
4025
+ self._experts[bid][name] = data_torch
4026
+
4027
+ if len(self._experts[bid]) >= n_experts * 3:
4028
+ tensors: list[tuple[str, Tensor]] = []
4029
+
4030
+ # merge the experts into a single 3d tensor
4031
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
4032
+ datas: list[Tensor] = []
4033
+
4034
+ for xid in range(n_experts):
4035
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
4036
+ datas.append(self._experts[bid][ename])
4037
+ del self._experts[bid][ename]
4038
+
4039
+ data_torch = torch.stack(datas, dim=0)
4040
+
4041
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
4042
+
4043
+ new_name = self.map_tensor_name(merged_name)
4044
+
4045
+ tensors.append((new_name, data_torch))
4046
+ return tensors
4047
+ else:
4048
+ return []
4049
+
4050
+ return [(self.map_tensor_name(name), data_torch)]
4051
+
4052
+ def prepare_tensors(self):
4053
+ super().prepare_tensors()
4054
+
4055
+ if self._experts is not None:
4056
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
4057
+ experts = [k for d in self._experts for k in d.keys()]
4058
+ if len(experts) > 0:
4059
+ raise ValueError(f"Unprocessed experts: {experts}")
4060
+
4061
+
3430
4062
  @Model.register("DeepseekV2ForCausalLM")
4063
+ @Model.register("DeepseekV3ForCausalLM")
3431
4064
  class DeepseekV2Model(Model):
3432
4065
  model_arch = gguf.MODEL_ARCH.DEEPSEEK2
3433
4066
 
@@ -3449,6 +4082,15 @@ class DeepseekV2Model(Model):
  self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
  self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
  self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+ self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+ if hparams["scoring_func"] == "sigmoid":
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+ elif hparams["scoring_func"] == "softmax":
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+ else:
+ raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
  self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

  if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
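Editor's note: DeepSeek-V3 support adds two pieces of routing metadata: whether top-k expert weights are renormalized (norm_topk_prob) and which scoring function produces them (sigmoid for V3, softmax for earlier checkpoints). A rough numerical illustration of the two gating choices being recorded (sketch, not the runtime implementation):

    import torch

    logits = torch.tensor([2.0, 0.5, -1.0, 1.0])
    softmax_weights = torch.softmax(logits, dim=-1)       # softmax gating
    sigmoid_scores = torch.sigmoid(logits)                # sigmoid gating
    top2 = torch.topk(sigmoid_scores, k=2)
    sigmoid_weights = top2.values / top2.values.sum()     # renormalized over chosen experts
    print(softmax_weights, sigmoid_weights)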
@@ -3461,6 +4103,16 @@ class DeepseekV2Model(Model):
  _experts: list[dict[str, Tensor]] | None = None

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # rename e_score_correction_bias tensors
+ if name.endswith("e_score_correction_bias"):
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+ # skip Multi-Token Prediction (MTP) layers
+ block_count = self.hparams["num_hidden_layers"]
+ match = re.match(r"model.layers.(\d+)", name)
+ if match and int(match.group(1)) >= block_count:
+ return []
+
  # process the experts separately
  if name.find("mlp.experts") != -1:
  n_experts = self.hparams["n_routed_experts"]
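Editor's note: modify_tensors now also renames e_score_correction_bias and drops DeepSeek-V3's Multi-Token Prediction (MTP) layers, which sit after the regular decoder blocks in the checkpoint. The layer filter boils down to the following (sketch; the dots are escaped here for strictness, the hunk above uses them unescaped):

    import re

    def is_mtp_layer(tensor_name: str, num_hidden_layers: int) -> bool:
        # anything indexed at or beyond num_hidden_layers belongs to the MTP head
        m = re.match(r"model\.layers\.(\d+)\.", tensor_name)
        return bool(m) and int(m.group(1)) >= num_hidden_layers

    print(is_mtp_layer("model.layers.61.mlp.gate.weight", num_hidden_layers=61))  # True -> skip
    print(is_mtp_layer("model.layers.60.mlp.gate.weight", num_hidden_layers=61))  # False -> keep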
@@ -3871,7 +4523,7 @@ class JaisModel(Model):
  self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


- @Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+ @Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
  class ChatGLMModel(Model):
  model_arch = gguf.MODEL_ARCH.CHATGLM

@@ -3977,47 +4629,15 @@ class ChatGLMModel(Model):

  from transformers import AutoTokenizer
  tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
- vocab_size = hparams["padded_vocab_size"]
+ vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
  assert max(tokenizer.get_vocab().values()) < vocab_size

- tokpre = self.get_vocab_base_pre(tokenizer)
-
- merges = []
- vocab = {}
- mergeable_ranks = tokenizer.mergeable_ranks
- for token, rank in mergeable_ranks.items():
- vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
- if len(token) == 1:
- continue
- merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
- assert len(merged) >= 2 and len(merged) <= 7
- merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
-
- # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
- added_vocab = tokenizer.get_added_vocab()
- reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
-
- for i in range(vocab_size):
- if i not in reverse_vocab:
- tokens.append(f"[PAD{i}]")
- toktypes.append(gguf.TokenType.UNUSED)
- elif reverse_vocab[i] in added_vocab:
- tokens.append(reverse_vocab[i])
- if tokenizer.added_tokens_decoder[i].special:
- toktypes.append(gguf.TokenType.CONTROL)
- else:
- toktypes.append(gguf.TokenType.USER_DEFINED)
- else:
- tokens.append(reverse_vocab[i])
- toktypes.append(gguf.TokenType.NORMAL)
-
+ tokens, toktypes, tokpre = self.get_vocab_base()
  self.gguf_writer.add_tokenizer_model("gpt2")
  self.gguf_writer.add_tokenizer_pre(tokpre)
  self.gguf_writer.add_token_list(tokens)
  self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
- special_vocab.merges = merges
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
  # only add special tokens when they were not already loaded from config.json
  special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
  special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
@@ -4028,16 +4648,20 @@ class ChatGLMModel(Model):
  def set_gguf_parameters(self):
  n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
  n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
- n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+ n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
  self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
  self.gguf_writer.add_embedding_length(n_embed)
- self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
- self.gguf_writer.add_block_count(self.hparams["num_layers"])
+ self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
+ self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
  self.gguf_writer.add_head_count(n_head)
  self.gguf_writer.add_head_count_kv(n_head_kv)
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
  self.gguf_writer.add_file_type(self.ftype)
- self.gguf_writer.add_rope_dimension_count(64)
+ if "attention_dim" in self.hparams:
+ rope_dim = self.hparams["attention_dim"]
+ else:
+ rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
  self.gguf_writer.add_add_bos_token(False)
  rope_freq = 10000
  if "rope_ratio" in self.hparams:
@@ -4047,7 +4671,7 @@ class ChatGLMModel(Model):
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  del bid # unused

- if name.endswith(".rotary_pos_emb.inv_freq"):
+ if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
  return []

  name = name.removeprefix("transformer.")
@@ -4354,6 +4978,7 @@ def parse_args() -> argparse.Namespace:
  parser.add_argument(
  "model", type=Path,
  help="directory containing model file",
+ nargs="?",
  )
  parser.add_argument(
  "--use-temp-file", action="store_true",
@@ -4391,8 +5016,15 @@ def parse_args() -> argparse.Namespace:
  "--metadata", type=Path,
  help="Specify the path for an authorship metadata override file"
  )
+ parser.add_argument(
+ "--print-supported-models", action="store_true",
+ help="Print the supported models"
+ )

- return parser.parse_args()
+ args = parser.parse_args()
+ if not args.print_supported_models and args.model is None:
+ parser.error("the following arguments are required: model")
+ return args


 def split_str_to_n_bytes(split_str: str) -> int:
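Editor's note: making the model path optional (nargs="?") lets --print-supported-models run without a positional argument, while the explicit check after parse_args() preserves the old "model is required" error for every other invocation. A minimal standalone equivalent:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("model", nargs="?", help="directory containing model file")
    parser.add_argument("--print-supported-models", action="store_true")
    args = parser.parse_args(["--print-supported-models"])
    # the positional is only enforced when it is actually needed
    if not args.print_supported_models and args.model is None:
        parser.error("the following arguments are required: model")
    print(args)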
@@ -4416,6 +5048,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
  def main() -> None:
  args = parse_args()

+ if args.print_supported_models:
+ logger.error("Supported models:")
+ Model.print_registered_models()
+ sys.exit(0)
+
  if args.verbose:
  logging.basicConfig(level=logging.DEBUG)
  else: