bigdl-core-cpp 2.6.0b20250319__py3-none-win_amd64.whl → 2.6.0b20250321__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert_hf_to_gguf.py +687 -60
- bigdl/cpp/convert_hf_to_gguf_update.py +46 -41
- bigdl/cpp/convert_lora_to_gguf.py +33 -5
- bigdl/cpp/gguf-py/gguf/constants.py +306 -123
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +31 -3
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +122 -25
- bigdl/cpp/gguf-py/gguf/utility.py +1 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +1 -1
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/ggml-base.dll +0 -0
- bigdl/cpp/libs/ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ggml.dll +0 -0
- bigdl/cpp/libs/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama-gemma3-cli.exe +0 -0
- bigdl/cpp/libs/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama-lookup.exe +0 -0
- bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama-server.exe +0 -0
- bigdl/cpp/libs/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ollama-lib.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/ollama_ggml.dll +0 -0
- bigdl/cpp/libs/ollama_llama.dll +0 -0
- bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
- {bigdl_core_cpp-2.6.0b20250319.dist-info → bigdl_core_cpp-2.6.0b20250321.dist-info}/METADATA +2 -2
- bigdl_core_cpp-2.6.0b20250321.dist-info/RECORD +57 -0
- {bigdl_core_cpp-2.6.0b20250319.dist-info → bigdl_core_cpp-2.6.0b20250321.dist-info}/WHEEL +1 -1
- bigdl_core_cpp-2.6.0b20250319.dist-info/RECORD +0 -57
- {bigdl_core_cpp-2.6.0b20250319.data → bigdl_core_cpp-2.6.0b20250321.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.6.0b20250319.data → bigdl_core_cpp-2.6.0b20250321.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.6.0b20250319.data → bigdl_core_cpp-2.6.0b20250321.data}/scripts/init-ollama.bat +0 -0
- {bigdl_core_cpp-2.6.0b20250319.dist-info → bigdl_core_cpp-2.6.0b20250321.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_hf_to_gguf.py
CHANGED
@@ -221,17 +221,17 @@ class Model:
         self.gguf_writer.add_context_length(n_ctx)
         logger.info(f"gguf: context length = {n_ctx}")

-        n_embd
-
-
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")

         if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")

-        n_head
-
-
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")

         if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
             self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -296,7 +296,9 @@ class Model:
                     break

             for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
-
+                # TODO: why do we squeeze here?
+                # data = data_torch.squeeze().numpy()
+                data = data_torch.numpy()

                 # if data ends up empty, it means data_torch was a scalar tensor -> restore
                 if len(data.shape) == 0:
@@ -324,6 +326,9 @@ class Model:
                             gguf.MODEL_TENSOR.TIME_MIX_W2,
                             gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
                             gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+                            gguf.MODEL_TENSOR.POSNET_NORM1,
+                            gguf.MODEL_TENSOR.POSNET_NORM2,
                         )
                     )
                     or not new_name.endswith(".weight")
@@ -473,6 +478,11 @@ class Model:
             return modelcls
         return func

+    @classmethod
+    def print_registered_models(cls):
+        for name in sorted(cls._model_classes.keys()):
+            logger.error(f"- {name}")
+
     @classmethod
     def from_model_architecture(cls, arch: str) -> type[Model]:
         try:
@@ -525,9 +535,19 @@ class Model:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                         token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
@@ -538,7 +558,7 @@ class Model:

     # NOTE: this function is generated by convert_hf_to_gguf_update.py
     # do not modify it manually!
-    # ref: https://github.com/
+    # ref: https://github.com/ggml-org/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -571,6 +591,9 @@ class Model:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
@@ -625,7 +648,7 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -664,6 +687,18 @@ class Model:
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
             res = "roberta-bpe"
+        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+            res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"
+        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+            res = "deepseek-r1-qwen"

         if res is None:
             logger.warning("\n")
@@ -673,7 +708,7 @@ class Model:
             logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("** - the pre-tokenization config has changed upstream")
             logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref: https://github.com/
+            logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh: {chkhsh}")
             logger.warning("**************************************************************************************")
@@ -686,6 +721,9 @@ class Model:
         return res
         # Marker: End get_vocab_base_pre

+    def _set_vocab_none(self) -> None:
+        self.gguf_writer.add_tokenizer_model("none")
+
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
@@ -1669,6 +1707,178 @@ class LlamaModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention-layer***
+            # if n_heads_in_group is not None, then
+            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+            # _num_heads[il] is num_attention_head
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+                self.gguf_writer.add_rope_freq_base(rope_theta)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else: # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
@@ -2024,6 +2234,44 @@ class Qwen2VLModel(Model):
             yield name, data


+@Model.register("WavTokenizerDec")
+class WavTokenizerDecModel(Model):
+    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if \
+                name.endswith("codebook.cluster_size") or \
+                name.endswith("codebook.embed_avg") or \
+                name.endswith("codebook.inited"):
+            logger.debug(f"Skipping {name!r}")
+            return []
+
+        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size (self.hparams["vocab_size"])
+        self.gguf_writer.add_features_length (self.hparams["n_embd_features"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
+        self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"])
+        self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"])
+
+        self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+        self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"])
+
+        self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+        self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"])
+
+        self.gguf_writer.add_causal_attention(False)
+
+
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2MOE
@@ -2152,6 +2400,15 @@ class Phi3MiniModel(Model):
     model_arch = gguf.MODEL_ARCH.PHI3

     def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
         from sentencepiece import SentencePieceProcessor

         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2268,7 +2525,11 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
-        self.
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)

     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
@@ -2310,6 +2571,63 @@ class Phi3MiniModel(Model):
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))


+@Model.register("PhiMoEForCausalLM")
+class PhiMoeModel(Phi3MiniModel):
+    model_arch = gguf.MODEL_ARCH.PHIMOE
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
     model_arch = gguf.MODEL_ARCH.PLAMO
@@ -2517,7 +2835,7 @@ class InternLM2Model(Model):
         if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
-            # https://github.com/
+            # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
             special_vocab.special_token_ids["eos"] = chat_eos_token_id
             logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
                            " in chat mode so that the conversation can end normally.")
@@ -2567,7 +2885,67 @@ class InternLM2Model(Model):
         return [(self.map_tensor_name(name), data_torch)]


-@Model.register("
+@Model.register("InternLM3ForCausalLM")
+class InternLM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
+                        if token_data.get("special"):
+                            token_id = int(token_id)
+                            token = token_data["content"]
+                            special_vocab._set_special_token(token, token_id)
+                            # update eos token
+                            if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
+                                special_vocab.special_token_ids["eos"] = token_id
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT

@@ -2633,13 +3011,73 @@ class BertModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused

+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
+
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return [] # we don't need these

+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]


+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT
@@ -2947,6 +3385,8 @@ class Rwkv6Model(Model):
         # required by llama.cpp, unused
         self.gguf_writer.add_head_count(0)

+    lerp_weights: dict[int, dict[str, Tensor]] = {}
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)

@@ -2959,14 +3399,87 @@ class Rwkv6Model(Model):
         if new_name.endswith("time_mix_w2.weight"):
             data_torch = data_torch.permute(0, 2, 1)

-
-
-
-
+        if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
+            data_torch = data_torch.squeeze()
+
+        try:
+            rescale_every_n_layers = self.hparams["rescale_every"]
+            if rescale_every_n_layers > 0:
+                if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                    data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+        except KeyError:
+            pass
+
+        # concat time_mix_lerp weights to reduce some cpu overhead
+        # also reduces the number of tensors in the model
+        if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
+            try:
+                self.lerp_weights[bid][new_name] = data_torch
+            except KeyError:
+                self.lerp_weights[bid] = {new_name: data_torch}
+            if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
+                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
+                yield (new_name, data)
+            return

         yield (new_name, data_torch)


+@Model.register("RWKV6Qwen2ForCausalLM")
+class RWKV6Qwen2Model(Rwkv6Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        num_key_value_heads = self.hparams["num_key_value_heads"]
+        hidden_size = self.hparams["hidden_size"]
+        head_size = hidden_size // num_attention_heads
+        rms_norm_eps = self.hparams["rms_norm_eps"]
+        intermediate_size = self.hparams["intermediate_size"]
+        time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # special parameters for time_mixing in RWKV6QWEN2
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_token_shift_count(1)
+        # RWKV6QWEN2 use grouped key/value like GQA
+        self.gguf_writer.add_head_count_kv(num_key_value_heads)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        for new_name, data in super().modify_tensors(data_torch, name, bid):
+            if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
+                data = data.view(5, -1, data.shape[-1])
+                # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
+                # permute them here to avoid code changes
+                data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
+                if "w2" in new_name:
+                    data = data.view(5, -1, data.shape[-1])
+                yield (new_name, data)
+                continue
+            yield (new_name, data)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
@@ -3061,6 +3574,24 @@ class CommandR2Model(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)


+@Model.register("Cohere2ForCausalLM")
+class Cohere2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COHERE2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        rotary_pct = self.hparams["rotary_pct"]
+        hidden_size = self.hparams["hidden_size"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 @Model.register("OlmoForCausalLM")
 @Model.register("OLMoForCausalLM")
 class OlmoModel(Model):
@@ -3427,7 +3958,99 @@ class ArcticModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("DeepseekForCausalLM")
+class DeepseekModel(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("DeepseekV2ForCausalLM")
+@Model.register("DeepseekV3ForCausalLM")
 class DeepseekV2Model(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -3449,6 +4072,15 @@ class DeepseekV2Model(Model):
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
         self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -3461,6 +4093,16 @@ class DeepseekV2Model(Model):
     _experts: list[dict[str, Tensor]] | None = None

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        block_count = self.hparams["num_hidden_layers"]
+        match = re.match(r"model.layers.(\d+)", name)
+        if match and int(match.group(1)) >= block_count:
+            return []
+
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
@@ -3871,7 +4513,7 @@ class JaisModel(Model):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


-@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(Model):
     model_arch = gguf.MODEL_ARCH.CHATGLM

@@ -3977,47 +4619,15 @@ class ChatGLMModel(Model):

         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams
+        vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
         assert max(tokenizer.get_vocab().values()) < vocab_size

-        tokpre = self.
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) >= 2 and len(merged) <= 7
-            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
-
-        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.get_added_vocab()
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
+        tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         # only add special tokens when they were not already loaded from config.json
         special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
@@ -4028,16 +4638,20 @@ class ChatGLMModel(Model):
     def set_gguf_parameters(self):
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
         self.gguf_writer.add_embedding_length(n_embed)
-        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
-        self.gguf_writer.add_block_count(self.hparams
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
+        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
         self.gguf_writer.add_file_type(self.ftype)
-        self.
+        if "attention_dim" in self.hparams:
+            rope_dim = self.hparams["attention_dim"]
+        else:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
         self.gguf_writer.add_add_bos_token(False)
         rope_freq = 10000
         if "rope_ratio" in self.hparams:
@@ -4047,7 +4661,7 @@ class ChatGLMModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused

-        if name.endswith(".rotary_pos_emb.inv_freq"):
+        if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
             return []

         name = name.removeprefix("transformer.")
@@ -4354,6 +4968,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "model", type=Path,
         help="directory containing model file",
+        nargs="?",
     )
     parser.add_argument(
         "--use-temp-file", action="store_true",
@@ -4391,8 +5006,15 @@ def parse_args() -> argparse.Namespace:
         "--metadata", type=Path,
         help="Specify the path for an authorship metadata override file"
     )
+    parser.add_argument(
+        "--print-supported-models", action="store_true",
+        help="Print the supported models"
+    )

-
+    args = parser.parse_args()
+    if not args.print_supported_models and args.model is None:
+        parser.error("the following arguments are required: model")
+    return args


 def split_str_to_n_bytes(split_str: str) -> int:
@@ -4416,6 +5038,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
 def main() -> None:
     args = parse_args()

+    if args.print_supported_models:
+        logger.error("Supported models:")
+        Model.print_registered_models()
+        sys.exit(0)
+
     if args.verbose:
         logging.basicConfig(level=logging.DEBUG)
     else: