bigdl-core-cpp 2.7.0b20250413__py3-none-manylinux2010_x86_64.whl → 2.7.0b20250414__py3-none-manylinux2010_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert_hf_to_gguf.py +697 -60
- bigdl/cpp/convert_hf_to_gguf_update.py +46 -41
- bigdl/cpp/convert_lora_to_gguf.py +33 -5
- bigdl/cpp/gguf-py/gguf/constants.py +344 -123
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +31 -3
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +122 -25
- bigdl/cpp/gguf-py/gguf/utility.py +1 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +1 -1
- bigdl/cpp/libs/libggml-base.so +0 -0
- bigdl/cpp/libs/libggml-cpu-alderlake.so +0 -0
- bigdl/cpp/libs/{libggml-cpu.so → libggml-cpu-haswell.so} +0 -0
- bigdl/cpp/libs/libggml-cpu-skylakex.so +0 -0
- bigdl/cpp/libs/libggml-sycl.so +0 -0
- bigdl/cpp/libs/libggml.so +0 -0
- bigdl/cpp/libs/libllama.so +0 -0
- bigdl/cpp/libs/libllava_shared.so +0 -0
- bigdl/cpp/libs/libsample.so +0 -0
- bigdl/cpp/libs/ollama-lib +0 -0
- {bigdl_core_cpp-2.7.0b20250413.dist-info → bigdl_core_cpp-2.7.0b20250414.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.7.0b20250414.dist-info/RECORD +37 -0
- bigdl/cpp/libs/libmllama.so +0 -0
- bigdl/cpp/libs/libollama-ggml-base.so +0 -0
- bigdl/cpp/libs/libollama-ggml-cpu.so +0 -0
- bigdl/cpp/libs/libollama-ggml-sycl.so +0 -0
- bigdl/cpp/libs/libollama_ggml.so +0 -0
- bigdl/cpp/libs/libollama_llama.so +0 -0
- bigdl/cpp/libs/libollama_llava_shared.so +0 -0
- bigdl/cpp/libs/llama-batched +0 -0
- bigdl/cpp/libs/llama-bench +0 -0
- bigdl/cpp/libs/llama-cli +0 -0
- bigdl/cpp/libs/llama-embedding +0 -0
- bigdl/cpp/libs/llama-gemma3-cli +0 -0
- bigdl/cpp/libs/llama-gguf +0 -0
- bigdl/cpp/libs/llama-llava-cli +0 -0
- bigdl/cpp/libs/llama-lookup +0 -0
- bigdl/cpp/libs/llama-ls-sycl-device +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli +0 -0
- bigdl/cpp/libs/llama-perplexity +0 -0
- bigdl/cpp/libs/llama-quantize +0 -0
- bigdl/cpp/libs/llama-server +0 -0
- bigdl/cpp/libs/llama-simple +0 -0
- bigdl/cpp/libs/llama-speculative +0 -0
- bigdl/cpp/libs/llama-tokenize +0 -0
- bigdl_core_cpp-2.7.0b20250413.dist-info/RECORD +0 -58
- {bigdl_core_cpp-2.7.0b20250413.data → bigdl_core_cpp-2.7.0b20250414.data}/scripts/init-llama-cpp +0 -0
- {bigdl_core_cpp-2.7.0b20250413.data → bigdl_core_cpp-2.7.0b20250414.data}/scripts/init-ollama +0 -0
- {bigdl_core_cpp-2.7.0b20250413.dist-info → bigdl_core_cpp-2.7.0b20250414.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.7.0b20250413.dist-info → bigdl_core_cpp-2.7.0b20250414.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_hf_to_gguf.py
CHANGED
@@ -221,17 +221,17 @@ class Model:
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")

-        n_embd
-
-
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")

         if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")

-        n_head
-
-
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")

         if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
             self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -296,7 +296,9 @@ class Model:
                     break

             for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
-
+                # TODO: why do we squeeze here?
+                # data = data_torch.squeeze().numpy()
+                data = data_torch.numpy()

                 # if data ends up empty, it means data_torch was a scalar tensor -> restore
                 if len(data.shape) == 0:
@@ -324,6 +326,9 @@ class Model:
                             gguf.MODEL_TENSOR.TIME_MIX_W2,
                             gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
                             gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+                            gguf.MODEL_TENSOR.POSNET_NORM1,
+                            gguf.MODEL_TENSOR.POSNET_NORM2,
                         )
                     )
                     or not new_name.endswith(".weight")
@@ -473,6 +478,11 @@ class Model:
             return modelcls
         return func

+    @classmethod
+    def print_registered_models(cls):
+        for name in sorted(cls._model_classes.keys()):
+            logger.error(f"- {name}")
+
     @classmethod
     def from_model_architecture(cls, arch: str) -> type[Model]:
         try:
@@ -525,9 +535,19 @@ class Model:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                         token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
@@ -538,7 +558,7 @@ class Model:

     # NOTE: this function is generated by convert_hf_to_gguf_update.py
     # do not modify it manually!
-    # ref: https://github.com/
+    # ref: https://github.com/ggml-org/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -571,6 +591,9 @@ class Model:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
@@ -625,7 +648,7 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -664,6 +687,18 @@ class Model:
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
             res = "roberta-bpe"
+        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+            res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"
+        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+            res = "deepseek-r1-qwen"

         if res is None:
             logger.warning("\n")
@@ -673,7 +708,7 @@ class Model:
             logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("** - the pre-tokenization config has changed upstream")
             logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref: https://github.com/
+            logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh: {chkhsh}")
             logger.warning("**************************************************************************************")
@@ -686,6 +721,9 @@ class Model:
         return res
     # Marker: End get_vocab_base_pre

+    def _set_vocab_none(self) -> None:
+        self.gguf_writer.add_tokenizer_model("none")
+
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
@@ -1669,6 +1707,178 @@ class LlamaModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams:  # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention-layer***
+            # if n_heads_in_group is not None, then
+            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+            # _num_heads[il] is num_attention_head
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams:  # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+                self.gguf_writer.add_rope_freq_base(rope_theta)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else:  # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams:  # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
@@ -2024,6 +2234,44 @@ class Qwen2VLModel(Model):
             yield name, data


+@Model.register("WavTokenizerDec")
+class WavTokenizerDecModel(Model):
+    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if \
+                name.endswith("codebook.cluster_size") or \
+                name.endswith("codebook.embed_avg") or \
+                name.endswith("codebook.inited"):
+            logger.debug(f"Skipping {name!r}")
+            return []
+
+        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size         (self.hparams["vocab_size"])
+        self.gguf_writer.add_features_length    (self.hparams["n_embd_features"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
+        self.gguf_writer.add_group_norm_eps     (self.hparams["group_norm_epsilon"])
+        self.gguf_writer.add_group_norm_groups  (self.hparams["group_norm_groups"])
+
+        self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+        self.gguf_writer.add_posnet_block_count     (self.hparams["posnet"]["n_layer"])
+
+        self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+        self.gguf_writer.add_convnext_block_count     (self.hparams["convnext"]["n_layer"])
+
+        self.gguf_writer.add_causal_attention(False)
+
+
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2MOE
@@ -2087,6 +2335,16 @@ class Qwen2MoeModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("Qwen3ForCausalLM")
+class Qwen3Model(Qwen2Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+
+@Model.register("Qwen3MoeForCausalLM")
+class Qwen3MoeModel(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3MOE
+
+
 @Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2
@@ -2152,6 +2410,15 @@ class Phi3MiniModel(Model):
     model_arch = gguf.MODEL_ARCH.PHI3

     def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
         from sentencepiece import SentencePieceProcessor

         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2268,7 +2535,11 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
-        self.
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)

     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
@@ -2310,6 +2581,63 @@ class Phi3MiniModel(Model):
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))


+@Model.register("PhiMoEForCausalLM")
+class PhiMoeModel(Phi3MiniModel):
+    model_arch = gguf.MODEL_ARCH.PHIMOE
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
     model_arch = gguf.MODEL_ARCH.PLAMO
@@ -2517,7 +2845,7 @@ class InternLM2Model(Model):
         if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
-            # https://github.com/
+            # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
             special_vocab.special_token_ids["eos"] = chat_eos_token_id
             logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
                            " in chat mode so that the conversation can end normally.")
@@ -2567,7 +2895,67 @@ class InternLM2Model(Model):
         return [(self.map_tensor_name(name), data_torch)]


-@Model.register("
+@Model.register("InternLM3ForCausalLM")
+class InternLM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
+                        if token_data.get("special"):
+                            token_id = int(token_id)
+                            token = token_data["content"]
+                            special_vocab._set_special_token(token, token_id)
+                            # update eos token
+                            if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
+                                special_vocab.special_token_ids["eos"] = token_id
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
@@ -2633,13 +3021,73 @@ class BertModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
+
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return []  # we don't need these

+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]


+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT
@@ -2947,6 +3395,8 @@ class Rwkv6Model(Model):
         # required by llama.cpp, unused
         self.gguf_writer.add_head_count(0)

+    lerp_weights: dict[int, dict[str, Tensor]] = {}
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)

@@ -2959,14 +3409,87 @@ class Rwkv6Model(Model):
         if new_name.endswith("time_mix_w2.weight"):
             data_torch = data_torch.permute(0, 2, 1)

-
-
-
-
+        if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
+            data_torch = data_torch.squeeze()
+
+        try:
+            rescale_every_n_layers = self.hparams["rescale_every"]
+            if rescale_every_n_layers > 0:
+                if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                    data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+        except KeyError:
+            pass
+
+        # concat time_mix_lerp weights to reduce some cpu overhead
+        # also reduces the number of tensors in the model
+        if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
+            try:
+                self.lerp_weights[bid][new_name] = data_torch
+            except KeyError:
+                self.lerp_weights[bid] = {new_name: data_torch}
+            if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
+                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
+                yield (new_name, data)
+            return

         yield (new_name, data_torch)


+@Model.register("RWKV6Qwen2ForCausalLM")
+class RWKV6Qwen2Model(Rwkv6Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        num_key_value_heads = self.hparams["num_key_value_heads"]
+        hidden_size = self.hparams["hidden_size"]
+        head_size = hidden_size // num_attention_heads
+        rms_norm_eps = self.hparams["rms_norm_eps"]
+        intermediate_size = self.hparams["intermediate_size"]
+        time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # special parameters for time_mixing in RWKV6QWEN2
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_token_shift_count(1)
+        # RWKV6QWEN2 use grouped key/value like GQA
+        self.gguf_writer.add_head_count_kv(num_key_value_heads)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        for new_name, data in super().modify_tensors(data_torch, name, bid):
+            if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
+                data = data.view(5, -1, data.shape[-1])
+                # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
+                # permute them here to avoid code changes
+                data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
+                if "w2" in new_name:
+                    data = data.view(5, -1, data.shape[-1])
+                yield (new_name, data)
+                continue
+            yield (new_name, data)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
@@ -3061,6 +3584,24 @@ class CommandR2Model(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)


+@Model.register("Cohere2ForCausalLM")
+class Cohere2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COHERE2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        rotary_pct = self.hparams["rotary_pct"]
+        hidden_size = self.hparams["hidden_size"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 @Model.register("OlmoForCausalLM")
 @Model.register("OLMoForCausalLM")
 class OlmoModel(Model):
@@ -3427,7 +3968,99 @@ class ArcticModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("DeepseekForCausalLM")
+class DeepseekModel(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("DeepseekV2ForCausalLM")
+@Model.register("DeepseekV3ForCausalLM")
 class DeepseekV2Model(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -3449,6 +4082,15 @@ class DeepseekV2Model(Model):
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
         self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -3461,6 +4103,16 @@ class DeepseekV2Model(Model):
     _experts: list[dict[str, Tensor]] | None = None

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        block_count = self.hparams["num_hidden_layers"]
+        match = re.match(r"model.layers.(\d+)", name)
+        if match and int(match.group(1)) >= block_count:
+            return []
+
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
@@ -3871,7 +4523,7 @@ class JaisModel(Model):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


-@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(Model):
     model_arch = gguf.MODEL_ARCH.CHATGLM

@@ -3977,47 +4629,15 @@ class ChatGLMModel(Model):

         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams
+        vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
         assert max(tokenizer.get_vocab().values()) < vocab_size

-        tokpre = self.
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) >= 2 and len(merged) <= 7
-            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
-
-        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.get_added_vocab()
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
+        tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         # only add special tokens when they were not already loaded from config.json
         special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
@@ -4028,16 +4648,20 @@ class ChatGLMModel(Model):
     def set_gguf_parameters(self):
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
         self.gguf_writer.add_embedding_length(n_embed)
-        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
-        self.gguf_writer.add_block_count(self.hparams
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
+        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
         self.gguf_writer.add_file_type(self.ftype)
-        self.
+        if "attention_dim" in self.hparams:
+            rope_dim = self.hparams["attention_dim"]
+        else:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
         self.gguf_writer.add_add_bos_token(False)
         rope_freq = 10000
         if "rope_ratio" in self.hparams:
@@ -4047,7 +4671,7 @@ class ChatGLMModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

-        if name.endswith(".rotary_pos_emb.inv_freq"):
+        if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
             return []

         name = name.removeprefix("transformer.")
@@ -4354,6 +4978,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "model", type=Path,
         help="directory containing model file",
+        nargs="?",
     )
     parser.add_argument(
         "--use-temp-file", action="store_true",
@@ -4391,8 +5016,15 @@ def parse_args() -> argparse.Namespace:
         "--metadata", type=Path,
         help="Specify the path for an authorship metadata override file"
     )
+    parser.add_argument(
+        "--print-supported-models", action="store_true",
+        help="Print the supported models"
+    )

-
+    args = parser.parse_args()
+    if not args.print_supported_models and args.model is None:
+        parser.error("the following arguments are required: model")
+    return args


 def split_str_to_n_bytes(split_str: str) -> int:
@@ -4416,6 +5048,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
 def main() -> None:
     args = parse_args()

+    if args.print_supported_models:
+        logger.error("Supported models:")
+        Model.print_registered_models()
+        sys.exit(0)
+
     if args.verbose:
         logging.basicConfig(level=logging.DEBUG)
     else:
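Note: taken together, the last three hunks make the positional `model` argument optional (`nargs="?"`) and validate it by hand, so that the new `--print-supported-models` flag can run without a model directory. A minimal standalone sketch of that argparse pattern (a hypothetical illustration, not code shipped in the package) would look like:

    import argparse
    from pathlib import Path

    parser = argparse.ArgumentParser()
    # optional positional argument, mirroring the nargs="?" change in the diff above
    parser.add_argument("model", type=Path, nargs="?", help="directory containing model file")
    parser.add_argument("--print-supported-models", action="store_true", help="Print the supported models")
    args = parser.parse_args()

    # the model directory is only mandatory when we are not just listing supported models
    if not args.print_supported_models and args.model is None:
        parser.error("the following arguments are required: model")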