bigdl-core-cpp 2.6.0b20250204__py3-none-win_amd64.whl → 2.6.0b20250206__py3-none-win_amd64.whl

Files changed (47)
  1. bigdl/cpp/convert_hf_to_gguf.py +99 -44
  2. bigdl/cpp/convert_hf_to_gguf_update.py +4 -1
  3. bigdl/cpp/convert_lora_to_gguf.py +41 -11
  4. bigdl/cpp/gguf-py/gguf/constants.py +79 -18
  5. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
  7. bigdl/cpp/gguf-py/gguf/gguf_writer.py +36 -12
  8. bigdl/cpp/gguf-py/gguf/metadata.py +131 -19
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +17 -15
  10. bigdl/cpp/gguf-py/gguf/vocab.py +24 -2
  11. bigdl/cpp/libs/common.lib +0 -0
  12. bigdl/cpp/libs/ggml-cpu.dll +0 -0
  13. bigdl/cpp/libs/ggml-sycl.dll +0 -0
  14. bigdl/cpp/libs/ggml.dll +0 -0
  15. bigdl/cpp/libs/llama-batched.exe +0 -0
  16. bigdl/cpp/libs/llama-bench.exe +0 -0
  17. bigdl/cpp/libs/llama-cli.exe +0 -0
  18. bigdl/cpp/libs/llama-embedding.exe +0 -0
  19. bigdl/cpp/libs/llama-gguf.exe +0 -0
  20. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  21. bigdl/cpp/libs/llama-lookup.exe +0 -0
  22. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  23. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  24. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  25. bigdl/cpp/libs/llama-quantize.exe +0 -0
  26. bigdl/cpp/libs/llama-server.exe +0 -0
  27. bigdl/cpp/libs/llama-simple.exe +0 -0
  28. bigdl/cpp/libs/llama-speculative.exe +0 -0
  29. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  30. bigdl/cpp/libs/llama.dll +0 -0
  31. bigdl/cpp/libs/llava_shared.dll +0 -0
  32. bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
  33. bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
  34. bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
  35. bigdl/cpp/libs/ollama.exe +0 -0
  36. bigdl/cpp/libs/ollama_ggml.dll +0 -0
  37. bigdl/cpp/libs/ollama_llama.dll +0 -0
  38. bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
  39. {bigdl_core_cpp-2.6.0b20250204.data → bigdl_core_cpp-2.6.0b20250206.data}/scripts/init-ollama.bat +1 -1
  40. {bigdl_core_cpp-2.6.0b20250204.dist-info → bigdl_core_cpp-2.6.0b20250206.dist-info}/METADATA +1 -1
  41. bigdl_core_cpp-2.6.0b20250206.dist-info/RECORD +54 -0
  42. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/ipex_llm/ollama_llama_server.exe +0 -0
  43. bigdl_core_cpp-2.6.0b20250204.dist-info/RECORD +0 -50
  44. {bigdl_core_cpp-2.6.0b20250204.data → bigdl_core_cpp-2.6.0b20250206.data}/scripts/init-llama-cpp.bat +0 -0
  45. {bigdl_core_cpp-2.6.0b20250204.data → bigdl_core_cpp-2.6.0b20250206.data}/scripts/init-llama-cpp.ps1 +0 -0
  46. {bigdl_core_cpp-2.6.0b20250204.dist-info → bigdl_core_cpp-2.6.0b20250206.dist-info}/WHEEL +0 -0
  47. {bigdl_core_cpp-2.6.0b20250204.dist-info → bigdl_core_cpp-2.6.0b20250206.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_hf_to_gguf.py

@@ -72,7 +72,8 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
@@ -87,7 +88,7 @@ class Model:
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-        self.hparams = Model.load_hparams(self.dir_model)
+        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
@@ -573,6 +574,9 @@ class Model:
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+            res = "bert-bge-large"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/mosaicml/mpt-7b
             res = "mpt"
@@ -654,6 +658,12 @@ class Model:
         if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
             # ref: https://huggingface.co/facebook/chameleon-7b
             res = "chameleon"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
+        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+            res = "roberta-bpe"
 
         if res is None:
             logger.warning("\n")
@@ -1538,6 +1548,17 @@ class LlamaModel(Model):
             special_vocab._set_special_token("eot", 32010)
             special_vocab.add_to_gguf(self.gguf_writer)
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -1554,17 +1575,6 @@ class LlamaModel(Model):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                if "add_prefix_space" in tokenizer_config_json:
-                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
-        # Apply to granite small models only
-        if self.hparams.get("vocab_size", 32000) == 49152:
-            self.gguf_writer.add_add_bos_token(False)
-
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1827,29 +1837,40 @@ class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
+        super().set_gguf_parameters()
+        embedding_scale = float(self.hparams["scale_emb"])
+        self.gguf_writer.add_embedding_scale(embedding_scale)
+        logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+        residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+        self.gguf_writer.add_residual_scale(residual_scale)
+        logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+        self.gguf_writer.add_logit_scale(logit_scale)
+        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+        if self.hparams.get("rope_scaling") is not None:
+            if self.hparams["rope_scaling"].get("type") == "longrope":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+                logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
 
-    def set_vocab(self):
-        self._set_vocab_llama_hf()
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 
-    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
-        if n_kv_head is not None and n_head != n_kv_head:
-            n_head //= n_kv_head
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
 
-        return (
-            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape)
-        )
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -1859,9 +1880,9 @@ class MiniCPMModel(Model):
 
         # HF models permute some of the tensors, so we need to undo that
         if name.endswith(("q_proj.weight")):
-            data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight")):
-            data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         return [(self.map_tensor_name(name), data_torch)]
 
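Note on the new MiniCPM longrope handling: generate_extra_tensors() only accepts factor arrays whose length is exactly rope_dims / 2, where rope_dims = hidden_size // num_attention_heads. A tiny illustration with made-up hyperparameters (not taken from any particular MiniCPM checkpoint):

# Illustrative only: hypothetical hparams, not from a real config.json
hparams = {"hidden_size": 2048, "num_attention_heads": 32}
rope_dims = hparams["hidden_size"] // hparams["num_attention_heads"]   # 64
long_factor = [1.0] * 32    # must hold rope_dims / 2 == 32 entries
short_factor = [1.0] * 32
assert len(long_factor) == len(short_factor) == rope_dims // 2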
@@ -1971,6 +1992,37 @@ class Qwen2Model(Model):
         except FileNotFoundError:
             self._set_vocab_gpt2()
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
+@Model.register("Qwen2VLForConditionalGeneration")
+class Qwen2VLModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, data in super().get_tensors():
+            if name.startswith("visual."):
+                continue
+            yield name, data
+
 
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
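For Qwen2VLModel, rope_scaling.mrope_section is padded to four entries before being written as the new {arch}.rope.dimension_sections key. A hedged sketch of that padding; the three-section value below is a stand-in, not read from a real config:

mrope_section = [16, 24, 24]            # hypothetical value from config.json
mrope_section += [0] * max(0, 4 - len(mrope_section))
assert mrope_section == [16, 24, 24, 0]  # what add_rope_dimension_sections() receives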
@@ -2515,7 +2567,7 @@ class InternLM2Model(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("BertModel", "CamembertModel")
+@Model.register("BertModel", "CamembertModel", "RobertaModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2556,7 +2608,8 @@ class BertModel(Model):
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
+        # "Sequence A" or "Sequence B"
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 
         # convert to phantom space vocab
         def phantom(tok):
@@ -2703,7 +2756,7 @@ class XLMRobertaModel(BertModel):
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
         self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(1)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
         self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
         if precompiled_charsmap:
             self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
@@ -2864,6 +2917,9 @@ class Rwkv6Model(Model):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
+        special_vocab._set_special_token("eot", 261)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
@@ -3033,6 +3089,11 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2
+
+
 @Model.register("OlmoeForCausalLM")
 class OlmoeModel(Model):
     model_arch = gguf.MODEL_ARCH.OLMOE
@@ -3741,10 +3802,7 @@ class JaisModel(Model):
 
         # Embeddings scale
        self.embeddings_scale = 1.0
-        # note: For some JAIS flavors, output is tied to (same as) wte in original model
-        self.output_is_wte = False
         if 'mup_embeddings_scale' in self.hparams:
-            self.output_is_wte = True # Hack (?)
             self.embeddings_scale = self.hparams['mup_embeddings_scale']
         elif 'embeddings_scale' in self.hparams:
             self.embeddings_scale = self.hparams['embeddings_scale']
@@ -3801,10 +3859,7 @@ class JaisModel(Model):
 
         if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
             tensors.append((new_name, data_torch * self.embeddings_scale))
-            if self.output_is_wte:
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
         elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
-            assert not self.output_is_wte
             tensors.append((new_name, data_torch * self.width_scale))
         else:
             tensors.append((new_name, data_torch))
bigdl/cpp/convert_hf_to_gguf_update.py

@@ -17,7 +17,7 @@
 #
 # python3 convert_hf_to_gguf_update.py <huggingface_token>
 #
-# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
+# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
@@ -72,6 +72,7 @@ models = [
     {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
     {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
     {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
@@ -101,6 +102,8 @@ models = [
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
     {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
     {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
+    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
+    {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
 ]
 
 
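The new models-list entries pair with the chkhsh branches added to get_vocab_base_pre() in convert_hf_to_gguf.py above. A rough sketch of how such a checksum is derived; chktxt stands for the fixed test string defined in convert_hf_to_gguf_update.py (elided here), and tokenizer_checksum is a hypothetical helper, not part of the script:

from hashlib import sha256
from transformers import AutoTokenizer

def tokenizer_checksum(repo_id: str, chktxt: str) -> str:
    # hash the token ids produced for the fixed test string; the hexdigest is
    # what the chkhsh comparisons in get_vocab_base_pre() match against
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    return sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()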
bigdl/cpp/convert_lora_to_gguf.py

@@ -12,6 +12,7 @@ import json
 from math import prod
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
+from transformers import AutoConfig
 
 import torch
 
@@ -230,7 +231,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+        description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
     parser.add_argument(
         "--outfile", type=Path,
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -256,17 +257,23 @@
         help="only print out what will be done, without writing any new files",
     )
     parser.add_argument(
-        "--base", type=Path, required=True,
-        help="directory containing base model file",
+        "--base", type=Path,
+        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
     )
     parser.add_argument(
         "lora_path", type=Path,
-        help="directory containing LoRA adapter file",
+        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
    )
 
     return parser.parse_args()
 
 
+def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+    # normally, adapter does not come with base model config, we need to load it from AutoConfig
+    config = AutoConfig.from_pretrained(hf_model_id)
+    return config.to_dict()
+
+
 if __name__ == '__main__':
     args = parse_args()
     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@@ -281,7 +288,7 @@ if __name__ == '__main__':
 
     ftype = ftype_map[args.outtype]
 
-    dir_base_model: Path = args.base
+    dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"
@@ -301,9 +308,29 @@ if __name__ == '__main__':
         input_model = os.path.join(dir_lora, "adapter_model.bin")
         lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
 
+    # load LoRA config
+    with open(lora_config, "r") as f:
+        lparams: dict[str, Any] = json.load(f)
+
     # load base model
-    logger.info(f"Loading base model: {dir_base_model.name}")
-    hparams = Model.load_hparams(dir_base_model)
+    if dir_base_model is None:
+        if "base_model_name_or_path" in lparams:
+            model_id = lparams["base_model_name_or_path"]
+            logger.info(f"Loading base model from Hugging Face: {model_id}")
+            try:
+                hparams = load_hparams_from_hf(model_id)
+            except OSError as e:
+                logger.error(f"Failed to load base model config: {e}")
+                logger.error("Please try downloading the base model and add its path to --base")
+                sys.exit(1)
+        else:
+            logger.error("'base_model_name_or_path' is not found in adapter_config.json")
+            logger.error("Base model config is required. Please download the base model and add its path to --base")
+            sys.exit(1)
+    else:
+        logger.info(f"Loading base model: {dir_base_model.name}")
+        hparams = Model.load_hparams(dir_base_model)
+
     with torch.inference_mode():
         try:
             model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -323,13 +350,15 @@ if __name__ == '__main__':
                 self.dir_model_card = dir_lora_model
                 self.lora_alpha = float(lora_alpha)
 
+            def set_vocab(self):
+                pass
+
             def set_type(self):
                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
 
             def set_gguf_parameters(self):
                 self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
-                super().set_gguf_parameters()
 
             def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
@@ -348,6 +377,9 @@ if __name__ == '__main__':
                         if ".base_layer.weight" in name:
                             continue
                         logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+                        if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
+                            logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
+                            logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                         sys.exit(1)
 
                     if base_name in tensor_map:
@@ -381,9 +413,6 @@ if __name__ == '__main__':
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)
 
-        with open(lora_config, "r") as f:
-            lparams: dict[str, Any] = json.load(f)
-
         alpha: float = lparams["lora_alpha"]
 
         model_instance = LoraModel(
@@ -396,6 +425,7 @@ if __name__ == '__main__':
             dry_run=args.dry_run,
             dir_lora_model=dir_lora,
             lora_alpha=alpha,
+            hparams=hparams,
         )
 
         logger.info("Exporting model...")
bigdl/cpp/gguf-py/gguf/constants.py

@@ -64,15 +64,27 @@ class Keys:
         BASE_MODEL_AUTHOR = "general.base_model.{id}.author"
         BASE_MODEL_VERSION = "general.base_model.{id}.version"
         BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization"
+        BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description"
         BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper
         BASE_MODEL_DOI = "general.base_model.{id}.doi"
         BASE_MODEL_UUID = "general.base_model.{id}.uuid"
         BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...)
 
+        # Dataset Source
+        DATASET_COUNT = "general.dataset.count"
+        DATASET_NAME = "general.dataset.{id}.name"
+        DATASET_AUTHOR = "general.dataset.{id}.author"
+        DATASET_VERSION = "general.dataset.{id}.version"
+        DATASET_ORGANIZATION = "general.dataset.{id}.organization"
+        DATASET_DESCRIPTION = "general.dataset.{id}.description"
+        DATASET_URL = "general.dataset.{id}.url" # Model Website/Paper
+        DATASET_DOI = "general.dataset.{id}.doi"
+        DATASET_UUID = "general.dataset.{id}.uuid"
+        DATASET_REPO_URL = "general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...)
+
         # Array based KV stores
         TAGS = "general.tags"
         LANGUAGES = "general.languages"
-        DATASETS = "general.datasets"
 
     class LLM:
         VOCAB_SIZE = "{arch}.vocab_size"
@@ -119,6 +131,7 @@ class Keys:
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
+        DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
         FREQ_BASE = "{arch}.rope.freq_base"
         SCALING_TYPE = "{arch}.rope.scaling.type"
         SCALING_FACTOR = "{arch}.rope.scaling.factor"
@@ -152,6 +165,8 @@ class Keys:
         MERGES = "tokenizer.ggml.merges"
         BOS_ID = "tokenizer.ggml.bos_token_id"
         EOS_ID = "tokenizer.ggml.eos_token_id"
+        EOT_ID = "tokenizer.ggml.eot_token_id"
+        EOM_ID = "tokenizer.ggml.eom_token_id"
         UNK_ID = "tokenizer.ggml.unknown_token_id"
         SEP_ID = "tokenizer.ggml.seperator_token_id"
         PAD_ID = "tokenizer.ggml.padding_token_id"
@@ -168,11 +183,16 @@ class Keys:
         CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES = "tokenizer.chat_templates"
         # FIM/Infill special tokens constants
+        FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
+        FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
+        FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id"
+        FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
+        FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
+        FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
+        # deprecated:
         PREFIX_ID = "tokenizer.ggml.prefix_token_id"
         SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"
-        EOT_ID = "tokenizer.ggml.eot_token_id"
-        EOM_ID = "tokenizer.ggml.eom_token_id"
 
     class Adapter:
         TYPE = "adapter.type"
@@ -207,6 +227,7 @@ class MODEL_ARCH(IntEnum):
     QWEN = auto()
     QWEN2 = auto()
     QWEN2MOE = auto()
+    QWEN2VL = auto()
     PHI2 = auto()
     PHI3 = auto()
     PLAMO = auto()
@@ -224,6 +245,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
+    OLMO2 = auto()
     OLMOE = auto()
     OPENELM = auto()
     ARCTIC = auto()
@@ -368,6 +390,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.QWEN: "qwen",
     MODEL_ARCH.QWEN2: "qwen2",
     MODEL_ARCH.QWEN2MOE: "qwen2moe",
+    MODEL_ARCH.QWEN2VL: "qwen2vl",
     MODEL_ARCH.PHI2: "phi2",
     MODEL_ARCH.PHI3: "phi3",
     MODEL_ARCH.PLAMO: "plamo",
@@ -385,6 +408,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.OLMO2: "olmo2",
     MODEL_ARCH.OLMOE: "olmoe",
     MODEL_ARCH.OPENELM: "openelm",
     MODEL_ARCH.ARCTIC: "arctic",
@@ -737,6 +761,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
     ],
     MODEL_ARCH.QWEN2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.QWEN2VL: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
@@ -875,6 +914,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
@@ -1050,6 +1091,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.OLMO2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.OLMOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -1351,9 +1408,10 @@ class TokenType(IntEnum):
 
 
 class RopeScalingType(Enum):
-    NONE = 'none'
-    LINEAR = 'linear'
-    YARN = 'yarn'
+    NONE     = 'none'
+    LINEAR   = 'linear'
+    YARN     = 'yarn'
+    LONGROPE = 'longrope'
 
 
 class PoolingType(IntEnum):
@@ -1392,9 +1450,6 @@ class GGMLQuantizationType(IntEnum):
     F64 = 28
     IQ1_M = 29
     BF16 = 30
-    Q4_0_4_4 = 31
-    Q4_0_4_8 = 32
-    Q4_0_8_8 = 33
     TQ1_0 = 34
     TQ2_0 = 35
 
@@ -1438,9 +1493,9 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ4_XS = 30 # except 1d tensors
     MOSTLY_IQ1_M = 31 # except 1d tensors
     MOSTLY_BF16 = 32 # except 1d tensors
-    MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
-    MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
-    MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
+    # MOSTLY_Q4_0_4_4 = 33 # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_4_8 = 34 # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack
     MOSTLY_TQ1_0 = 36 # except 1d tensors
     MOSTLY_TQ2_0 = 37 # except 1d tensors
 
@@ -1516,9 +1571,6 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F64: (1, 8),
     GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
     GGMLQuantizationType.BF16: (1, 2),
-    GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
-    GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
-    GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
     GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
     GGMLQuantizationType.TQ2_0: (256, 2 + 64),
 }
@@ -1579,6 +1631,8 @@ KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
 KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
 KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
 KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
+KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
 KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
 KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
 KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
@@ -1586,8 +1640,15 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
-KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
+
+KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
+KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
+KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
+KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
+KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
+KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
+
+# deprecated
+KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
 KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
 KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
-KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
-KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
bigdl/cpp/gguf-py/gguf/gguf.py

@@ -12,4 +12,4 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
 importlib.invalidate_caches()
 import gguf # noqa: E402
 
-importlib.reload(gguf)
+importlib.reload(gguf)
bigdl/cpp/gguf-py/gguf/gguf_reader.py

@@ -145,11 +145,10 @@ class GGUFReader:
         count = int(count)
         itemsize = int(np.empty([], dtype = dtype).itemsize)
         end_offs = offset + itemsize * count
-        return (
-            self.data[offset:end_offs]
-            .view(dtype = dtype)[:count]
-            .newbyteorder(override_order or self.byte_order)
-        )
+        arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
+        if override_order is None:
+            return arr
+        return arr.view(arr.dtype.newbyteorder(override_order))
 
     def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
         if field.name in self.fields:
@@ -314,4 +313,4 @@ class GGUFReader:
                 data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                 field = field,
             ))
-        self.tensors = tensors
+        self.tensors = tensors
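The _get() rewrite above avoids ndarray.newbyteorder(), which recent NumPy releases no longer provide, by viewing the array through a byte-swapped dtype instead. A small sketch of the idiom:

import numpy as np

arr = np.arange(4, dtype=np.uint32)
# same buffer, reinterpreted with the opposite byte order ("S" = swap the current order)
swapped = arr.view(arr.dtype.newbyteorder("S"))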