bigdl-core-cpp 2.6.0b20250204__py3-none-win_amd64.whl → 2.6.0b20250206__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. bigdl/cpp/convert_hf_to_gguf.py +99 -44
  2. bigdl/cpp/convert_hf_to_gguf_update.py +4 -1
  3. bigdl/cpp/convert_lora_to_gguf.py +41 -11
  4. bigdl/cpp/gguf-py/gguf/constants.py +79 -18
  5. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
  7. bigdl/cpp/gguf-py/gguf/gguf_writer.py +36 -12
  8. bigdl/cpp/gguf-py/gguf/metadata.py +131 -19
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +17 -15
  10. bigdl/cpp/gguf-py/gguf/vocab.py +24 -2
  11. bigdl/cpp/libs/common.lib +0 -0
  12. bigdl/cpp/libs/ggml-cpu.dll +0 -0
  13. bigdl/cpp/libs/ggml-sycl.dll +0 -0
  14. bigdl/cpp/libs/ggml.dll +0 -0
  15. bigdl/cpp/libs/llama-batched.exe +0 -0
  16. bigdl/cpp/libs/llama-bench.exe +0 -0
  17. bigdl/cpp/libs/llama-cli.exe +0 -0
  18. bigdl/cpp/libs/llama-embedding.exe +0 -0
  19. bigdl/cpp/libs/llama-gguf.exe +0 -0
  20. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  21. bigdl/cpp/libs/llama-lookup.exe +0 -0
  22. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  23. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  24. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  25. bigdl/cpp/libs/llama-quantize.exe +0 -0
  26. bigdl/cpp/libs/llama-server.exe +0 -0
  27. bigdl/cpp/libs/llama-simple.exe +0 -0
  28. bigdl/cpp/libs/llama-speculative.exe +0 -0
  29. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  30. bigdl/cpp/libs/llama.dll +0 -0
  31. bigdl/cpp/libs/llava_shared.dll +0 -0
  32. bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
  33. bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
  34. bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
  35. bigdl/cpp/libs/ollama.exe +0 -0
  36. bigdl/cpp/libs/ollama_ggml.dll +0 -0
  37. bigdl/cpp/libs/ollama_llama.dll +0 -0
  38. bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
  39. {bigdl_core_cpp-2.6.0b20250204.data → bigdl_core_cpp-2.6.0b20250206.data}/scripts/init-ollama.bat +1 -1
  40. {bigdl_core_cpp-2.6.0b20250204.dist-info → bigdl_core_cpp-2.6.0b20250206.dist-info}/METADATA +1 -1
  41. bigdl_core_cpp-2.6.0b20250206.dist-info/RECORD +54 -0
  42. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/ipex_llm/ollama_llama_server.exe +0 -0
  43. bigdl_core_cpp-2.6.0b20250204.dist-info/RECORD +0 -50
  44. {bigdl_core_cpp-2.6.0b20250204.data → bigdl_core_cpp-2.6.0b20250206.data}/scripts/init-llama-cpp.bat +0 -0
  45. {bigdl_core_cpp-2.6.0b20250204.data → bigdl_core_cpp-2.6.0b20250206.data}/scripts/init-llama-cpp.ps1 +0 -0
  46. {bigdl_core_cpp-2.6.0b20250204.dist-info → bigdl_core_cpp-2.6.0b20250206.dist-info}/WHEEL +0 -0
  47. {bigdl_core_cpp-2.6.0b20250204.dist-info → bigdl_core_cpp-2.6.0b20250206.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_hf_to_gguf.py

@@ -72,7 +72,8 @@ class Model:
  def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
  use_temp_file: bool = False, eager: bool = False,
  metadata_override: Path | None = None, model_name: str | None = None,
- split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+ split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+ small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
  if type(self) is Model:
  raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -87,7 +88,7 @@ class Model:
  self.is_safetensors = len(self.part_names) > 0
  if not self.is_safetensors:
  self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
- self.hparams = Model.load_hparams(self.dir_model)
+ self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
  self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
  self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
  self.tensor_names = None
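The new hparams keyword lets a caller hand the converter a pre-loaded configuration dict instead of having it read config.json from dir_model; convert_lora_to_gguf.py (changed further below) uses this to pass a config fetched from the Hugging Face Hub. A minimal sketch, assuming the script is importable as a module and that ./base-model still holds the remaining tokenizer and tensor files (all paths and values are placeholders):

    from pathlib import Path
    import gguf
    from convert_hf_to_gguf import LlamaModel

    # A pre-loaded config.json, e.g. fetched from the Hub instead of read from disk.
    hparams = {
        "architectures": ["LlamaForCausalLM"],
        "num_hidden_layers": 32,
        "hidden_size": 4096,
        "num_attention_heads": 32,
    }
    model = LlamaModel(
        Path("./base-model"), gguf.LlamaFileType.MOSTLY_F16, Path("model-f16.gguf"),
        hparams=hparams,  # bypasses Model.load_hparams(dir_model)
    )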
@@ -573,6 +574,9 @@ class Model:
  if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
  # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
  res = "bert-bge"
+ if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+ # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+ res = "bert-bge-large"
  if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
  # ref: https://huggingface.co/mosaicml/mpt-7b
  res = "mpt"

@@ -654,6 +658,12 @@ class Model:
  if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
  # ref: https://huggingface.co/facebook/chameleon-7b
  res = "chameleon"
+ if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+ # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+ res = "minerva-7b"
+ if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+ # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+ res = "roberta-bpe"

  if res is None:
  logger.warning("\n")
@@ -1538,6 +1548,17 @@ class LlamaModel(Model):
  special_vocab._set_special_token("eot", 32010)
  special_vocab.add_to_gguf(self.gguf_writer)

+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ if "add_prefix_space" in tokenizer_config_json:
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+ # Apply to granite small models only
+ if self.hparams.get("vocab_size", 32000) == 49152:
+ self.gguf_writer.add_add_bos_token(False)
+
  def set_gguf_parameters(self):
  super().set_gguf_parameters()
  hparams = self.hparams
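The branch moved here records the tokenizer's add_prefix_space flag in the GGUF metadata and, for Granite small checkpoints (identified by their 49152-entry vocabulary), disables automatic BOS insertion. A minimal sketch of the key-value writes this produces, assuming a tokenizer_config.json containing {"add_prefix_space": false} (output path and arch are placeholders):

    import gguf

    writer = gguf.GGUFWriter("out.gguf", "llama")  # placeholder path/arch
    writer.add_add_space_prefix(False)             # -> tokenizer.ggml.add_space_prefix
    vocab_size = 49152                             # e.g. a Granite small model
    if vocab_size == 49152:
        writer.add_add_bos_token(False)            # -> tokenizer.ggml.add_bos_token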
@@ -1554,17 +1575,6 @@ class LlamaModel(Model):
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

- tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
- if tokenizer_config_file.is_file():
- with open(tokenizer_config_file, "r", encoding="utf-8") as f:
- tokenizer_config_json = json.load(f)
- if "add_prefix_space" in tokenizer_config_json:
- self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
- # Apply to granite small models only
- if self.hparams.get("vocab_size", 32000) == 49152:
- self.gguf_writer.add_add_bos_token(False)
-
  @staticmethod
  def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
  if n_head_kv is not None and n_head != n_head_kv:
@@ -1827,29 +1837,40 @@ class MiniCPMModel(Model):
  model_arch = gguf.MODEL_ARCH.MINICPM

  def set_gguf_parameters(self):
- block_count = self.hparams["num_hidden_layers"]
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
- self.gguf_writer.add_file_type(self.ftype)
+ super().set_gguf_parameters()
+ embedding_scale = float(self.hparams["scale_emb"])
+ self.gguf_writer.add_embedding_scale(embedding_scale)
+ logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+ residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+ self.gguf_writer.add_residual_scale(residual_scale)
+ logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+ logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+ self.gguf_writer.add_logit_scale(logit_scale)
+ logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+ if self.hparams.get("rope_scaling") is not None:
+ if self.hparams["rope_scaling"].get("type") == "longrope":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+ logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")

- def set_vocab(self):
- self._set_vocab_llama_hf()
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

- def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
- if n_kv_head is not None and n_head != n_kv_head:
- n_head //= n_kv_head
+ rope_scaling = self.find_hparam(['rope_scaling'], True)
+ if rope_scaling is not None:
+ long_factors = rope_scaling.get('long_factor', None)
+ short_factors = rope_scaling.get('short_factor', None)

- return (
- weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
- .swapaxes(1, 2)
- .reshape(weights.shape)
- )
+ if long_factors is None or short_factors is None:
+ raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+ if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  del bid # unused
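For the longrope case, the long_factor and short_factor lists each cover half of the per-head rotary dimensions, which is why the length check compares against rope_dims / 2. A worked example with illustrative MiniCPM3-like numbers (real values come from the model's config.json):

    hidden_size = 2560
    num_attention_heads = 40
    rope_dims = hidden_size // num_attention_heads    # 64 rotary dims per head
    long_factor = [1.0] * 32                          # rope_scaling["long_factor"]
    short_factor = [1.0] * 32                         # rope_scaling["short_factor"]
    assert len(long_factor) == len(short_factor) == rope_dims / 2   # 32 entries each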
@@ -1859,9 +1880,9 @@ class MiniCPMModel(Model):

  # HF models permute some of the tensors, so we need to undo that
  if name.endswith(("q_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
  if name.endswith(("k_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

  return [(self.map_tensor_name(name), data_torch)]
@@ -1971,6 +1992,37 @@ class Qwen2Model(Model):
  except FileNotFoundError:
  self._set_vocab_gpt2()

+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "yarn":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
+ @Model.register("Qwen2VLForConditionalGeneration")
+ class Qwen2VLModel(Model):
+ model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+ mrope_section += [0] * max(0, 4 - len(mrope_section))
+ self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+ def set_vocab(self):
+ try:
+ self._set_vocab_sentencepiece()
+ except FileNotFoundError:
+ self._set_vocab_gpt2()
+
+ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+ for name, data in super().get_tensors():
+ if name.startswith("visual."):
+ continue
+ yield name, data
+

  @Model.register("Qwen2MoeForCausalLM")
  class Qwen2MoeModel(Model):
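Qwen2-VL's rope_scaling config carries an mrope_section list that splits the rotary dimensions into sections (temporal, height and width for Qwen2-VL); the converter pads it to four entries before writing it under the new {arch}.rope.dimension_sections key. A small worked example with a typical value (for instance, Qwen2-VL-7B's config uses [16, 24, 24]):

    mrope_section = [16, 24, 24]
    mrope_section += [0] * max(0, 4 - len(mrope_section))
    print(mrope_section)  # [16, 24, 24, 0]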
@@ -2515,7 +2567,7 @@ class InternLM2Model(Model):
  return [(self.map_tensor_name(name), data_torch)]


- @Model.register("BertModel", "CamembertModel")
+ @Model.register("BertModel", "CamembertModel", "RobertaModel")
  class BertModel(Model):
  model_arch = gguf.MODEL_ARCH.BERT

@@ -2556,7 +2608,8 @@ class BertModel(Model):

  # we need this to validate the size of the token_type embeddings
  # though currently we are passing all zeros to the token_type embeddings
- self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
+ # "Sequence A" or "Sequence B"
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))

  # convert to phantom space vocab
  def phantom(tok):
@@ -2703,7 +2756,7 @@ class XLMRobertaModel(BertModel):
  self.gguf_writer.add_token_scores(scores)
  self.gguf_writer.add_token_types(toktypes)
  self.gguf_writer.add_add_space_prefix(add_prefix)
- self.gguf_writer.add_token_type_count(1)
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
  self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
  if precompiled_charsmap:
  self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)

@@ -2864,6 +2917,9 @@ class Rwkv6Model(Model):
  self.gguf_writer.add_token_list(tokens)
  self.gguf_writer.add_token_types(toktypes)
  special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+ special_vocab.chat_template = "rwkv-world"
+ # hack: Add '\n\n' as the EOT token to make it chat normally
+ special_vocab._set_special_token("eot", 261)
  special_vocab.add_to_gguf(self.gguf_writer)

  def set_gguf_parameters(self):
@@ -3033,6 +3089,11 @@ class OlmoModel(Model):
  return [(self.map_tensor_name(name), data_torch)]


+ @Model.register("Olmo2ForCausalLM")
+ class Olmo2Model(Model):
+ model_arch = gguf.MODEL_ARCH.OLMO2
+
+
  @Model.register("OlmoeForCausalLM")
  class OlmoeModel(Model):
  model_arch = gguf.MODEL_ARCH.OLMOE

@@ -3741,10 +3802,7 @@ class JaisModel(Model):

  # Embeddings scale
  self.embeddings_scale = 1.0
- # note: For some JAIS flavors, output is tied to (same as) wte in original model
- self.output_is_wte = False
  if 'mup_embeddings_scale' in self.hparams:
- self.output_is_wte = True # Hack (?)
  self.embeddings_scale = self.hparams['mup_embeddings_scale']
  elif 'embeddings_scale' in self.hparams:
  self.embeddings_scale = self.hparams['embeddings_scale']

@@ -3801,10 +3859,7 @@ class JaisModel(Model):

  if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
  tensors.append((new_name, data_torch * self.embeddings_scale))
- if self.output_is_wte:
- tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
  elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
- assert not self.output_is_wte
  tensors.append((new_name, data_torch * self.width_scale))
  else:
  tensors.append((new_name, data_torch))
bigdl/cpp/convert_hf_to_gguf_update.py

@@ -17,7 +17,7 @@
  #
  # python3 convert_hf_to_gguf_update.py <huggingface_token>
  #
- # - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
+ # - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
  # - Update llama.cpp with the new pre-tokenizer if necessary
  #
  # TODO: generate tokenizer tests for llama.cpp

@@ -72,6 +72,7 @@ models = [
  {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
  {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
  {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+ {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
  {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
  {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
  {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },

@@ -101,6 +102,8 @@ models = [
  {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
  {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
  {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
+ {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
+ {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
  ]

bigdl/cpp/convert_lora_to_gguf.py

@@ -12,6 +12,7 @@ import json
  from math import prod
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
+ from transformers import AutoConfig

  import torch

@@ -230,7 +231,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:

  def parse_args() -> argparse.Namespace:
  parser = argparse.ArgumentParser(
- description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+ description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
  parser.add_argument(
  "--outfile", type=Path,
  help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -256,17 +257,23 @@ def parse_args() -> argparse.Namespace:
  help="only print out what will be done, without writing any new files",
  )
  parser.add_argument(
- "--base", type=Path, required=True,
- help="directory containing base model file",
+ "--base", type=Path,
+ help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
  )
  parser.add_argument(
  "lora_path", type=Path,
- help="directory containing LoRA adapter file",
+ help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
  )

  return parser.parse_args()


+ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+ # normally, adapter does not come with base model config, we need to load it from AutoConfig
+ config = AutoConfig.from_pretrained(hf_model_id)
+ return config.to_dict()
+
+
  if __name__ == '__main__':
  args = parse_args()
  logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
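load_hparams_from_hf() fetches only the base model's configuration (not its weights) through transformers.AutoConfig, which is all the converter needs to recover the architecture, layer count and tensor-name map. For illustration (the model id is a placeholder and requires network access or a local cache):

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    hparams = config.to_dict()
    print(hparams["architectures"])      # e.g. ['LlamaForCausalLM']
    print(hparams["num_hidden_layers"])  # e.g. 22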
@@ -281,7 +288,7 @@ if __name__ == '__main__':

  ftype = ftype_map[args.outtype]

- dir_base_model: Path = args.base
+ dir_base_model: Path | None = args.base
  dir_lora: Path = args.lora_path
  lora_config = dir_lora / "adapter_config.json"
  input_model = dir_lora / "adapter_model.safetensors"

@@ -301,9 +308,29 @@ if __name__ == '__main__':
  input_model = os.path.join(dir_lora, "adapter_model.bin")
  lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

+ # load LoRA config
+ with open(lora_config, "r") as f:
+ lparams: dict[str, Any] = json.load(f)
+
  # load base model
- logger.info(f"Loading base model: {dir_base_model.name}")
- hparams = Model.load_hparams(dir_base_model)
+ if dir_base_model is None:
+ if "base_model_name_or_path" in lparams:
+ model_id = lparams["base_model_name_or_path"]
+ logger.info(f"Loading base model from Hugging Face: {model_id}")
+ try:
+ hparams = load_hparams_from_hf(model_id)
+ except OSError as e:
+ logger.error(f"Failed to load base model config: {e}")
+ logger.error("Please try downloading the base model and add its path to --base")
+ sys.exit(1)
+ else:
+ logger.error("'base_model_name_or_path' is not found in adapter_config.json")
+ logger.error("Base model config is required. Please download the base model and add its path to --base")
+ sys.exit(1)
+ else:
+ logger.info(f"Loading base model: {dir_base_model.name}")
+ hparams = Model.load_hparams(dir_base_model)
+
  with torch.inference_mode():
  try:
  model_class = Model.from_model_architecture(hparams["architectures"][0])
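With this change --base becomes optional: when it is omitted, the script reads base_model_name_or_path from the adapter's adapter_config.json and pulls only the config from the Hugging Face Hub, erroring out if neither source is available. Typical invocations might look like the following, where the first form supplies a local base config and the second relies on the new Hub lookup (paths are placeholders):

    python convert_lora_to_gguf.py ./my-lora-adapter --base ./base-model --outtype f16
    python convert_lora_to_gguf.py ./my-lora-adapter --outtype f16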
@@ -323,13 +350,15 @@ if __name__ == '__main__':
  self.dir_model_card = dir_lora_model
  self.lora_alpha = float(lora_alpha)

+ def set_vocab(self):
+ pass
+
  def set_type(self):
  self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
  self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

  def set_gguf_parameters(self):
  self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
- super().set_gguf_parameters()

  def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
  # Never add extra tensors (e.g. rope_freqs) for LoRA adapters

@@ -348,6 +377,9 @@ if __name__ == '__main__':
  if ".base_layer.weight" in name:
  continue
  logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+ if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
+ logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
+ logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
  sys.exit(1)

  if base_name in tensor_map:

@@ -381,9 +413,6 @@ if __name__ == '__main__':
  yield (dest_name + ".lora_a", lora_a)
  yield (dest_name + ".lora_b", lora_b)

- with open(lora_config, "r") as f:
- lparams: dict[str, Any] = json.load(f)
-
  alpha: float = lparams["lora_alpha"]

  model_instance = LoraModel(

@@ -396,6 +425,7 @@ if __name__ == '__main__':
  dry_run=args.dry_run,
  dir_lora_model=dir_lora,
  lora_alpha=alpha,
+ hparams=hparams,
  )

  logger.info("Exporting model...")
bigdl/cpp/gguf-py/gguf/constants.py

@@ -64,15 +64,27 @@ class Keys:
  BASE_MODEL_AUTHOR = "general.base_model.{id}.author"
  BASE_MODEL_VERSION = "general.base_model.{id}.version"
  BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization"
+ BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description"
  BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper
  BASE_MODEL_DOI = "general.base_model.{id}.doi"
  BASE_MODEL_UUID = "general.base_model.{id}.uuid"
  BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...)

+ # Dataset Source
+ DATASET_COUNT = "general.dataset.count"
+ DATASET_NAME = "general.dataset.{id}.name"
+ DATASET_AUTHOR = "general.dataset.{id}.author"
+ DATASET_VERSION = "general.dataset.{id}.version"
+ DATASET_ORGANIZATION = "general.dataset.{id}.organization"
+ DATASET_DESCRIPTION = "general.dataset.{id}.description"
+ DATASET_URL = "general.dataset.{id}.url" # Model Website/Paper
+ DATASET_DOI = "general.dataset.{id}.doi"
+ DATASET_UUID = "general.dataset.{id}.uuid"
+ DATASET_REPO_URL = "general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...)
+
  # Array based KV stores
  TAGS = "general.tags"
  LANGUAGES = "general.languages"
- DATASETS = "general.datasets"

  class LLM:
  VOCAB_SIZE = "{arch}.vocab_size"
@@ -119,6 +131,7 @@ class Keys:

  class Rope:
  DIMENSION_COUNT = "{arch}.rope.dimension_count"
+ DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
  FREQ_BASE = "{arch}.rope.freq_base"
  SCALING_TYPE = "{arch}.rope.scaling.type"
  SCALING_FACTOR = "{arch}.rope.scaling.factor"

@@ -152,6 +165,8 @@ class Keys:
  MERGES = "tokenizer.ggml.merges"
  BOS_ID = "tokenizer.ggml.bos_token_id"
  EOS_ID = "tokenizer.ggml.eos_token_id"
+ EOT_ID = "tokenizer.ggml.eot_token_id"
+ EOM_ID = "tokenizer.ggml.eom_token_id"
  UNK_ID = "tokenizer.ggml.unknown_token_id"
  SEP_ID = "tokenizer.ggml.seperator_token_id"
  PAD_ID = "tokenizer.ggml.padding_token_id"

@@ -168,11 +183,16 @@ class Keys:
  CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
  CHAT_TEMPLATES = "tokenizer.chat_templates"
  # FIM/Infill special tokens constants
+ FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
+ FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
+ FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id"
+ FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
+ FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
+ FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
+ # deprecated:
  PREFIX_ID = "tokenizer.ggml.prefix_token_id"
  SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
  MIDDLE_ID = "tokenizer.ggml.middle_token_id"
- EOT_ID = "tokenizer.ggml.eot_token_id"
- EOM_ID = "tokenizer.ggml.eom_token_id"

  class Adapter:
  TYPE = "adapter.type"
@@ -207,6 +227,7 @@ class MODEL_ARCH(IntEnum):
  QWEN = auto()
  QWEN2 = auto()
  QWEN2MOE = auto()
+ QWEN2VL = auto()
  PHI2 = auto()
  PHI3 = auto()
  PLAMO = auto()

@@ -224,6 +245,7 @@ class MODEL_ARCH(IntEnum):
  COMMAND_R = auto()
  DBRX = auto()
  OLMO = auto()
+ OLMO2 = auto()
  OLMOE = auto()
  OPENELM = auto()
  ARCTIC = auto()

@@ -368,6 +390,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
  MODEL_ARCH.QWEN: "qwen",
  MODEL_ARCH.QWEN2: "qwen2",
  MODEL_ARCH.QWEN2MOE: "qwen2moe",
+ MODEL_ARCH.QWEN2VL: "qwen2vl",
  MODEL_ARCH.PHI2: "phi2",
  MODEL_ARCH.PHI3: "phi3",
  MODEL_ARCH.PLAMO: "plamo",

@@ -385,6 +408,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
  MODEL_ARCH.COMMAND_R: "command-r",
  MODEL_ARCH.DBRX: "dbrx",
  MODEL_ARCH.OLMO: "olmo",
+ MODEL_ARCH.OLMO2: "olmo2",
  MODEL_ARCH.OLMOE: "olmoe",
  MODEL_ARCH.OPENELM: "openelm",
  MODEL_ARCH.ARCTIC: "arctic",
@@ -737,6 +761,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_UP,
  ],
  MODEL_ARCH.QWEN2: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
+ MODEL_ARCH.QWEN2VL: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
  MODEL_TENSOR.OUTPUT,

@@ -875,6 +914,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.OUTPUT,
  MODEL_TENSOR.OUTPUT_NORM,
  MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ROPE_FACTORS_LONG,
+ MODEL_TENSOR.ROPE_FACTORS_SHORT,
  MODEL_TENSOR.ATTN_NORM,
  MODEL_TENSOR.ATTN_Q,
  MODEL_TENSOR.ATTN_K,

@@ -1050,6 +1091,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN,
  MODEL_TENSOR.FFN_UP,
  ],
+ MODEL_ARCH.OLMO2: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.ATTN_POST_NORM,
+ MODEL_TENSOR.ATTN_Q_NORM,
+ MODEL_TENSOR.ATTN_K_NORM,
+ MODEL_TENSOR.FFN_POST_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
  MODEL_ARCH.OLMOE: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
@@ -1351,9 +1408,10 @@ class TokenType(IntEnum):


  class RopeScalingType(Enum):
- NONE = 'none'
- LINEAR = 'linear'
- YARN = 'yarn'
+ NONE = 'none'
+ LINEAR = 'linear'
+ YARN = 'yarn'
+ LONGROPE = 'longrope'


  class PoolingType(IntEnum):

@@ -1392,9 +1450,6 @@ class GGMLQuantizationType(IntEnum):
  F64 = 28
  IQ1_M = 29
  BF16 = 30
- Q4_0_4_4 = 31
- Q4_0_4_8 = 32
- Q4_0_8_8 = 33
  TQ1_0 = 34
  TQ2_0 = 35

@@ -1438,9 +1493,9 @@ class LlamaFileType(IntEnum):
  MOSTLY_IQ4_XS = 30 # except 1d tensors
  MOSTLY_IQ1_M = 31 # except 1d tensors
  MOSTLY_BF16 = 32 # except 1d tensors
- MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
- MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
- MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
+ # MOSTLY_Q4_0_4_4 = 33 # removed from gguf files, use Q4_0 and runtime repack
+ # MOSTLY_Q4_0_4_8 = 34 # removed from gguf files, use Q4_0 and runtime repack
+ # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack
  MOSTLY_TQ1_0 = 36 # except 1d tensors
  MOSTLY_TQ2_0 = 37 # except 1d tensors

@@ -1516,9 +1571,6 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
  GGMLQuantizationType.F64: (1, 8),
  GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
  GGMLQuantizationType.BF16: (1, 2),
- GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
- GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
- GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
  GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
  GGMLQuantizationType.TQ2_0: (256, 2 + 64),
  }

@@ -1579,6 +1631,8 @@ KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
  KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
  KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
  KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
+ KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
+ KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
  KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
  KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
  KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID

@@ -1586,8 +1640,15 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
  KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
  KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
  KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
- KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
+
+ KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
+ KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
+ KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
+ KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
+ KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
+ KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
+
+ # deprecated
+ KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
  KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
  KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
- KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
- KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
bigdl/cpp/gguf-py/gguf/gguf.py

@@ -12,4 +12,4 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
  importlib.invalidate_caches()
  import gguf # noqa: E402

- importlib.reload(gguf)
+ importlib.reload(gguf)

bigdl/cpp/gguf-py/gguf/gguf_reader.py

@@ -145,11 +145,10 @@ class GGUFReader:
  count = int(count)
  itemsize = int(np.empty([], dtype = dtype).itemsize)
  end_offs = offset + itemsize * count
- return (
- self.data[offset:end_offs]
- .view(dtype = dtype)[:count]
- .newbyteorder(override_order or self.byte_order)
- )
+ arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
+ if override_order is None:
+ return arr
+ return arr.view(arr.dtype.newbyteorder(override_order))

  def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
  if field.name in self.fields:
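The rewritten _get() avoids ndarray.newbyteorder(), which is no longer available as an array method in NumPy 2.x, and skips the byte-order step entirely when no override is requested; the replacement idiom is to re-view the array with a byte-swapped dtype. A small self-contained illustration:

    import numpy as np

    arr = np.arange(4, dtype=np.uint32)                  # native byte order
    swapped = arr.view(arr.dtype.newbyteorder("S"))      # same bytes, opposite-endian view
    print(arr.dtype.byteorder, swapped.dtype.byteorder)  # e.g. '=' '>' on little-endian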
@@ -314,4 +313,4 @@ class GGUFReader:
  data = self._get(data_offs, item_type, item_count).reshape(np_dims),
  field = field,
  ))
- self.tensors = tensors
+ self.tensors = tensors