bigdl-core-cpp 2.6.0b20241203__py3-none-manylinux2010_x86_64.whl → 2.6.0b20241211__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. bigdl/cpp/convert_hf_to_gguf.py +404 -37
  2. bigdl/cpp/convert_hf_to_gguf_update.py +25 -6
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +0 -4
  4. bigdl/cpp/convert_lora_to_gguf.py +11 -1
  5. bigdl/cpp/gguf-py/gguf/constants.py +276 -81
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +25 -1
  7. bigdl/cpp/gguf-py/gguf/lazy.py +0 -1
  8. bigdl/cpp/gguf-py/gguf/quants.py +81 -0
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +135 -23
  10. bigdl/cpp/libs/libggml.so +0 -0
  11. bigdl/cpp/libs/libllama.so +0 -0
  12. bigdl/cpp/libs/llama-batched +0 -0
  13. bigdl/cpp/libs/llama-bench +0 -0
  14. bigdl/cpp/libs/llama-cli +0 -0
  15. bigdl/cpp/libs/llama-embedding +0 -0
  16. bigdl/cpp/libs/llama-gguf +0 -0
  17. bigdl/cpp/libs/llama-llava-cli +0 -0
  18. bigdl/cpp/libs/llama-lookup +0 -0
  19. bigdl/cpp/libs/llama-ls-sycl-device +0 -0
  20. bigdl/cpp/libs/llama-minicpmv-cli +0 -0
  21. bigdl/cpp/libs/llama-perplexity +0 -0
  22. bigdl/cpp/libs/llama-quantize +0 -0
  23. bigdl/cpp/libs/llama-server +0 -0
  24. bigdl/cpp/libs/llama-simple +0 -0
  25. bigdl/cpp/libs/llama-speculative +0 -0
  26. bigdl/cpp/libs/llama-tokenize +0 -0
  27. bigdl/cpp/libs/ollama +0 -0
  28. {bigdl_core_cpp-2.6.0b20241203.dist-info → bigdl_core_cpp-2.6.0b20241211.dist-info}/METADATA +1 -1
  29. bigdl_core_cpp-2.6.0b20241211.dist-info/RECORD +44 -0
  30. bigdl_core_cpp-2.6.0b20241203.dist-info/RECORD +0 -44
  31. {bigdl_core_cpp-2.6.0b20241203.data → bigdl_core_cpp-2.6.0b20241211.data}/scripts/init-llama-cpp +0 -0
  32. {bigdl_core_cpp-2.6.0b20241203.data → bigdl_core_cpp-2.6.0b20241211.data}/scripts/init-ollama +0 -0
  33. {bigdl_core_cpp-2.6.0b20241203.dist-info → bigdl_core_cpp-2.6.0b20241211.dist-info}/WHEEL +0 -0
  34. {bigdl_core_cpp-2.6.0b20241203.dist-info → bigdl_core_cpp-2.6.0b20241211.dist-info}/top_level.txt +0 -0
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import ast
 import logging
 import argparse
 import contextlib
@@ -14,6 +15,7 @@ from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from itertools import chain
 
 import math
 import numpy as np
@@ -129,12 +131,14 @@ class Model:
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()
 
-        if len(self.part_names) > 1:
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
             self.tensor_names = set()
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            index_name += ".index.json"
             logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
+            with open(index_file, "r", encoding="utf-8") as f:
                 index: dict[str, Any] = json.load(f)
                 weight_map = index.get("weight_map")
                 if weight_map is None or not isinstance(weight_map, dict):
@@ -142,6 +146,7 @@ class Model:
                 self.tensor_names.update(weight_map.keys())
         else:
             self.tensor_names = tensor_names_from_parts
+            weight_map = {}
 
         for part_name in self.part_names:
             logger.info(f"gguf: loading model part '{part_name}'")
@@ -168,9 +173,17 @@ class Model:
                             data = LazyTorchTensor.from_eager(data)
                     yield name, data
 
-        # only verify tensor name presence; it doesn't matter if they are not in the right files
-        if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
-            raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
+        # verify tensor name presence and identify potentially missing files
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                 f"Missing tensors: {missing}\n"
+                                 f"Extra tensors: {extra}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -256,10 +269,14 @@ class Model:
 
         return False
 
+    # some models need extra generated tensors (like rope_freqs)
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        return ()
+
     def prepare_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
-        for name, data_torch in self.get_tensors():
+        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue
@@ -277,8 +294,13 @@ class Model:
                     bid = int(part)
                     break
 
-            for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray  # type hint
+            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                data = data_torch.squeeze().numpy()
+
+                # if data ends up empty, it means data_torch was a scalar tensor -> restore
+                if len(data.shape) == 0:
+                    data = data_torch.numpy()
+
                 n_dims = len(data.shape)
                 data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
@@ -296,12 +318,31 @@ class Model:
                             gguf.MODEL_TENSOR.POS_EMBD,
                             gguf.MODEL_TENSOR.TOKEN_TYPES,
                             gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                         )
                     )
-                    or not name.endswith(".weight")
+                    or not new_name.endswith(".weight")
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
                     if self.ftype == gguf.LlamaFileType.ALL_F32:
312
353
  data_qtype = gguf.GGMLQuantizationType.BF16
313
354
  elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
314
355
  data_qtype = gguf.GGMLQuantizationType.Q8_0
356
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
357
+ data_qtype = gguf.GGMLQuantizationType.TQ1_0
358
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
359
+ data_qtype = gguf.GGMLQuantizationType.TQ2_0
315
360
  else:
316
361
  raise ValueError(f"Unknown file type: {self.ftype.name}")
317
362
 
@@ -555,6 +600,9 @@ class Model:
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
             # ref: https://huggingface.co/databricks/dbrx-base
             res = "dbrx"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+            res = "jina-v1-en"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -600,6 +648,12 @@ class Model:
         if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
             res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
+        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+            # ref: https://huggingface.co/facebook/chameleon-7b
+            res = "chameleon"
 
         if res is None:
             logger.warning("\n")
@@ -1458,7 +1512,7 @@ class StableLMModel(Model):
             raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
@@ -1566,11 +1620,11 @@ class LlamaModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def prepare_tensors(self):
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -1593,8 +1647,9 @@ class LlamaModel(Model):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
+    def prepare_tensors(self):
         super().prepare_tensors()
 
         if self._experts is not None:
@@ -1616,15 +1671,16 @@ class BitnetModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
-        s = 1 / weight.abs().mean().clamp(min=1e-5)
-        weight = (weight * s).round().clamp(-1, 1) / s
-        scale = weight.abs().max().unsqueeze(0)
-        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
-        weight = torch.sign(weight).type(dtype)
-        return weight.type(dtype), scale.type(torch.float32)
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # TODO: multiply by the scale directly instead of inverting it twice
+        # (this is also unnecessarily doubly inverted upstream)
+        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
@@ -1639,11 +1695,9 @@ class BitnetModel(Model):
             gguf.MODEL_TENSOR.FFN_GATE,
         ]):
             # transform weight into 1/0/-1 (in fp32)
-            weight_torch, scale_torch = self.weight_quant(data_torch)
-            yield (new_name, weight_torch)
-            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")
@@ -1789,7 +1843,7 @@ class MiniCPMModel(Model):
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
-            n_head = n_kv_head
+            n_head //= n_kv_head
 
         return (
             weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -1812,6 +1866,59 @@ class MiniCPMModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("MiniCPM3ForCausalLM")
+class MiniCPM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.MINICPM3
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            rope_dims = self.hparams["qk_rope_head_dim"]
+
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
 @Model.register("QWenLMHeadModel")
 class QwenModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN
@@ -2111,6 +2218,13 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
 
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
+
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
         if rope_scaling is None:
2140
2254
  if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
2141
2255
  raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
2142
2256
 
2143
- self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
2144
- self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
2257
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
2258
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
2145
2259
 
2146
2260
 
2147
2261
  @Model.register("PlamoForCausalLM")
@@ -2503,7 +2617,7 @@ class NomicBertModel(BertModel):
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
 
-@Model.register("XLMRobertaModel")
+@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2601,6 +2715,11 @@ class XLMRobertaModel(BertModel):
         self.gguf_writer.add_add_eos_token(True)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
         # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
         if name == "embeddings.position_embeddings.weight":
             if self._position_offset is not None:
@@ -2712,6 +2831,86 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        if rescale_every_n_layers > 0:
+            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+
+        yield (new_name, data_torch)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
@@ -2834,6 +3033,66 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("OlmoeForCausalLM")
+class OlmoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("JinaBertModel", "JinaBertForMaskedLM")
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
@@ -2872,6 +3131,14 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "bert.", remove the prefix
+        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+        if name.startswith("bert."):
+            name = name[5:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
 
 @Model.register("OpenELMForCausalLM")
 class OpenELMModel(Model):
@@ -3812,11 +4079,11 @@ class ExaoneModel(Model):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
 
-    def prepare_tensors(self):
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -3839,9 +4106,107 @@ class ExaoneModel(Model):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
-        super().prepare_tensors()
+
+@Model.register("GraniteForCausalLM")
+class GraniteModel(LlamaModel):
+    """Conversion for IBM's GraniteForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def set_gguf_parameters(self):
+        """Granite uses standard llama parameters with the following differences:
+
+        - No head_dim support
+        - New multiplier params:
+            - attention_scale
+            - embedding_scale
+            - residual_scale
+            - logits_scaling
+        """
+        if head_dim := self.hparams.pop("head_dim", None):
+            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+        super().set_gguf_parameters()
+        # NOTE: Convert _multiplier params to _scale params for naming
+        # consistency
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@Model.register("ChameleonForConditionalGeneration")
+@Model.register("ChameleonForCausalLM")  # obsolete
+class ChameleonModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHAMELEON
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # ignore image tokenizer for now
+        # TODO: remove this once image support is implemented for Chameleon
+        if name.startswith("model.vqmodel"):
+            return []
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        hidden_dim = self.hparams.get("hidden_size")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if name.endswith(("q_norm.weight", "q_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
+        if name.endswith(("k_norm.weight", "k_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
+    @staticmethod
+    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
+        head_dim = hidden_dim // n_heads
+        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
+        data_torch = data_torch.repeat_interleave(n_heads, 0)
+        return data_torch
 
 
 ###### CONVERSION LOGIC ######
@@ -3924,8 +4289,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4012,6 +4377,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
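
Usage note on the --outtype change above: tq1_0 and tq2_0 map to the new ternary GGUF file types (MOSTLY_TQ1_0 / MOSTLY_TQ2_0). A minimal sketch, assuming the converter script shipped in this wheel (bigdl/cpp/convert_hf_to_gguf.py, per the file list) is run directly; the model and output paths are illustrative:

    # convert a Hugging Face checkpoint directory to a ternary-quantized GGUF file
    python bigdl/cpp/convert_hf_to_gguf.py /path/to/hf-model --outtype tq2_0 --outfile model-tq2_0.gguf

Per the TODO added in prepare_tensors, token embedding and output tensors are still written as F16 when a ternary outtype is selected.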