bigdl-core-cpp 2.6.0b20241204__py3-none-win_amd64.whl → 2.6.0b20241212__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert_hf_to_gguf.py +404 -37
- bigdl/cpp/convert_hf_to_gguf_update.py +25 -6
- bigdl/cpp/convert_llama_ggml_to_gguf.py +0 -4
- bigdl/cpp/convert_lora_to_gguf.py +11 -1
- bigdl/cpp/gguf-py/gguf/constants.py +276 -81
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +25 -1
- bigdl/cpp/gguf-py/gguf/lazy.py +0 -1
- bigdl/cpp/gguf-py/gguf/quants.py +81 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +135 -23
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/ggml.dll +0 -0
- bigdl/cpp/libs/libc++.dll +0 -0
- bigdl/cpp/libs/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama-lookup.exe +0 -0
- bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama-server.exe +0 -0
- bigdl/cpp/libs/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl_core_cpp-2.6.0b20241212.data/scripts/init-ollama.bat +16 -0
- {bigdl_core_cpp-2.6.0b20241204.dist-info → bigdl_core_cpp-2.6.0b20241212.dist-info}/METADATA +2 -1
- bigdl_core_cpp-2.6.0b20241212.dist-info/RECORD +49 -0
- {bigdl_core_cpp-2.6.0b20241204.dist-info → bigdl_core_cpp-2.6.0b20241212.dist-info}/WHEEL +1 -1
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
- bigdl_core_cpp-2.6.0b20241204.data/scripts/init-ollama.bat +0 -19
- bigdl_core_cpp-2.6.0b20241204.dist-info/RECORD +0 -54
- {bigdl_core_cpp-2.6.0b20241204.data → bigdl_core_cpp-2.6.0b20241212.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.6.0b20241204.data → bigdl_core_cpp-2.6.0b20241212.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.6.0b20241204.dist-info → bigdl_core_cpp-2.6.0b20241212.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_hf_to_gguf.py
CHANGED
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import ast
 import logging
 import argparse
 import contextlib
@@ -14,6 +15,7 @@ from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from itertools import chain
 
 import math
 import numpy as np
@@ -129,12 +131,14 @@ class Model:
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()
 
-        if …
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
             self.tensor_names = set()
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            index_name += ".index.json"
             logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open( …
+            with open(index_file, "r", encoding="utf-8") as f:
                 index: dict[str, Any] = json.load(f)
                 weight_map = index.get("weight_map")
                 if weight_map is None or not isinstance(weight_map, dict):
@@ -142,6 +146,7 @@ class Model:
                 self.tensor_names.update(weight_map.keys())
         else:
             self.tensor_names = tensor_names_from_parts
+            weight_map = {}
 
         for part_name in self.part_names:
             logger.info(f"gguf: loading model part '{part_name}'")
@@ -168,9 +173,17 @@ class Model:
                     data = LazyTorchTensor.from_eager(data)
                 yield name, data
 
-        # …
-        if len( …
-        …
+        # verify tensor name presence and identify potentially missing files
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                 f"Missing tensors: {missing}\n"
+                                 f"Extra tensors: {extra}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -256,10 +269,14 @@ class Model:
 
         return False
 
+    # some models need extra generated tensors (like rope_freqs)
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        return ()
+
     def prepare_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
-        for name, data_torch in self.get_tensors():
+        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue
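The new `generate_extra_tensors()` hook above returns an empty tuple by default, and `prepare_tensors()` now iterates `chain(self.generate_extra_tensors(), self.get_tensors())`, so a subclass can emit synthetic tensors (for example rope frequency factors) without overriding `prepare_tensors()`. A minimal sketch of that pattern follows; `ExampleModel` and its factor values are hypothetical and only illustrate the hook that `LlamaModel` and `Phi3MiniModel` use later in this diff.

```python
# Sketch only: a hypothetical subclass using the generate_extra_tensors() hook.
class ExampleModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        # pairs yielded here are written alongside the regular weights, because
        # prepare_tensors() chains this iterator in front of get_tensors()
        yield (
            self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS),
            torch.tensor([1.0, 1.0, 4.0], dtype=torch.float32),  # made-up values
        )
```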
@@ -277,8 +294,13 @@ class Model:
                     bid = int(part)
                     break
 
-            for new_name, …
-                data …
+            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                data = data_torch.squeeze().numpy()
+
+                # if data ends up empty, it means data_torch was a scalar tensor -> restore
+                if len(data.shape) == 0:
+                    data = data_torch.numpy()
+
                 n_dims = len(data.shape)
                 data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
@@ -296,12 +318,31 @@ class Model:
                             gguf.MODEL_TENSOR.POS_EMBD,
                             gguf.MODEL_TENSOR.TOKEN_TYPES,
                             gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                         )
                     )
-                    or not …
+                    or not new_name.endswith(".weight")
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
                     if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -312,6 +353,10 @@ class Model:
                         data_qtype = gguf.GGMLQuantizationType.BF16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
                     else:
                         raise ValueError(f"Unknown file type: {self.ftype.name}")
 
@@ -555,6 +600,9 @@ class Model:
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
             # ref: https://huggingface.co/databricks/dbrx-base
             res = "dbrx"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+            res = "jina-v1-en"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -600,6 +648,12 @@ class Model:
         if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
             res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
+        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+            # ref: https://huggingface.co/facebook/chameleon-7b
+            res = "chameleon"
 
         if res is None:
             logger.warning("\n")
@@ -1458,7 +1512,7 @@ class StableLMModel(Model):
             raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
@@ -1566,11 +1620,11 @@ class LlamaModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def …
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -1593,8 +1647,9 @@ class LlamaModel(Model):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-                …
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
+    def prepare_tensors(self):
         super().prepare_tensors()
 
         if self._experts is not None:
@@ -1616,15 +1671,16 @@ class BitnetModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
-        …
-        …
-        scale …
-        …
-        …
-        …
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # TODO: multiply by the scale directly instead of inverting it twice
+        # (this is also unnecessarily doubly inverted upstream)
+        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
@@ -1639,11 +1695,9 @@ class BitnetModel(Model):
             gguf.MODEL_TENSOR.FFN_GATE,
         ]):
             # transform weight into 1/0/-1 (in fp32)
-            …
-            …
-            …
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")
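For reference, the rounding performed by the rewritten `weight_quant()` can be reproduced on a toy tensor. The sketch below assumes PyTorch and uses made-up values; it is algebraically the same as the `(weight * iscale).round().clamp(-1, 1) / iscale` expression in the hunk above.

```python
import torch

# Toy illustration of BitnetModel.weight_quant(): weights are rounded to
# {-1, 0, +1} relative to a per-tensor scale, then scaled back to float.
w = torch.tensor([[0.31, -0.02, -0.45], [0.10, 0.27, -0.08]])
scale = w.abs().mean().clamp(min=1e-5)       # per-tensor scale
ternary = (w / scale).round().clamp(-1, 1)   # entries in {-1, 0, 1}
dequant = ternary * scale                    # what weight_quant() returns
```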
@@ -1789,7 +1843,7 @@ class MiniCPMModel(Model):
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
-            …
+            n_head //= n_kv_head
 
         return (
             weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -1812,6 +1866,59 @@ class MiniCPMModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("MiniCPM3ForCausalLM")
+class MiniCPM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.MINICPM3
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            rope_dims = self.hparams["qk_rope_head_dim"]
+
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
 @Model.register("QWenLMHeadModel")
 class QwenModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN
@@ -2111,6 +2218,13 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
 
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
+
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
         if rope_scaling is None:
@@ -2140,8 +2254,8 @@ class Phi3MiniModel(Model):
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
             raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
 
-        self. …
-        self. …
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 
 
 @Model.register("PlamoForCausalLM")
@@ -2503,7 +2617,7 @@ class NomicBertModel(BertModel):
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
 
-@Model.register("XLMRobertaModel")
+@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2601,6 +2715,11 @@ class XLMRobertaModel(BertModel):
         self.gguf_writer.add_add_eos_token(True)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
         # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
         if name == "embeddings.position_embeddings.weight":
             if self._position_offset is not None:
@@ -2712,6 +2831,86 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        if rescale_every_n_layers > 0:
+            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+
+        yield (new_name, data_torch)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
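`Rwkv6Model.set_vocab()` above parses `rwkv_vocab_v20230424.txt` line by line, taking everything between the first and last space-separated field as a Python literal and the last field as the token's byte length. A small standalone sketch of that parsing step follows; the sample line is illustrative, not copied from the real vocab file.

```python
import ast

# One vocab line has the shape "<id> <token literal> <byte length>".
line = "34 ' the' 4"                              # illustrative sample line
parts = line.split(' ')
token = ast.literal_eval(' '.join(parts[1:-1]))   # -> " the"
token = token.encode("utf-8") if isinstance(token, str) else token
token_len = int(parts[-1])                        # -> 4
assert len(token) == token_len
```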
@@ -2834,6 +3033,66 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("OlmoeForCausalLM")
+class OlmoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("JinaBertModel", "JinaBertForMaskedLM")
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
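`OlmoeModel.modify_tensors()` above buffers per-expert weights and, once all experts of a block have been seen, stacks them into a single 3-D tensor per projection. The effect of that merge on tensor shapes can be sketched as follows (toy sizes, assuming PyTorch):

```python
import torch

# Toy illustration of the expert merge: n_experts 2-D weights of shape
# (n_out, n_in) become one 3-D tensor of shape (n_experts, n_out, n_in).
n_experts, n_out, n_in = 4, 8, 16
per_expert = [torch.randn(n_out, n_in) for _ in range(n_experts)]
merged = torch.stack(per_expert, dim=0)
assert merged.shape == (n_experts, n_out, n_in)
```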
@@ -2872,6 +3131,14 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "bert.", remove the prefix
+        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+        if name.startswith("bert."):
+            name = name[5:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
 
 @Model.register("OpenELMForCausalLM")
 class OpenELMModel(Model):
@@ -3812,11 +4079,11 @@ class ExaoneModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
 
-    def …
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -3839,9 +4106,107 @@ class ExaoneModel(Model):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-                …
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
-        …
+
+@Model.register("GraniteForCausalLM")
+class GraniteModel(LlamaModel):
+    """Conversion for IBM's GraniteForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def set_gguf_parameters(self):
+        """Granite uses standard llama parameters with the following differences:
+
+        - No head_dim support
+        - New multiplier params:
+            - attention_scale
+            - embedding_scale
+            - residual_scale
+            - logits_scaling
+        """
+        if head_dim := self.hparams.pop("head_dim", None):
+            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+        super().set_gguf_parameters()
+        # NOTE: Convert _multiplier params to _scale params for naming
+        # consistency
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@Model.register("ChameleonForConditionalGeneration")
+@Model.register("ChameleonForCausalLM") # obsolete
+class ChameleonModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHAMELEON
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # ignore image tokenizer for now
+        # TODO: remove this once image support is implemented for Chameleon
+        if name.startswith("model.vqmodel"):
+            return []
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        hidden_dim = self.hparams.get("hidden_size")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if name.endswith(("q_norm.weight", "q_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
+        if name.endswith(("k_norm.weight", "k_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
+    @staticmethod
+    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
+        head_dim = hidden_dim // n_heads
+        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
+        data_torch = data_torch.repeat_interleave(n_heads, 0)
+        return data_torch
 
 
 ###### CONVERSION LOGIC ######
@@ -3924,8 +4289,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4012,6 +4377,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
 
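With the last two hunks, the converter accepts the ternary output types on the command line. A possible invocation is sketched below; the model directory is a placeholder, and per the help text above the output file name is derived from the input with `{ftype}` substituted unless an explicit path is given.

```
python convert_hf_to_gguf.py /path/to/hf-model --outtype tq2_0
```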