bigdl-core-cpp 2.1.0b20240820.post1__py3-none-win_amd64.whl → 2.2.0b20250217.post0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +908 -140
- bigdl/cpp/convert_hf_to_gguf_update.py +376 -0
- bigdl/cpp/convert_llama_ggml_to_gguf.py +450 -0
- bigdl/cpp/convert_lora_to_gguf.py +433 -0
- bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
- bigdl/cpp/gguf-py/gguf/constants.py +414 -89
- bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +77 -14
- bigdl/cpp/gguf-py/gguf/lazy.py +3 -1
- bigdl/cpp/gguf-py/gguf/metadata.py +195 -76
- bigdl/cpp/gguf-py/gguf/quants.py +1210 -64
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +156 -34
- bigdl/cpp/gguf-py/gguf/utility.py +1 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +325 -3
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/ggml-base.dll +0 -0
- bigdl/cpp/libs/ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ggml.dll +0 -0
- bigdl/cpp/libs/libc++.dll +0 -0
- bigdl/cpp/libs/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama-lookup.exe +0 -0
- bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama-server.exe +0 -0
- bigdl/cpp/libs/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ollama-lib.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/ollama_ggml.dll +0 -0
- bigdl/cpp/libs/ollama_llama.dll +0 -0
- bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
- {bigdl_core_cpp-2.1.0b20240820.post1.data → bigdl_core_cpp-2.2.0b20250217.post0.data}/scripts/init-llama-cpp.bat +7 -2
- bigdl_core_cpp-2.2.0b20250217.post0.data/scripts/init-ollama.bat +16 -0
- {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/METADATA +9 -5
- bigdl_core_cpp-2.2.0b20250217.post0.dist-info/RECORD +56 -0
- {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/WHEEL +1 -1
- bigdl/cpp/convert.py +0 -1714
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/ggml_shared.dll +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- bigdl_core_cpp-2.1.0b20240820.post1.data/scripts/init-ollama.bat +0 -13
- bigdl_core_cpp-2.1.0b20240820.post1.dist-info/RECORD +0 -63
- {bigdl_core_cpp-2.1.0b20240820.post1.data → bigdl_core_cpp-2.2.0b20250217.post0.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_hf_to_gguf.py (renamed from bigdl/cpp/convert-hf-to-gguf.py)

@@ -3,6 +3,7 @@

 from __future__ import annotations

+import ast
 import logging
 import argparse
 import contextlib
@@ -14,6 +15,7 @@ from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from itertools import chain

 import math
 import numpy as np
@@ -70,7 +72,8 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -85,7 +88,7 @@ class Model:
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-        self.hparams = Model.load_hparams(self.dir_model)
+        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
@@ -129,12 +132,14 @@ class Model:
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()

-        if
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
             self.tensor_names = set()
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            index_name += ".index.json"
             logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open(
+            with open(index_file, "r", encoding="utf-8") as f:
                 index: dict[str, Any] = json.load(f)
                 weight_map = index.get("weight_map")
                 if weight_map is None or not isinstance(weight_map, dict):
@@ -142,6 +147,7 @@ class Model:
                 self.tensor_names.update(weight_map.keys())
         else:
             self.tensor_names = tensor_names_from_parts
+            weight_map = {}

         for part_name in self.part_names:
             logger.info(f"gguf: loading model part '{part_name}'")
@@ -168,9 +174,17 @@ class Model:
                 data = LazyTorchTensor.from_eager(data)
             yield name, data

-        #
-        if len(
-
+        # verify tensor name presence and identify potentially missing files
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                 f"Missing tensors: {missing}\n"
+                                 f"Extra tensors: {extra}")

     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -251,20 +265,19 @@ class Model:

         return [(self.map_tensor_name(name), data_torch)]

-    def
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims # unused

         return False

-
-
-
-        return False
+    # some models need extra generated tensors (like rope_freqs)
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        return ()

     def prepare_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

-        for name, data_torch in self.get_tensors():
+        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue
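The `tensor_force_quant` and `generate_extra_tensors` hooks introduced in the hunk above are the extension points that the per-architecture classes later in this diff override. A minimal sketch of how a hypothetical subclass could use them (illustrative only: the class name, tensor values and quantization choice are made up, and it assumes the `Model` base class and `gguf` package from this wheel are importable):

```python
import torch
import gguf


class ExampleArchModel(Model):  # hypothetical subclass, not part of the package
    model_arch = gguf.MODEL_ARCH.LLAMA

    def generate_extra_tensors(self):
        # emit a tensor that is not present in the checkpoint (e.g. precomputed rope factors)
        placeholder = torch.ones(64, dtype=torch.float32)  # placeholder values
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), placeholder)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # pin norm weights to F32 regardless of the requested output type
        if new_name.endswith("_norm.weight"):
            return gguf.GGMLQuantizationType.F32
        return False
```

`prepare_tensors()` chains `generate_extra_tensors()` ahead of `get_tensors()`, and the value returned by `tensor_force_quant()` seeds the `data_qtype` decision shown in the next hunk.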
@@ -282,57 +295,78 @@ class Model:
                     bid = int(part)
                     break

-            for new_name,
-                data
-                n_dims = len(data.shape)
-                data_dtype = data.dtype
-                data_qtype: gguf.GGMLQuantizationType | None = None
+            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                data = data_torch.squeeze().numpy()

-                #
-
-
+                # if data ends up empty, it means data_torch was a scalar tensor -> restore
+                if len(data.shape) == 0:
+                    data = data_torch.numpy()
+
+                n_dims = len(data.shape)
+                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)

                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-
-
-                    extra_f32,
-                    n_dims == 1,
-                    new_name.endswith("_norm.weight"),
-                ))
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                    data_qtype = gguf.GGMLQuantizationType.F32

+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                 # Some tensor types are always in float32
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if data_qtype is False and (
+                    any(
+                        self.match_model_tensor_name(new_name, key, bid)
+                        for key in (
+                            gguf.MODEL_TENSOR.FFN_GATE_INP,
+                            gguf.MODEL_TENSOR.POS_EMBD,
+                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                            gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                        )
+                    )
+                    or not new_name.endswith(".weight")
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.F32

-
-
-
-
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16

-
-
-
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+                if isinstance(data_qtype, bool):
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                         data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")

-
-
-
-
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)

                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape

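The tail of the hunk above is where each tensor is actually quantized: the resolved `data_qtype` is applied with `gguf.quants.quantize`, and anything the Python quantizer cannot handle falls back to F16. A small standalone sketch of that same call pattern (the array shape and the Q8_0 choice are arbitrary examples, not values used by the converter):

```python
import numpy as np
import gguf

data = np.random.rand(64, 256).astype(np.float32)   # stand-in for a model tensor
data_qtype = gguf.GGMLQuantizationType.Q8_0          # requested output type

try:
    packed = gguf.quants.quantize(data, data_qtype)
except gguf.QuantError:
    # mirror the converter's behaviour: drop back to F16 when a type is unsupported
    data_qtype = gguf.GGMLQuantizationType.F16
    packed = gguf.quants.quantize(data, data_qtype)

# quantized data comes back as raw bytes; recover the logical shape the same way the script does
shape = gguf.quant_shape_from_byte_shape(packed.shape, data_qtype) if packed.dtype == np.uint8 else packed.shape
```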
@@ -540,6 +574,9 @@ class Model:
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+            res = "bert-bge-large"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/mosaicml/mpt-7b
             res = "mpt"
@@ -567,6 +604,9 @@ class Model:
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
             # ref: https://huggingface.co/databricks/dbrx-base
             res = "dbrx"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+            res = "jina-v1-en"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -603,6 +643,27 @@ class Model:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
+        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+            # ref: https://huggingface.co/facebook/chameleon-7b
+            res = "chameleon"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
+        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+            res = "roberta-bpe"

         if res is None:
             logger.warning("\n")
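The `chkhsh` values being matched in these hunks are SHA-256 fingerprints of how a tokenizer encodes a fixed probe text, which lets the converter pick a known pre-tokenizer preset. A rough sketch of that fingerprint-and-dispatch idea (the probe text, model id and single-entry table below are placeholders, not the values or the exact recipe the script uses; the one hash shown is taken from the diff above):

```python
from hashlib import sha256
from transformers import AutoTokenizer

probe_text = "Hello world 123"  # placeholder; the real script uses its own fixed probe string
tokenizer = AutoTokenizer.from_pretrained("some-org/some-model")  # hypothetical model id

# fingerprint the tokenizer's behaviour on the probe text
fingerprint = sha256(str(tokenizer.encode(probe_text)).encode()).hexdigest()

known_pretokenizers = {  # placeholder table in the spirit of the chkhsh checks above
    "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": "bert-bge",
}
res = known_pretokenizers.get(fingerprint)  # None means an unrecognised pre-tokenizer
```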
@@ -906,7 +967,7 @@ class GPTNeoXModel(Model):
|
|
906
967
|
return tensors
|
907
968
|
|
908
969
|
|
909
|
-
@Model.register("BloomForCausalLM")
|
970
|
+
@Model.register("BloomForCausalLM", "BloomModel")
|
910
971
|
class BloomModel(Model):
|
911
972
|
model_arch = gguf.MODEL_ARCH.BLOOM
|
912
973
|
|
@@ -1461,7 +1522,7 @@ class StableLMModel(Model):
|
|
1461
1522
|
raise ValueError(f"Unprocessed norms: {norms}")
|
1462
1523
|
|
1463
1524
|
|
1464
|
-
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
1525
|
+
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
1465
1526
|
class LlamaModel(Model):
|
1466
1527
|
model_arch = gguf.MODEL_ARCH.LLAMA
|
1467
1528
|
|
@@ -1487,6 +1548,17 @@ class LlamaModel(Model):
|
|
1487
1548
|
special_vocab._set_special_token("eot", 32010)
|
1488
1549
|
special_vocab.add_to_gguf(self.gguf_writer)
|
1489
1550
|
|
1551
|
+
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
1552
|
+
if tokenizer_config_file.is_file():
|
1553
|
+
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
1554
|
+
tokenizer_config_json = json.load(f)
|
1555
|
+
if "add_prefix_space" in tokenizer_config_json:
|
1556
|
+
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
1557
|
+
|
1558
|
+
# Apply to granite small models only
|
1559
|
+
if self.hparams.get("vocab_size", 32000) == 49152:
|
1560
|
+
self.gguf_writer.add_add_bos_token(False)
|
1561
|
+
|
1490
1562
|
def set_gguf_parameters(self):
|
1491
1563
|
super().set_gguf_parameters()
|
1492
1564
|
hparams = self.hparams
|
@@ -1503,17 +1575,6 @@ class LlamaModel(Model):
|
|
1503
1575
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
1504
1576
|
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
1505
1577
|
|
1506
|
-
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
1507
|
-
if tokenizer_config_file.is_file():
|
1508
|
-
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
1509
|
-
tokenizer_config_json = json.load(f)
|
1510
|
-
if "add_prefix_space" in tokenizer_config_json:
|
1511
|
-
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
1512
|
-
|
1513
|
-
# Apply to granite small models only
|
1514
|
-
if self.hparams.get("vocab_size", 32000) == 49152:
|
1515
|
-
self.gguf_writer.add_add_bos_token(False)
|
1516
|
-
|
1517
1578
|
@staticmethod
|
1518
1579
|
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
1519
1580
|
if n_head_kv is not None and n_head != n_head_kv:
|
@@ -1569,12 +1630,13 @@ class LlamaModel(Model):
|
|
1569
1630
|
|
1570
1631
|
return [(self.map_tensor_name(name), data_torch)]
|
1571
1632
|
|
1572
|
-
def
|
1633
|
+
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
1573
1634
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
1574
1635
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
1575
1636
|
base = self.hparams.get("rope_theta", 10000.0)
|
1576
|
-
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
1637
|
+
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
1577
1638
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
1639
|
+
|
1578
1640
|
factor = rope_scaling.get("factor", 8.0)
|
1579
1641
|
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
|
1580
1642
|
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
|
@@ -1595,8 +1657,9 @@ class LlamaModel(Model):
|
|
1595
1657
|
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
1596
1658
|
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
|
1597
1659
|
|
1598
|
-
|
1660
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
1599
1661
|
|
1662
|
+
def prepare_tensors(self):
|
1600
1663
|
super().prepare_tensors()
|
1601
1664
|
|
1602
1665
|
if self._experts is not None:
|
@@ -1618,15 +1681,16 @@ class BitnetModel(Model):
|
|
1618
1681
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
1619
1682
|
self.gguf_writer.add_rope_scaling_factor(1.0)
|
1620
1683
|
|
1621
|
-
def weight_quant(self, weight):
|
1684
|
+
def weight_quant(self, weight: Tensor) -> Tensor:
|
1622
1685
|
dtype = weight.dtype
|
1623
1686
|
weight = weight.float()
|
1624
|
-
|
1625
|
-
|
1626
|
-
scale
|
1627
|
-
|
1628
|
-
|
1629
|
-
|
1687
|
+
scale = weight.abs().mean().clamp(min=1e-5)
|
1688
|
+
iscale = 1 / scale
|
1689
|
+
# TODO: multiply by the scale directly instead of inverting it twice
|
1690
|
+
# (this is also unnecessarily doubly inverted upstream)
|
1691
|
+
# ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
|
1692
|
+
result = (weight * iscale).round().clamp(-1, 1) / iscale
|
1693
|
+
return result.type(dtype)
|
1630
1694
|
|
1631
1695
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
1632
1696
|
new_name = self.map_tensor_name(name)
|
@@ -1641,11 +1705,9 @@ class BitnetModel(Model):
|
|
1641
1705
|
gguf.MODEL_TENSOR.FFN_GATE,
|
1642
1706
|
]):
|
1643
1707
|
# transform weight into 1/0/-1 (in fp32)
|
1644
|
-
|
1645
|
-
|
1646
|
-
|
1647
|
-
else:
|
1648
|
-
yield (new_name, data_torch)
|
1708
|
+
data_torch = self.weight_quant(data_torch)
|
1709
|
+
|
1710
|
+
yield (new_name, data_torch)
|
1649
1711
|
|
1650
1712
|
|
1651
1713
|
@Model.register("GrokForCausalLM")
|
@@ -1764,7 +1826,7 @@ class DbrxModel(Model):
|
|
1764
1826
|
|
1765
1827
|
return [(new_name, data_torch)]
|
1766
1828
|
|
1767
|
-
def
|
1829
|
+
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
|
1768
1830
|
del name, new_name, bid # unused
|
1769
1831
|
|
1770
1832
|
return n_dims > 1
|
@@ -1775,29 +1837,40 @@ class MiniCPMModel(Model):
|
|
1775
1837
|
model_arch = gguf.MODEL_ARCH.MINICPM
|
1776
1838
|
|
1777
1839
|
def set_gguf_parameters(self):
|
1778
|
-
|
1779
|
-
|
1780
|
-
self.gguf_writer.
|
1781
|
-
|
1782
|
-
self.
|
1783
|
-
self.gguf_writer.
|
1784
|
-
|
1785
|
-
self.
|
1786
|
-
self.gguf_writer.
|
1787
|
-
|
1840
|
+
super().set_gguf_parameters()
|
1841
|
+
embedding_scale = float(self.hparams["scale_emb"])
|
1842
|
+
self.gguf_writer.add_embedding_scale(embedding_scale)
|
1843
|
+
logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
|
1844
|
+
residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
|
1845
|
+
self.gguf_writer.add_residual_scale(residual_scale)
|
1846
|
+
logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
|
1847
|
+
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
|
1848
|
+
self.gguf_writer.add_logit_scale(logit_scale)
|
1849
|
+
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
|
1850
|
+
if self.hparams.get("rope_scaling") is not None:
|
1851
|
+
if self.hparams["rope_scaling"].get("type") == "longrope":
|
1852
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
|
1853
|
+
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
|
1854
|
+
|
1855
|
+
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
1856
|
+
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
1788
1857
|
|
1789
|
-
|
1790
|
-
|
1858
|
+
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
1859
|
+
if rope_scaling is not None:
|
1860
|
+
long_factors = rope_scaling.get('long_factor', None)
|
1861
|
+
short_factors = rope_scaling.get('short_factor', None)
|
1791
1862
|
|
1792
|
-
|
1793
|
-
|
1794
|
-
n_head = n_kv_head
|
1863
|
+
if long_factors is None or short_factors is None:
|
1864
|
+
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
1795
1865
|
|
1796
|
-
|
1797
|
-
|
1798
|
-
|
1799
|
-
.
|
1800
|
-
|
1866
|
+
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
1867
|
+
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
1868
|
+
|
1869
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
|
1870
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
|
1871
|
+
|
1872
|
+
def set_vocab(self):
|
1873
|
+
self._set_vocab_sentencepiece()
|
1801
1874
|
|
1802
1875
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
1803
1876
|
del bid # unused
|
@@ -1807,13 +1880,66 @@ class MiniCPMModel(Model):
|
|
1807
1880
|
|
1808
1881
|
# HF models permute some of the tensors, so we need to undo that
|
1809
1882
|
if name.endswith(("q_proj.weight")):
|
1810
|
-
data_torch =
|
1883
|
+
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
1811
1884
|
if name.endswith(("k_proj.weight")):
|
1812
|
-
data_torch =
|
1885
|
+
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
1813
1886
|
|
1814
1887
|
return [(self.map_tensor_name(name), data_torch)]
|
1815
1888
|
|
1816
1889
|
|
1890
|
+
@Model.register("MiniCPM3ForCausalLM")
|
1891
|
+
class MiniCPM3Model(Model):
|
1892
|
+
model_arch = gguf.MODEL_ARCH.MINICPM3
|
1893
|
+
|
1894
|
+
def set_gguf_parameters(self):
|
1895
|
+
hparams = self.hparams
|
1896
|
+
|
1897
|
+
self.gguf_writer.add_file_type(self.ftype)
|
1898
|
+
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
1899
|
+
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
1900
|
+
self.gguf_writer.add_block_count(self.block_count)
|
1901
|
+
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
1902
|
+
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
1903
|
+
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
|
1904
|
+
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
1905
|
+
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
1906
|
+
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
|
1907
|
+
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
|
1908
|
+
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
|
1909
|
+
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
|
1910
|
+
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
1911
|
+
|
1912
|
+
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
1913
|
+
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
1914
|
+
if rope_scaling is not None:
|
1915
|
+
rope_dims = self.hparams["qk_rope_head_dim"]
|
1916
|
+
|
1917
|
+
long_factors = rope_scaling.get('long_factor', None)
|
1918
|
+
short_factors = rope_scaling.get('short_factor', None)
|
1919
|
+
|
1920
|
+
if long_factors is None or short_factors is None:
|
1921
|
+
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
1922
|
+
|
1923
|
+
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
1924
|
+
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
1925
|
+
|
1926
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
|
1927
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
|
1928
|
+
|
1929
|
+
def set_vocab(self):
|
1930
|
+
self._set_vocab_sentencepiece()
|
1931
|
+
|
1932
|
+
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
1933
|
+
if n_kv_head is not None and n_head != n_kv_head:
|
1934
|
+
n_head //= n_kv_head
|
1935
|
+
|
1936
|
+
return (
|
1937
|
+
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
1938
|
+
.swapaxes(1, 2)
|
1939
|
+
.reshape(weights.shape)
|
1940
|
+
)
|
1941
|
+
|
1942
|
+
|
1817
1943
|
@Model.register("QWenLMHeadModel")
|
1818
1944
|
class QwenModel(Model):
|
1819
1945
|
model_arch = gguf.MODEL_ARCH.QWEN
|
@@ -1866,6 +1992,37 @@ class Qwen2Model(Model):
|
|
1866
1992
|
except FileNotFoundError:
|
1867
1993
|
self._set_vocab_gpt2()
|
1868
1994
|
|
1995
|
+
def set_gguf_parameters(self):
|
1996
|
+
super().set_gguf_parameters()
|
1997
|
+
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
1998
|
+
if self.hparams["rope_scaling"].get("type") == "yarn":
|
1999
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
2000
|
+
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
2001
|
+
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
2002
|
+
|
2003
|
+
|
2004
|
+
@Model.register("Qwen2VLForConditionalGeneration")
|
2005
|
+
class Qwen2VLModel(Model):
|
2006
|
+
model_arch = gguf.MODEL_ARCH.QWEN2VL
|
2007
|
+
|
2008
|
+
def set_gguf_parameters(self):
|
2009
|
+
super().set_gguf_parameters()
|
2010
|
+
mrope_section = self.hparams["rope_scaling"]["mrope_section"]
|
2011
|
+
mrope_section += [0] * max(0, 4 - len(mrope_section))
|
2012
|
+
self.gguf_writer.add_rope_dimension_sections(mrope_section)
|
2013
|
+
|
2014
|
+
def set_vocab(self):
|
2015
|
+
try:
|
2016
|
+
self._set_vocab_sentencepiece()
|
2017
|
+
except FileNotFoundError:
|
2018
|
+
self._set_vocab_gpt2()
|
2019
|
+
|
2020
|
+
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
2021
|
+
for name, data in super().get_tensors():
|
2022
|
+
if name.startswith("visual."):
|
2023
|
+
continue
|
2024
|
+
yield name, data
|
2025
|
+
|
1869
2026
|
|
1870
2027
|
@Model.register("Qwen2MoeForCausalLM")
|
1871
2028
|
class Qwen2MoeModel(Model):
|
@@ -2113,6 +2270,13 @@ class Phi3MiniModel(Model):
|
|
2113
2270
|
self.gguf_writer.add_file_type(self.ftype)
|
2114
2271
|
self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
|
2115
2272
|
|
2273
|
+
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
2274
|
+
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
2275
|
+
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
2276
|
+
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
2277
|
+
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
2278
|
+
rope_dims = n_embd // n_head
|
2279
|
+
|
2116
2280
|
# write rope scaling for long context (128k) model
|
2117
2281
|
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
2118
2282
|
if rope_scaling is None:
|
@@ -2142,8 +2306,8 @@ class Phi3MiniModel(Model):
|
|
2142
2306
|
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
2143
2307
|
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
2144
2308
|
|
2145
|
-
self.
|
2146
|
-
self.
|
2309
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
|
2310
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
|
2147
2311
|
|
2148
2312
|
|
2149
2313
|
@Model.register("PlamoForCausalLM")
|
@@ -2403,7 +2567,7 @@ class InternLM2Model(Model):
|
|
2403
2567
|
return [(self.map_tensor_name(name), data_torch)]
|
2404
2568
|
|
2405
2569
|
|
2406
|
-
@Model.register("BertModel", "CamembertModel")
|
2570
|
+
@Model.register("BertModel", "CamembertModel", "RobertaModel")
|
2407
2571
|
class BertModel(Model):
|
2408
2572
|
model_arch = gguf.MODEL_ARCH.BERT
|
2409
2573
|
|
@@ -2444,7 +2608,8 @@ class BertModel(Model):
|
|
2444
2608
|
|
2445
2609
|
# we need this to validate the size of the token_type embeddings
|
2446
2610
|
# though currently we are passing all zeros to the token_type embeddings
|
2447
|
-
|
2611
|
+
# "Sequence A" or "Sequence B"
|
2612
|
+
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
2448
2613
|
|
2449
2614
|
# convert to phantom space vocab
|
2450
2615
|
def phantom(tok):
|
@@ -2505,6 +2670,117 @@ class NomicBertModel(BertModel):
|
|
2505
2670
|
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
2506
2671
|
|
2507
2672
|
|
2673
|
+
@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
2674
|
+
class XLMRobertaModel(BertModel):
|
2675
|
+
model_arch = gguf.MODEL_ARCH.BERT
|
2676
|
+
|
2677
|
+
def __init__(self, *args, **kwargs):
|
2678
|
+
super().__init__(*args, **kwargs)
|
2679
|
+
|
2680
|
+
# we need the pad_token_id to know how to chop down position_embd matrix
|
2681
|
+
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
|
2682
|
+
self._position_offset = 1 + pad_token_id
|
2683
|
+
if "max_position_embeddings" in self.hparams:
|
2684
|
+
self.hparams["max_position_embeddings"] -= self._position_offset
|
2685
|
+
else:
|
2686
|
+
self._position_offset = None
|
2687
|
+
|
2688
|
+
def set_vocab(self):
|
2689
|
+
# to avoid TypeError: Descriptors cannot be created directly
|
2690
|
+
# exception when importing sentencepiece_model_pb2
|
2691
|
+
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
2692
|
+
from sentencepiece import SentencePieceProcessor
|
2693
|
+
from sentencepiece import sentencepiece_model_pb2 as model
|
2694
|
+
|
2695
|
+
tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
|
2696
|
+
if not tokenizer_path.is_file():
|
2697
|
+
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
2698
|
+
|
2699
|
+
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
2700
|
+
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
2701
|
+
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
2702
|
+
|
2703
|
+
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
2704
|
+
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
|
2705
|
+
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
|
2706
|
+
|
2707
|
+
tokenizer = SentencePieceProcessor()
|
2708
|
+
tokenizer.LoadFromFile(str(tokenizer_path))
|
2709
|
+
|
2710
|
+
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
2711
|
+
|
2712
|
+
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
2713
|
+
scores: list[float] = [-10000.0] * vocab_size
|
2714
|
+
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
2715
|
+
|
2716
|
+
for token_id in range(tokenizer.vocab_size()):
|
2717
|
+
piece = tokenizer.IdToPiece(token_id)
|
2718
|
+
text = piece.encode("utf-8")
|
2719
|
+
score = tokenizer.GetScore(token_id)
|
2720
|
+
|
2721
|
+
toktype = SentencePieceTokenTypes.NORMAL
|
2722
|
+
if tokenizer.IsUnknown(token_id):
|
2723
|
+
toktype = SentencePieceTokenTypes.UNKNOWN
|
2724
|
+
elif tokenizer.IsControl(token_id):
|
2725
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
2726
|
+
elif tokenizer.IsUnused(token_id):
|
2727
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
2728
|
+
elif tokenizer.IsByte(token_id):
|
2729
|
+
toktype = SentencePieceTokenTypes.BYTE
|
2730
|
+
|
2731
|
+
tokens[token_id] = text
|
2732
|
+
scores[token_id] = score
|
2733
|
+
toktypes[token_id] = toktype
|
2734
|
+
|
2735
|
+
if vocab_size > len(tokens):
|
2736
|
+
pad_count = vocab_size - len(tokens)
|
2737
|
+
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
2738
|
+
for i in range(1, pad_count + 1):
|
2739
|
+
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
2740
|
+
scores.append(-1000.0)
|
2741
|
+
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
2742
|
+
|
2743
|
+
# realign tokens (see HF tokenizer code)
|
2744
|
+
tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
|
2745
|
+
scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
|
2746
|
+
toktypes = [
|
2747
|
+
SentencePieceTokenTypes.CONTROL,
|
2748
|
+
SentencePieceTokenTypes.CONTROL,
|
2749
|
+
SentencePieceTokenTypes.CONTROL,
|
2750
|
+
SentencePieceTokenTypes.UNKNOWN,
|
2751
|
+
] + toktypes[3:-1]
|
2752
|
+
|
2753
|
+
self.gguf_writer.add_tokenizer_model("t5")
|
2754
|
+
self.gguf_writer.add_tokenizer_pre("default")
|
2755
|
+
self.gguf_writer.add_token_list(tokens)
|
2756
|
+
self.gguf_writer.add_token_scores(scores)
|
2757
|
+
self.gguf_writer.add_token_types(toktypes)
|
2758
|
+
self.gguf_writer.add_add_space_prefix(add_prefix)
|
2759
|
+
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
2760
|
+
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
|
2761
|
+
if precompiled_charsmap:
|
2762
|
+
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
|
2763
|
+
|
2764
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
2765
|
+
special_vocab.add_to_gguf(self.gguf_writer)
|
2766
|
+
|
2767
|
+
self.gguf_writer.add_add_bos_token(True)
|
2768
|
+
self.gguf_writer.add_add_eos_token(True)
|
2769
|
+
|
2770
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2771
|
+
# if name starts with "roberta.", remove the prefix
|
2772
|
+
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
|
2773
|
+
if name.startswith("roberta."):
|
2774
|
+
name = name[8:]
|
2775
|
+
|
2776
|
+
# position embeddings start at pad_token_id + 1, so just chop down the weight tensor
|
2777
|
+
if name == "embeddings.position_embeddings.weight":
|
2778
|
+
if self._position_offset is not None:
|
2779
|
+
data_torch = data_torch[self._position_offset:,:]
|
2780
|
+
|
2781
|
+
return super().modify_tensors(data_torch, name, bid)
|
2782
|
+
|
2783
|
+
|
2508
2784
|
@Model.register("GemmaForCausalLM")
|
2509
2785
|
class GemmaModel(Model):
|
2510
2786
|
model_arch = gguf.MODEL_ARCH.GEMMA
|
@@ -2608,7 +2884,90 @@ class StarCoder2Model(Model):
|
|
2608
2884
|
model_arch = gguf.MODEL_ARCH.STARCODER2
|
2609
2885
|
|
2610
2886
|
|
2611
|
-
@Model.register("
|
2887
|
+
@Model.register("Rwkv6ForCausalLM")
|
2888
|
+
class Rwkv6Model(Model):
|
2889
|
+
model_arch = gguf.MODEL_ARCH.RWKV6
|
2890
|
+
|
2891
|
+
def set_vocab(self):
|
2892
|
+
assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
|
2893
|
+
vocab_size = self.hparams.get("vocab_size", 65536)
|
2894
|
+
|
2895
|
+
tokens: list[bytes] = ['<s>'.encode("utf-8")]
|
2896
|
+
toktypes: list[int] = [gguf.TokenType.CONTROL]
|
2897
|
+
|
2898
|
+
with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
|
2899
|
+
lines = f.readlines()
|
2900
|
+
for line in lines:
|
2901
|
+
parts = line.split(' ')
|
2902
|
+
assert len(parts) >= 3
|
2903
|
+
token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
|
2904
|
+
token = token.encode("utf-8") if isinstance(token, str) else token
|
2905
|
+
assert isinstance(token, bytes)
|
2906
|
+
assert len(token) == token_len
|
2907
|
+
token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
|
2908
|
+
tokens.append(token_text.encode("utf-8"))
|
2909
|
+
toktypes.append(gguf.TokenType.NORMAL)
|
2910
|
+
remainder = vocab_size - len(tokens)
|
2911
|
+
assert remainder >= 0
|
2912
|
+
for i in range(len(tokens), vocab_size):
|
2913
|
+
tokens.append(f"[PAD{i}]".encode("utf-8"))
|
2914
|
+
toktypes.append(gguf.TokenType.UNUSED)
|
2915
|
+
|
2916
|
+
self.gguf_writer.add_tokenizer_model("rwkv")
|
2917
|
+
self.gguf_writer.add_token_list(tokens)
|
2918
|
+
self.gguf_writer.add_token_types(toktypes)
|
2919
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
|
2920
|
+
special_vocab.chat_template = "rwkv-world"
|
2921
|
+
# hack: Add '\n\n' as the EOT token to make it chat normally
|
2922
|
+
special_vocab._set_special_token("eot", 261)
|
2923
|
+
special_vocab.add_to_gguf(self.gguf_writer)
|
2924
|
+
|
2925
|
+
def set_gguf_parameters(self):
|
2926
|
+
block_count = self.hparams["num_hidden_layers"]
|
2927
|
+
head_size = self.hparams["head_size"]
|
2928
|
+
hidden_size = self.hparams["hidden_size"]
|
2929
|
+
layer_norm_eps = self.hparams["layer_norm_epsilon"]
|
2930
|
+
rescale_every_n_layers = self.hparams["rescale_every"]
|
2931
|
+
intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
|
2932
|
+
time_mix_extra_dim = 64 if hidden_size == 4096 else 32
|
2933
|
+
time_decay_extra_dim = 128 if hidden_size == 4096 else 64
|
2934
|
+
|
2935
|
+
# RWKV isn't context limited
|
2936
|
+
self.gguf_writer.add_context_length(1048576)
|
2937
|
+
self.gguf_writer.add_embedding_length(hidden_size)
|
2938
|
+
self.gguf_writer.add_block_count(block_count)
|
2939
|
+
self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
|
2940
|
+
self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
|
2941
|
+
self.gguf_writer.add_wkv_head_size(head_size)
|
2942
|
+
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
|
2943
|
+
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
|
2944
|
+
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
2945
|
+
self.gguf_writer.add_file_type(self.ftype)
|
2946
|
+
|
2947
|
+
# required by llama.cpp, unused
|
2948
|
+
self.gguf_writer.add_head_count(0)
|
2949
|
+
|
2950
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2951
|
+
new_name = self.map_tensor_name(name)
|
2952
|
+
|
2953
|
+
if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
|
2954
|
+
new_name += ".weight"
|
2955
|
+
|
2956
|
+
if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
|
2957
|
+
data_torch = data_torch.transpose(0, 1)
|
2958
|
+
|
2959
|
+
if new_name.endswith("time_mix_w2.weight"):
|
2960
|
+
data_torch = data_torch.permute(0, 2, 1)
|
2961
|
+
|
2962
|
+
rescale_every_n_layers = self.hparams["rescale_every"]
|
2963
|
+
if rescale_every_n_layers > 0:
|
2964
|
+
if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
|
2965
|
+
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
|
2966
|
+
|
2967
|
+
yield (new_name, data_torch)
|
2968
|
+
|
2969
|
+
|
2970
|
+
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
|
2612
2971
|
class MambaModel(Model):
|
2613
2972
|
model_arch = gguf.MODEL_ARCH.MAMBA
|
2614
2973
|
|
@@ -2639,7 +2998,10 @@ class MambaModel(Model):
|
|
2639
2998
|
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
|
2640
2999
|
dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
|
2641
3000
|
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
|
2642
|
-
|
3001
|
+
use_dt_b_c_norm = False
|
3002
|
+
# For falconmamba we do apply RMS norm on B / DT and C layers
|
3003
|
+
if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
|
3004
|
+
use_dt_b_c_norm = True
|
2643
3005
|
# Fail early for models which don't have a block expansion factor of 2
|
2644
3006
|
assert d_inner == 2 * d_model
|
2645
3007
|
|
@@ -2647,12 +3009,13 @@ class MambaModel(Model):
|
|
2647
3009
|
self.gguf_writer.add_embedding_length(d_model)
|
2648
3010
|
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
2649
3011
|
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
|
2650
|
-
self.gguf_writer.add_block_count(self.
|
3012
|
+
self.gguf_writer.add_block_count(self.block_count)
|
2651
3013
|
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
2652
3014
|
self.gguf_writer.add_ssm_inner_size(d_inner)
|
2653
3015
|
self.gguf_writer.add_ssm_state_size(d_state)
|
2654
3016
|
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
|
2655
3017
|
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
3018
|
+
self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
|
2656
3019
|
self.gguf_writer.add_file_type(self.ftype)
|
2657
3020
|
|
2658
3021
|
_tok_embd = None
|
@@ -2679,19 +3042,6 @@ class MambaModel(Model):
|
|
2679
3042
|
|
2680
3043
|
return [(new_name, data_torch)]
|
2681
3044
|
|
2682
|
-
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
|
2683
|
-
del n_dims # unused
|
2684
|
-
|
2685
|
-
return bid is not None and new_name in (
|
2686
|
-
self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
|
2687
|
-
gguf.MODEL_TENSOR.SSM_CONV1D,
|
2688
|
-
gguf.MODEL_TENSOR.SSM_X,
|
2689
|
-
gguf.MODEL_TENSOR.SSM_DT,
|
2690
|
-
gguf.MODEL_TENSOR.SSM_A,
|
2691
|
-
gguf.MODEL_TENSOR.SSM_D,
|
2692
|
-
]
|
2693
|
-
)
|
2694
|
-
|
2695
3045
|
|
2696
3046
|
@Model.register("CohereForCausalLM")
|
2697
3047
|
class CommandR2Model(Model):
|
@@ -2739,9 +3089,74 @@ class OlmoModel(Model):
|
|
2739
3089
|
return [(self.map_tensor_name(name), data_torch)]
|
2740
3090
|
|
2741
3091
|
|
2742
|
-
@Model.register("
|
2743
|
-
class
|
2744
|
-
model_arch = gguf.MODEL_ARCH.
|
3092
|
+
@Model.register("Olmo2ForCausalLM")
|
3093
|
+
class Olmo2Model(Model):
|
3094
|
+
model_arch = gguf.MODEL_ARCH.OLMO2
|
3095
|
+
|
3096
|
+
|
3097
|
+
@Model.register("OlmoeForCausalLM")
|
3098
|
+
class OlmoeModel(Model):
|
3099
|
+
model_arch = gguf.MODEL_ARCH.OLMOE
|
3100
|
+
|
3101
|
+
def set_gguf_parameters(self):
|
3102
|
+
super().set_gguf_parameters()
|
3103
|
+
self.gguf_writer.add_layer_norm_rms_eps(1e-5)
|
3104
|
+
if (n_experts := self.hparams.get("num_experts")) is not None:
|
3105
|
+
self.gguf_writer.add_expert_count(n_experts)
|
3106
|
+
|
3107
|
+
_experts: list[dict[str, Tensor]] | None = None
|
3108
|
+
|
3109
|
+
# Copied from: Qwen2MoeModel
|
3110
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3111
|
+
# process the experts separately
|
3112
|
+
if name.find("experts") != -1:
|
3113
|
+
n_experts = self.hparams["num_experts"]
|
3114
|
+
assert bid is not None
|
3115
|
+
|
3116
|
+
if self._experts is None:
|
3117
|
+
self._experts = [{} for _ in range(self.block_count)]
|
3118
|
+
|
3119
|
+
self._experts[bid][name] = data_torch
|
3120
|
+
|
3121
|
+
if len(self._experts[bid]) >= n_experts * 3:
|
3122
|
+
tensors: list[tuple[str, Tensor]] = []
|
3123
|
+
|
3124
|
+
# merge the experts into a single 3d tensor
|
3125
|
+
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
3126
|
+
datas: list[Tensor] = []
|
3127
|
+
|
3128
|
+
for xid in range(n_experts):
|
3129
|
+
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
3130
|
+
datas.append(self._experts[bid][ename])
|
3131
|
+
del self._experts[bid][ename]
|
3132
|
+
|
3133
|
+
data_torch = torch.stack(datas, dim=0)
|
3134
|
+
|
3135
|
+
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
3136
|
+
|
3137
|
+
new_name = self.map_tensor_name(merged_name)
|
3138
|
+
|
3139
|
+
tensors.append((new_name, data_torch))
|
3140
|
+
return tensors
|
3141
|
+
else:
|
3142
|
+
return []
|
3143
|
+
|
3144
|
+
return [(self.map_tensor_name(name), data_torch)]
|
3145
|
+
|
3146
|
+
# Copied from: Qwen2MoeModel
|
3147
|
+
def prepare_tensors(self):
|
3148
|
+
super().prepare_tensors()
|
3149
|
+
|
3150
|
+
if self._experts is not None:
|
3151
|
+
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
3152
|
+
experts = [k for d in self._experts for k in d.keys()]
|
3153
|
+
if len(experts) > 0:
|
3154
|
+
raise ValueError(f"Unprocessed experts: {experts}")
|
3155
|
+
|
3156
|
+
|
3157
|
+
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
|
3158
|
+
class JinaBertV2Model(BertModel):
|
3159
|
+
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
2745
3160
|
|
2746
3161
|
def __init__(self, *args, **kwargs):
|
2747
3162
|
super().__init__(*args, **kwargs)
|
@@ -2777,6 +3192,14 @@ class JinaBertV2Model(BertModel):
|
|
2777
3192
|
self.gguf_writer.add_add_bos_token(True)
|
2778
3193
|
self.gguf_writer.add_add_eos_token(True)
|
2779
3194
|
|
3195
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3196
|
+
# if name starts with "bert.", remove the prefix
|
3197
|
+
# e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
3198
|
+
if name.startswith("bert."):
|
3199
|
+
name = name[5:]
|
3200
|
+
|
3201
|
+
return super().modify_tensors(data_torch, name, bid)
|
3202
|
+
|
2780
3203
|
|
2781
3204
|
@Model.register("OpenELMForCausalLM")
|
2782
3205
|
class OpenELMModel(Model):
|
@@ -3226,6 +3649,145 @@ class T5Model(Model):
|
|
3226
3649
|
return [(self.map_tensor_name(name), data_torch)]
|
3227
3650
|
|
3228
3651
|
|
3652
|
+
@Model.register("T5EncoderModel")
|
3653
|
+
class T5EncoderModel(Model):
|
3654
|
+
model_arch = gguf.MODEL_ARCH.T5ENCODER
|
3655
|
+
|
3656
|
+
def __init__(self, *args, **kwargs):
|
3657
|
+
super().__init__(*args, **kwargs)
|
3658
|
+
self.shared_token_embeddings_found = False
|
3659
|
+
|
3660
|
+
def set_vocab(self):
|
3661
|
+
# to avoid TypeError: Descriptors cannot be created directly
|
3662
|
+
# exception when importing sentencepiece_model_pb2
|
3663
|
+
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
3664
|
+
from sentencepiece import SentencePieceProcessor
|
3665
|
+
from sentencepiece import sentencepiece_model_pb2 as model
|
3666
|
+
|
3667
|
+
tokenizer_path = self.dir_model / 'tokenizer.model'
|
3668
|
+
|
3669
|
+
# many older models use spiece.model tokenizer model filename
|
3670
|
+
if not tokenizer_path.is_file():
|
3671
|
+
tokenizer_path = self.dir_model / 'spiece.model'
|
3672
|
+
|
3673
|
+
if not tokenizer_path.is_file():
|
3674
|
+
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
3675
|
+
|
3676
|
+
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
3677
|
+
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
3678
|
+
|
3679
|
+
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
|
3680
|
+
if sentencepiece_model.trainer_spec.model_type == 2: # BPE
|
3681
|
+
# assure the tokenizer model file name is correct
|
3682
|
+
assert tokenizer_path.name == 'tokenizer.model'
|
3683
|
+
return self._set_vocab_sentencepiece()
|
3684
|
+
else:
|
3685
|
+
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
3686
|
+
|
3687
|
+
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
3688
|
+
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
|
3689
|
+
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
|
3690
|
+
|
3691
|
+
tokenizer = SentencePieceProcessor()
|
3692
|
+
tokenizer.LoadFromFile(str(tokenizer_path))
|
3693
|
+
|
3694
|
+
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
3695
|
+
|
3696
|
+
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
3697
|
+
scores: list[float] = [-10000.0] * vocab_size
|
3698
|
+
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
3699
|
+
|
3700
|
+
for token_id in range(tokenizer.vocab_size()):
|
3701
|
+
piece = tokenizer.IdToPiece(token_id)
|
3702
|
+
text = piece.encode("utf-8")
|
3703
|
+
score = tokenizer.GetScore(token_id)
|
3704
|
+
|
3705
|
+
toktype = SentencePieceTokenTypes.NORMAL
|
3706
|
+
if tokenizer.IsUnknown(token_id):
|
3707
|
+
toktype = SentencePieceTokenTypes.UNKNOWN
|
3708
|
+
elif tokenizer.IsControl(token_id):
|
3709
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
3710
|
+
elif tokenizer.IsUnused(token_id):
|
3711
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
3712
|
+
elif tokenizer.IsByte(token_id):
|
3713
|
+
toktype = SentencePieceTokenTypes.BYTE
|
3714
|
+
|
3715
|
+
tokens[token_id] = text
|
3716
|
+
scores[token_id] = score
|
3717
|
+
toktypes[token_id] = toktype
|
3718
|
+
|
3719
|
+
added_tokens_file = self.dir_model / 'added_tokens.json'
|
3720
|
+
if added_tokens_file.is_file():
|
3721
|
+
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
3722
|
+
added_tokens_json = json.load(f)
|
3723
|
+
for key in added_tokens_json:
|
3724
|
+
token_id = added_tokens_json[key]
|
3725
|
+
if token_id >= vocab_size:
|
3726
|
+
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
3727
|
+
continue
|
3728
|
+
|
3729
|
+
tokens[token_id] = key.encode("utf-8")
|
3730
|
+
scores[token_id] = -1000.0
|
3731
|
+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
3732
|
+
|
3733
|
+
if vocab_size > len(tokens):
|
3734
|
+
pad_count = vocab_size - len(tokens)
|
3735
|
+
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
3736
|
+
for i in range(1, pad_count + 1):
|
3737
|
+
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
3738
|
+
scores.append(-1000.0)
|
3739
|
+
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
3740
|
+
|
3741
|
+
self.gguf_writer.add_tokenizer_model("t5")
|
3742
|
+
self.gguf_writer.add_tokenizer_pre("default")
|
3743
|
+
self.gguf_writer.add_token_list(tokens)
|
3744
|
+
self.gguf_writer.add_token_scores(scores)
|
3745
|
+
self.gguf_writer.add_token_types(toktypes)
|
3746
|
+
self.gguf_writer.add_add_space_prefix(add_prefix)
|
3747
|
+
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
|
3748
|
+
if precompiled_charsmap:
|
3749
|
+
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
|
3750
|
+
|
3751
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
3752
|
+
special_vocab.add_to_gguf(self.gguf_writer)
|
3753
|
+
|
3754
|
+
self.gguf_writer.add_add_bos_token(False)
|
3755
|
+
self.gguf_writer.add_add_eos_token(True)
|
3756
|
+
|
3757
|
+
def set_gguf_parameters(self):
|
3758
|
+
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
3759
|
+
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
3760
|
+
n_ctx = 512
|
3761
|
+
self.gguf_writer.add_context_length(n_ctx)
|
3762
|
+
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
3763
|
+
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
|
3764
|
+
self.gguf_writer.add_block_count(self.hparams["num_layers"])
|
3765
|
+
self.gguf_writer.add_head_count(self.hparams["num_heads"])
|
3766
|
+
self.gguf_writer.add_key_length(self.hparams["d_kv"])
|
3767
|
+
self.gguf_writer.add_value_length(self.hparams["d_kv"])
|
3768
|
+
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
3769
|
+
self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
|
3770
|
+
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
3771
|
+
self.gguf_writer.add_file_type(self.ftype)
|
3772
|
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+        # and decoder and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
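The modify_tensors() above deduplicates the shared T5 token-embedding tensor, which may appear under up to three names in a checkpoint. A self-contained sketch of the same first-wins logic:

```python
# Illustrative sketch; the tensor names in the example call are stand-ins for a real checkpoint.
ALIASES = {"decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"}

def dedupe(names: list[str]) -> list[str]:
    kept, seen_shared = [], False
    for name in names:
        if name in ALIASES:
            if seen_shared:
                continue  # later copies of the shared embedding are dropped
            name, seen_shared = "shared.weight", True
        kept.append(name)
    return kept

print(dedupe(["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]))
# ['shared.weight', 'lm_head.weight']
```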
 @Model.register("JAISLMHeadModel")
 class JaisModel(Model):
     model_arch = gguf.MODEL_ARCH.JAIS
@@ -3240,10 +3802,7 @@ class JaisModel(Model):
 
         # Embeddings scale
         self.embeddings_scale = 1.0
-        # note: For some JAIS flavors, output is tied to (same as) wte in original model
-        self.output_is_wte = False
         if 'mup_embeddings_scale' in self.hparams:
-            self.output_is_wte = True # Hack (?)
             self.embeddings_scale = self.hparams['mup_embeddings_scale']
         elif 'embeddings_scale' in self.hparams:
             self.embeddings_scale = self.hparams['embeddings_scale']
@@ -3300,10 +3859,7 @@ class JaisModel(Model):
 
         if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
             tensors.append((new_name, data_torch * self.embeddings_scale))
-            if self.output_is_wte:
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
         elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
-            assert not self.output_is_wte
             tensors.append((new_name, data_torch * self.width_scale))
         else:
             tensors.append((new_name, data_torch))
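With output_is_wte removed, the JAIS conversion no longer duplicates the word embeddings into the output tensor; it only applies the two scale factors visible in the surrounding context lines. A toy sketch of that scaling (the scale values are hypothetical):

```python
import torch

# Illustrative sketch; embeddings_scale and width_scale are hypothetical values.
embeddings_scale, width_scale = 2.0, 0.5
token_embd = torch.ones(4, 3)   # stand-in for the token-embedding tensor
output_proj = torch.ones(3, 4)  # stand-in for the output tensor

print((token_embd * embeddings_scale)[0, 0].item())  # 2.0
print((output_proj * width_scale)[0, 0].item())      # 0.5
```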
|
@@ -3497,8 +4053,218 @@ class ChatGLMModel(Model):
|
|
3497
4053
|
name = name.removeprefix("transformer.")
|
3498
4054
|
return [(self.map_tensor_name(name), data_torch)]
|
3499
4055
|
|
3500
|
-
###### CONVERSION LOGIC ######
|
3501
4056
|
+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        # model.layers.{l}.input_layernorm.weight
+        # model.layers.{l}.post_attention_layernorm.weight
+        # model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
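NemotronModel.modify_tensors() folds "layernorm1p" into the stored weights by adding 1, so the runtime can keep using a plain LayerNorm. A quick check of that identity (weights and activations are hypothetical):

```python
import torch

# Illustrative sketch; the weight and activation values are hypothetical.
weight = torch.tensor([-0.1, 0.0, 0.2])   # layernorm1p weights as stored by the original model
folded = weight + 1                        # what the converter writes out instead

x_norm = torch.tensor([1.0, 2.0, 3.0])     # already-normalized activations
assert torch.allclose((1 + weight) * x_norm, folded * x_norm)
print(folded)  # tensor([0.9000, 1.0000, 1.2000])
```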
+@Model.register("ExaoneForCausalLM")
+class ExaoneModel(Model):
+    model_arch = gguf.MODEL_ARCH.EXAONE
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        assert (hparams["activation_function"] == "silu")
+
+        max_position_embeddings = hparams["max_position_embeddings"]
+        embed_dim = hparams["hidden_size"]
+        num_heads = hparams["num_attention_heads"]
+        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
+        layer_norm_eps = hparams["layer_norm_epsilon"]
+        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+        num_layers = hparams["num_layers"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct attention_dropout is 0.0
+        # attention_dropout_rate = hparams["attention_dropout"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
+        # embed_dropout_rate = hparams["embed_dropout"]
+        self.gguf_writer.add_embedding_length(embed_dim)
+        self.gguf_writer.add_head_count(num_heads)
+        self.gguf_writer.add_head_count_kv(num_kv_heads)
+        self.gguf_writer.add_context_length(max_position_embeddings)
+        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_block_count(num_layers)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
+            if hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
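ExaoneModel.generate_extra_tensors() emits llama3-style RoPE frequency factors: high-frequency dimensions stay unscaled, low-frequency ones get the full factor, and the band in between is interpolated. The same formula evaluated for a few hypothetical wavelengths:

```python
# Illustrative sketch; the parameters and wavelengths below are hypothetical defaults.
factor, low_freq_factor, high_freq_factor, old_context_len = 8.0, 1.0, 4.0, 8192
low_freq_wavelen = old_context_len / low_freq_factor    # 8192.0
high_freq_wavelen = old_context_len / high_freq_factor  # 2048.0

def rope_factor(wavelen: float) -> float:
    if wavelen < high_freq_wavelen:   # high-frequency dims: no scaling
        return 1.0
    if wavelen > low_freq_wavelen:    # low-frequency dims: full scaling
        return factor
    smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    return 1 / ((1 - smooth) / factor + smooth)

for wavelen in (100.0, 4096.0, 20000.0):
    print(wavelen, round(rope_factor(wavelen), 3))  # 1.0, 2.4, 8.0 respectively
```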
+@Model.register("GraniteForCausalLM")
+class GraniteModel(LlamaModel):
+    """Conversion for IBM's GraniteForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def set_gguf_parameters(self):
+        """Granite uses standard llama parameters with the following differences:
+
+        - No head_dim support
+        - New multiplier params:
+            - attention_scale
+            - embedding_scale
+            - residual_scale
+            - logits_scaling
+        """
+        if head_dim := self.hparams.pop("head_dim", None):
+            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+        super().set_gguf_parameters()
+        # NOTE: Convert _multiplier params to _scale params for naming
+        # consistency
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
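GraniteModel.set_gguf_parameters() renames the *_multiplier / logits_scaling config entries to the writer's *_scale methods. A sketch of that mapping, with hypothetical config values:

```python
# Illustrative sketch; the config values are hypothetical.
hparams = {
    "attention_multiplier": 0.0078125,
    "embedding_multiplier": 12.0,
    "residual_multiplier": 0.22,
    "logits_scaling": 8.0,
}
writer_calls = {
    "attention_multiplier": "add_attention_scale",
    "embedding_multiplier": "add_embedding_scale",
    "residual_multiplier": "add_residual_scale",
    "logits_scaling": "add_logit_scale",
}
for key, value in hparams.items():
    print(f"gguf_writer.{writer_calls[key]}({value})")
```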
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
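GraniteMoeModel.modify_tensors() splits the merged JetMoe-style input_linear tensor back into separate gate (w1) and up (w3) expert tensors along the second-to-last axis. A shape-level sketch with hypothetical sizes:

```python
import torch

# Illustrative sketch; the expert/ffn/hidden sizes are hypothetical.
n_experts, ffn_dim, hidden = 4, 3, 5
merged = torch.arange(n_experts * 2 * ffn_dim * hidden, dtype=torch.float32)
merged = merged.reshape(n_experts, 2 * ffn_dim, hidden)   # [experts, 2*ffn, hidden]

gate, up = merged[..., :ffn_dim, :], merged[..., ffn_dim:, :]
print(gate.shape, up.shape)  # torch.Size([4, 3, 5]) torch.Size([4, 3, 5])
assert torch.equal(torch.cat([gate, up], dim=-2), merged)  # the split is lossless
```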
+@Model.register("ChameleonForConditionalGeneration")
+@Model.register("ChameleonForCausalLM") # obsolete
+class ChameleonModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHAMELEON
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # ignore image tokenizer for now
+        # TODO: remove this once image support is implemented for Chameleon
+        if name.startswith("model.vqmodel"):
+            return []
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        hidden_dim = self.hparams.get("hidden_size")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if name.endswith(("q_norm.weight", "q_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
+        if name.endswith(("k_norm.weight", "k_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
+    @staticmethod
+    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
+        head_dim = hidden_dim // n_heads
+        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
+        data_torch = data_torch.repeat_interleave(n_heads, 0)
+        return data_torch
+
+
+###### CONVERSION LOGIC ######
 
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
@@ -3578,8 +4344,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -3666,6 +4432,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
 
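The two new --outtype choices map onto the ternary GGUF file types added to ftype_map in main(), so an invocation along the lines of `python convert_hf_to_gguf.py <model_dir> --outtype tq1_0` would select MOSTLY_TQ1_0 (the positional model argument is assumed from the rest of the script, which is not shown here). A small sketch of the option-to-enum resolution, using strings as stand-ins for the gguf.LlamaFileType members:

```python
import argparse

# Illustrative sketch; strings stand in for the gguf.LlamaFileType enum members.
parser = argparse.ArgumentParser()
parser.add_argument("--outtype", type=str, default="f16",
                    choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"])
args = parser.parse_args(["--outtype", "tq2_0"])

ftype_map = {
    "f16": "MOSTLY_F16",
    "bf16": "MOSTLY_BF16",
    "q8_0": "MOSTLY_Q8_0",
    "tq1_0": "MOSTLY_TQ1_0",
    "tq2_0": "MOSTLY_TQ2_0",
    "auto": "GUESSED",
}
print(ftype_map[args.outtype])  # MOSTLY_TQ2_0
```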