bigdl-core-cpp 2.7.0b20250630__py3-none-manylinux2010_x86_64.whl → 2.7.0b20250703__py3-none-manylinux2010_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert_hf_to_gguf.py +1987 -558
- bigdl/cpp/convert_hf_to_gguf_update.py +131 -67
- bigdl/cpp/convert_lora_to_gguf.py +3 -3
- bigdl/cpp/gguf-py/gguf/constants.py +546 -16
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +57 -6
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +119 -7
- bigdl/cpp/gguf-py/gguf/lazy.py +10 -0
- bigdl/cpp/gguf-py/gguf/metadata.py +28 -8
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +461 -48
- bigdl/cpp/gguf-py/gguf/utility.py +195 -0
- bigdl/cpp/gguf-py/gguf/vocab.py +6 -1
- bigdl/cpp/libs/llama_cpp/libggml-base.so +0 -0
- bigdl/cpp/libs/llama_cpp/libggml-cpu.so +0 -0
- bigdl/cpp/libs/llama_cpp/libggml-sycl.so +0 -0
- bigdl/cpp/libs/llama_cpp/libggml.so +0 -0
- bigdl/cpp/libs/llama_cpp/libllama.so +0 -0
- bigdl/cpp/libs/llama_cpp/llama-batched +0 -0
- bigdl/cpp/libs/llama_cpp/llama-bench +0 -0
- bigdl/cpp/libs/llama_cpp/llama-cli +0 -0
- bigdl/cpp/libs/llama_cpp/llama-embedding +0 -0
- bigdl/cpp/libs/llama_cpp/llama-gemma3-cli +0 -0
- bigdl/cpp/libs/llama_cpp/llama-gguf +0 -0
- bigdl/cpp/libs/llama_cpp/llama-llava-cli +0 -0
- bigdl/cpp/libs/llama_cpp/llama-lookup +0 -0
- bigdl/cpp/libs/llama_cpp/llama-ls-sycl-device +0 -0
- bigdl/cpp/libs/llama_cpp/llama-minicpmv-cli +0 -0
- bigdl/cpp/libs/llama_cpp/llama-perplexity +0 -0
- bigdl/cpp/libs/llama_cpp/llama-quantize +0 -0
- bigdl/cpp/libs/llama_cpp/llama-server +0 -0
- bigdl/cpp/libs/llama_cpp/llama-simple +0 -0
- bigdl/cpp/libs/llama_cpp/llama-speculative +0 -0
- bigdl/cpp/libs/llama_cpp/llama-tokenize +0 -0
- bigdl/cpp/libs/ollama/libllama.so +0 -0
- bigdl/cpp/libs/ollama/ollama-lib +0 -0
- {bigdl_core_cpp-2.7.0b20250630.dist-info → bigdl_core_cpp-2.7.0b20250703.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.7.0b20250703.dist-info/RECORD +61 -0
- bigdl/cpp/libs/llama_cpp/libllava_shared.so +0 -0
- bigdl_core_cpp-2.7.0b20250630.dist-info/RECORD +0 -62
- {bigdl_core_cpp-2.7.0b20250630.data → bigdl_core_cpp-2.7.0b20250703.data}/scripts/init-llama-cpp +0 -0
- {bigdl_core_cpp-2.7.0b20250630.data → bigdl_core_cpp-2.7.0b20250703.data}/scripts/init-ollama +0 -0
- {bigdl_core_cpp-2.7.0b20250630.dist-info → bigdl_core_cpp-2.7.0b20250703.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.7.0b20250630.dist-info → bigdl_core_cpp-2.7.0b20250703.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_hf_to_gguf.py
CHANGED
@@ -16,6 +16,7 @@ from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
 from itertools import chain
+from transformers import AutoConfig

 import math
 import numpy as np
@@ -42,11 +43,19 @@ class SentencePieceTokenTypes(IntEnum):
     BYTE = 6


-
+class ModelType(IntEnum):
+    TEXT = 1
+    MMPROJ = 2


-
-
+AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
+
+
+class ModelBase:
+    _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
+        ModelType.TEXT: {},
+        ModelType.MMPROJ: {},
+    }

     dir_model: Path
     ftype: gguf.LlamaFileType
@@ -58,23 +67,28 @@ class Model:
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
-    block_count: int
-    tensor_map: gguf.TensorNameMap
     tensor_names: set[str] | None
     gguf_writer: gguf.GGUFWriter
     model_name: str | None
     metadata_override: Path | None
     dir_model_card: Path
+    remote_hf_model_id: str | None

     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-
+    # subclasses should initialize this!
+    block_count: int
+    tensor_map: gguf.TensorNameMap
+
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
-                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
-        if type(self) is
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
+        if type(self) is ModelBase or \
+                type(self) is TextModel or \
+                type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

         self.dir_model = dir_model
@@ -83,14 +97,25 @@ class Model:
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
-        self.lazy = not eager
-        self.
-
-
-
-
-
-
+        self.lazy = not eager or (remote_hf_model_id is not None)
+        self.remote_hf_model_id = remote_hf_model_id
+        if remote_hf_model_id is not None:
+            self.is_safetensors = True
+
+            def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
+                logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
+                remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
+                self.tensor_names = set(name for name in remote_tensors.keys())
+                for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items():
+                    yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))
+
+            self.get_tensors = get_remote_tensors
+        else:
+            self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors")
+            self.is_safetensors = len(self.part_names) > 0
+            if not self.is_safetensors:
+                self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
+        self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams
         self.tensor_names = None
         self.metadata_override = metadata_override
         self.model_name = model_name
@@ -112,11 +137,10 @@ class Model:
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

     @classmethod
-    def
-
-
-
-        raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
+    def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path:
+        stem, suffix = path.stem, path.suffix
+        new_name = f"{prefix}{stem}{suffix}"
+        return path.with_name(new_name)

     def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
         key = next((k for k in keys if k in self.hparams), None)
@@ -126,9 +150,6 @@ class Model:
             return None
         raise KeyError(f"could not find any of: {keys}")

-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()

@@ -180,7 +201,8 @@ class Model:
             extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
             missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
             if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}"
+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                 f"Missing tensors: {missing}")
             else:
                 raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                  f"Missing tensors: {missing}\n"
@@ -215,50 +237,7 @@ class Model:
         return new_name

     def set_gguf_parameters(self):
-
-
-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
-            self.gguf_writer.add_context_length(n_ctx)
-            logger.info(f"gguf: context length = {n_ctx}")
-
-        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
-            self.gguf_writer.add_embedding_length(n_embd)
-            logger.info(f"gguf: embedding length = {n_embd}")
-
-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
-            self.gguf_writer.add_feed_forward_length(n_ff)
-            logger.info(f"gguf: feed forward length = {n_ff}")
-
-        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
-            self.gguf_writer.add_head_count(n_head)
-            logger.info(f"gguf: head count = {n_head}")
-
-        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
-            self.gguf_writer.add_head_count_kv(n_head_kv)
-            logger.info(f"gguf: key-value head count = {n_head_kv}")
-
-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
-            self.gguf_writer.add_rope_freq_base(rope_theta)
-            logger.info(f"gguf: rope theta = {rope_theta}")
-        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
-            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
-            logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
-        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
-            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
-            logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
-        if (n_experts := self.hparams.get("num_local_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-            logger.info(f"gguf: expert count = {n_experts}")
-        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
-            logger.info(f"gguf: experts used count = {n_experts_used}")
-
-        if (head_dim := self.hparams.get("head_dim")) is not None:
-            self.gguf_writer.add_key_length(head_dim)
-            self.gguf_writer.add_value_length(head_dim)
-
-        self.gguf_writer.add_file_type(self.ftype)
-        logger.info(f"gguf: file type = {self.ftype}")
+        raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -329,6 +308,8 @@ class Model:
                             gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
                             gguf.MODEL_TENSOR.POSNET_NORM1,
                             gguf.MODEL_TENSOR.POSNET_NORM2,
+                            gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
+                            gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
                         )
                     )
                     or not new_name.endswith(".weight")
@@ -392,6 +373,10 @@ class Model:

         self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)

+        # If we are using HF model id, set the metadata name to the model id
+        if self.remote_hf_model_id:
+            self.metadata.name = self.remote_hf_model_id
+
         # Fallback to model directory name if metadata name is still missing
         if self.metadata.name is None:
             self.metadata.name = self.dir_model.name
@@ -400,27 +385,6 @@ class Model:
         if self.metadata.size_label is None and total_params > 0:
             self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)

-        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
-        output_type: str = self.ftype.name.partition("_")[2]
-
-        # Filename Output
-        if self.fname_out.is_dir():
-            # Generate default filename based on model specification and available metadata
-            if not vocab_only:
-                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
-            else:
-                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
-
-            # Use the default filename
-            self.fname_out = self.fname_out / f"{fname_default}.gguf"
-        else:
-            # Output path is a custom defined templated filename
-            # Note: `not is_dir()` is used because `.is_file()` will not detect
-            # file template strings as it doesn't actually exist as a file
-
-            # Process templated file name with the output ftype, useful with the "auto" ftype
-            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
-
         self.set_type()

         logger.info("Set meta model")
@@ -429,12 +393,12 @@ class Model:
         logger.info("Set model parameters")
         self.set_gguf_parameters()

-        logger.info("Set model tokenizer")
-        self.set_vocab()
-
         logger.info("Set model quantization version")
         self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

+    def write_vocab(self):
+        raise NotImplementedError("write_vocab() must be implemented in subclasses")
+
     def write(self):
         self.prepare_tensors()
         self.prepare_metadata(vocab_only=False)
@@ -443,15 +407,6 @@ class Model:
         self.gguf_writer.write_tensors_to_file(progress=True)
         self.gguf_writer.close()

-    def write_vocab(self):
-        if len(self.gguf_writer.tensors) != 1:
-            raise ValueError('Splitting the vocabulary is not supported')
-
-        self.prepare_metadata(vocab_only=True)
-        self.gguf_writer.write_header_to_file(path=self.fname_out)
-        self.gguf_writer.write_kv_data_to_file()
-        self.gguf_writer.close()
-
     @staticmethod
     def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
         part_names: list[str] = []
@@ -465,31 +420,160 @@ class Model:

     @staticmethod
     def load_hparams(dir_model: Path):
-
-
+        try:
+            # for security reason, we don't allow loading remote code by default
+            # if a model need remote code, we will fallback to config.json
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+        except Exception as e:
+            logger.warning(f"Failed to load model config from {dir_model}: {e}")
+            logger.warning("Trying to load config.json instead")
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                config = json.load(f)
+        if "llm_config" in config:
+            # rename for InternVL
+            config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
+        return config

     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
         assert names

         def func(modelcls: AnyModel) -> AnyModel:
+            model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
             for name in names:
-                cls._model_classes[name] = modelcls
+                cls._model_classes[model_type][name] = modelcls
             return modelcls
         return func

     @classmethod
     def print_registered_models(cls):
-        for
-            logger.error(f"
+        for model_type, model_classes in cls._model_classes.items():
+            logger.error(f"{model_type.name} models:")
+            for name in sorted(model_classes.keys()):
+                logger.error(f" - {name}")

     @classmethod
-    def from_model_architecture(cls, arch: str) -> type[
+    def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]:
         try:
-            return cls._model_classes[arch]
+            return cls._model_classes[model_type][arch]
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+
+class TextModel(ModelBase):
+    model_type = ModelType.TEXT
+    hf_arch: str
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hf_arch = get_model_architecture(self.hparams, self.model_type)
+
+        if "text_config" in self.hparams:
+            # move the text_config to the root level
+            self.hparams = {**self.hparams, **self.hparams["text_config"]}
+
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    @classmethod
+    def __init_subclass__(cls):
+        # can't use an abstract property, because overriding it without type errors
+        # would require using decorated functions instead of simply defining the property
+        if "model_arch" not in cls.__dict__:
+            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def prepare_metadata(self, vocab_only: bool):
+        super().prepare_metadata(vocab_only=vocab_only)
+
+        total_params = self.gguf_writer.get_total_parameter_count()[0]
+        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
+        output_type: str = self.ftype.name.partition("_")[2]
+
+        # Filename Output
+        if self.fname_out.is_dir():
+            # Generate default filename based on model specification and available metadata
+            if not vocab_only:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
+            else:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
+
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            # file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+
+        logger.info("Set model tokenizer")
+        self.set_vocab()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.block_count)
+
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
+            self.gguf_writer.add_context_length(n_ctx)
+            logger.info(f"gguf: context length = {n_ctx}")
+
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")
+
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
+            self.gguf_writer.add_feed_forward_length(n_ff)
+            logger.info(f"gguf: feed forward length = {n_ff}")
+
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")
+
+        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
+            self.gguf_writer.add_head_count_kv(n_head_kv)
+            logger.info(f"gguf: key-value head count = {n_head_kv}")
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+            logger.info(f"gguf: rope theta = {rope_theta}")
+        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
+            logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
+        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
+            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+            logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
+        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+            logger.info(f"gguf: expert count = {n_experts}")
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+            logger.info(f"gguf: experts used count = {n_experts_used}")
+
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
+            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
+            if self.hparams.get("model_type") != "deepseek_v3":
+                self.gguf_writer.add_key_length(head_dim)
+                self.gguf_writer.add_value_length(head_dim)
+
+        self.gguf_writer.add_file_type(self.ftype)
+        logger.info(f"gguf: file type = {self.ftype}")
+
+    def write_vocab(self):
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
+
+        self.prepare_metadata(vocab_only=True)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
+        self.gguf_writer.write_kv_data_to_file()
+        self.gguf_writer.close()
+
     def does_token_look_special(self, token: str | bytes) -> bool:
         if isinstance(token, (bytes, bytearray)):
             token_text = token.decode(encoding="utf-8")
@@ -528,6 +612,8 @@ class Model:
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
@@ -537,13 +623,13 @@ class Model:
                 if token in added_vocab:
                     # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
                     # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not
+                    if not added_tokens_decoder[i].normalized:
                         previous_token = token
                         token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
                         if previous_token != token:
                             logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

-                    if
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
                         # NOTE: this was added for Gemma.
@@ -591,12 +677,12 @@ class Model:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -648,9 +734,6 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -681,9 +764,6 @@ class Model:
         if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
             # ref: https://huggingface.co/facebook/chameleon-7b
             res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-            res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
             res = "roberta-bpe"
@@ -699,6 +779,39 @@ class Model:
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
+        if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
+            # ref: https://huggingface.co/Xenova/gpt-4o
+            res = "gpt-4o"
+        if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
+            # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
+            res = "superbpe"
+        if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
+            # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
+            res = "trillion"
+        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
+            # ref: https://huggingface.co/inclusionAI/Ling-lite
+            res = "bailingmoe"
+        if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
+            # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
+            res = "llama4"
+        if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
+            # ref: https://huggingface.co/mistral-community/pixtral-12b
+            res = "pixtral"
+        if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
+            # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
+            res = "seed-coder"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"

         if res is None:
             logger.warning("\n")
@@ -858,6 +971,9 @@ class Model:
                 for token_id, token_data in added_tokens_decoder.items():
                     token_id = int(token_id)
                     token: str = token_data["content"]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         if tokens[token_id] != token.encode("utf-8"):
                             logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
@@ -902,6 +1018,44 @@ class Model:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_rwkv_world(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
+        special_vocab._set_special_token("eot", 261)
+        # hack: Override these as they have already been set (incorrectly)
+        special_vocab.special_token_ids["bos"] = 0
+        special_vocab.special_token_ids["eos"] = 0
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
         tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
         logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
@@ -947,11 +1101,149 @@ class Model:
         if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
             self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])

+    def _try_set_pooling_type(self) -> None:
+        # get pooling path
+        pooling_path = None
+        module_path = self.dir_model / "modules.json"
+        if module_path.is_file():
+            with open(module_path, encoding="utf-8") as f:
+                modules = json.load(f)
+            for mod in modules:
+                if mod["type"] == "sentence_transformers.models.Pooling":
+                    pooling_path = mod["path"]
+                    break
+
+        # get pooling type
+        if pooling_path is not None:
+            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
+                pooling = json.load(f)
+            if pooling["pooling_mode_mean_tokens"]:
+                pooling_type = gguf.PoolingType.MEAN
+            elif pooling["pooling_mode_cls_token"]:
+                pooling_type = gguf.PoolingType.CLS
+            elif pooling["pooling_mode_lasttoken"]:
+                pooling_type = gguf.PoolingType.LAST
+            else:
+                raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
+            self.gguf_writer.add_pooling_type(pooling_type)
+
+
+class MmprojModel(ModelBase):
+    model_type = ModelType.MMPROJ
+    model_arch = gguf.MODEL_ARCH.MMPROJ
+    preprocessor_config: dict[str, Any]
+    global_config: dict[str, Any]
+
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+
+    has_vision_encoder: bool = True # by default
+    has_audio_encoder: bool = False
+
+    # for models having multiple encoders, we need to separate their hparams
+    hparams_vision: dict[str, Any] | None = None
+    hparams_audio: dict[str, Any] | None = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
+            raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
+
+        # get n_embd of the text model
+        if "text_config" not in self.hparams:
+            self.hparams["text_config"] = {}
+        if "audio_config" not in self.hparams:
+            self.hparams["audio_config"] = {}
+        text_config = {**self.hparams, **self.hparams["text_config"]}
+        self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
+        assert self.n_embd_text > 0, "n_embd not found in hparams"
+
+        # move vision config to the top level, while preserving the original hparams in global_config
+        import copy
+        self.global_config = copy.deepcopy(self.hparams)
+        self.hparams_vision = self.get_vision_config()
+        self.hparams_audio = self.get_audio_config()
+
+        if self.hparams_vision is None and self.hparams_audio is None:
+            raise ValueError("vision_config / audio_config not found in hparams")
+
+        # for compat with vision-only models
+        self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
+
+        # TODO @ngxson : this is a hack to support both vision and audio encoders
+        have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
+        self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
+
+        # load preprocessor config
+        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+            self.preprocessor_config = json.load(f)
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("vision_config")
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("audio_config")
+
+    def set_type(self):
+        self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if self.has_vision_encoder:
+            self.gguf_writer.add_clip_has_vision_encoder(True)
+            self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+
+            # vision config
+            self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
+            self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
+            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
+            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
+            self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
+            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
+
+            # preprocessor config
+            self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+            self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
+
+        if self.has_audio_encoder:
+            self.gguf_writer.add_clip_has_audio_encoder(True)
+            self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
+
+            # audio config
+            self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
+            self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
+            self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
+            self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
+
+        if not self.has_vision_encoder and not self.has_audio_encoder:
+            raise ValueError("MmprojModel must have either vision or audio encoder")
+
+    def write_vocab(self):
+        raise ValueError("MmprojModel does not support vocab writing")
+
+    def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        assert self.hparams_vision is not None
+        return self._find_param(self.hparams_vision, keys, optional)
+
+    def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        assert self.hparams_audio is not None
+        return self._find_param(self.hparams_audio, keys, optional)
+
+    def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in obj), None)
+        if key is not None:
+            return obj[key]
+        if optional:
+            return None
+        raise KeyError(f"could not find any of: {keys}")
+
+
+@ModelBase.register("GPTNeoXForCausalLM")
+class GPTNeoXModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GPTNEOX

-@Model.register("GPTNeoXForCausalLM")
-class GPTNeoXModel(Model):
-    model_arch = gguf.MODEL_ARCH.GPTNEOX
-
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]

@@ -1005,8 +1297,8 @@ class GPTNeoXModel(Model):
         return tensors


-@
-class BloomModel(
+@ModelBase.register("BloomForCausalLM", "BloomModel")
+class BloomModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BLOOM

     def set_gguf_parameters(self):
@@ -1059,18 +1351,11 @@ class BloomModel(Model):

             tensors.append((self.map_tensor_name(name), data_torch))

-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors


-@
-class MPTModel(
+@ModelBase.register("MPTForCausalLM")
+class MPTModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MPT

     def set_vocab(self):
@@ -1113,8 +1398,8 @@ class MPTModel(Model):
         return [(new_name, data_torch)]


-@
-class OrionModel(
+@ModelBase.register("OrionForCausalLM")
+class OrionModel(TextModel):
     model_arch = gguf.MODEL_ARCH.ORION

     def set_vocab(self):
@@ -1148,8 +1433,8 @@ class OrionModel(Model):
         self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])


-@
-class BaichuanModel(
+@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
+class BaichuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BAICHUAN

     def set_vocab(self):
@@ -1181,10 +1466,10 @@ class BaichuanModel(Model):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)

-
-
-
-
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         head_count = self.hparams["num_attention_heads"]
@@ -1228,8 +1513,8 @@ class BaichuanModel(Model):
             return weights[r * n_part:r * n_part + r, ...]


-@
-class XverseModel(
+@ModelBase.register("XverseForCausalLM")
+class XverseModel(TextModel):
     model_arch = gguf.MODEL_ARCH.XVERSE

     def set_vocab(self):
@@ -1305,10 +1590,10 @@ class XverseModel(Model):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)

-
-
-
-
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -1335,8 +1620,8 @@ class XverseModel(Model):
         )


-@
-class FalconModel(
+@ModelBase.register("FalconForCausalLM", "RWForCausalLM")
+class FalconModel(TextModel):
     model_arch = gguf.MODEL_ARCH.FALCON

     def set_gguf_parameters(self):
@@ -1389,8 +1674,8 @@ class FalconModel(Model):
         return [(self.map_tensor_name(name), data_torch)]


-@
-class StarCoderModel(
+@ModelBase.register("GPTBigCodeForCausalLM")
+class StarCoderModel(TextModel):
     model_arch = gguf.MODEL_ARCH.STARCODER

     def set_gguf_parameters(self):
@@ -1406,8 +1691,8 @@ class StarCoderModel(Model):
         self.gguf_writer.add_file_type(self.ftype)


-@
-class RefactModel(
+@ModelBase.register("GPTRefactForCausalLM")
+class RefactModel(TextModel):
     model_arch = gguf.MODEL_ARCH.REFACT

     def set_vocab(self):
@@ -1470,8 +1755,8 @@ class RefactModel(Model):
         return tensors


-@
-class StableLMModel(
+@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
+class StableLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.STABLELM

     def set_vocab(self):
@@ -1560,9 +1845,23 @@ class StableLMModel(Model):
                 raise ValueError(f"Unprocessed norms: {norms}")


-@
-
+@ModelBase.register(
+    "LLaMAForCausalLM",
+    "LlamaForCausalLM",
+    "MistralForCausalLM",
+    "MixtralForCausalLM",
+    "VLlama3ForCausalLM",
+    "LlavaForConditionalGeneration",
+    "LlamaModel")
+class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
+    undo_permute = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing `num_attention_heads` in config.json
+        if self.hf_arch == "VLlama3ForCausalLM":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

     def set_vocab(self):
         try:
@@ -1608,10 +1907,10 @@ class LlamaModel(Model):
         rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

-
-
-
-
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
@@ -1626,11 +1925,25 @@ class LlamaModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-
-
-
-
-
+        is_vision_tensor = "vision_tower" in name \
+            or "vision_model" in name \
+            or "model.connector" in name \
+            or "multi_modal_projector" in name
+
+        if is_vision_tensor:
+            return [] # skip vision tensors
+        elif self.hf_arch == "LlamaModel":
+            name = "model." + name
+        elif name.startswith("model.text_model"):
+            name = name.replace("text_model.", "") # for SmolVLM
+        elif name.startswith("language_model."):
+            name = name.replace("language_model.", "") # for the rest
+
+        if self.undo_permute:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
@@ -1682,7 +1995,7 @@ class LlamaModel(Model):

             low_freq_wavelen = old_context_len / low_freq_factor
             high_freq_wavelen = old_context_len / high_freq_factor
-            assert low_freq_wavelen != high_freq_wavelen
+            # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4

             rope_factors = []
             for freq in freqs:
@@ -1707,8 +2020,202 @@ class LlamaModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


-@
-class
+@ModelBase.register("ArceeForCausalLM")
+class ArceeModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.ARCEE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+
+@ModelBase.register(
+    "LlavaForConditionalGeneration", # pixtral
+    "Mistral3ForConditionalGeneration", # mistral small 3.1
+)
+class LlavaVisionModel(MmprojModel):
+    img_break_tok_id = -1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.hparams["model_type"] == "pixtral":
+            # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
+            self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
+            self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
+            logger.info(f"Image break token id: {self.img_break_tok_id}")
+        else:
+            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
+
+    def get_token_id(self, token: str) -> int:
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+            added_tokens_decoder = json.load(f)['added_tokens_decoder']
+            for id_, token_data in added_tokens_decoder.items():
+                if token_data["content"] == token:
+                    return int(id_)
+        raise ValueError(f"Token '{token}' not found in tokenizer config.")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if hparams["model_type"] == "pixtral":
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
+            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
+
+            # hidden_act
+            if hparams["hidden_act"] == "silu":
+                self.gguf_writer.add_vision_use_silu(True)
+            elif hparams["hidden_act"] == "gelu":
+                self.gguf_writer.add_vision_use_gelu(True)
+            else:
+                raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
+
+        # spatial_merge_size
+        if "spatial_merge_size" in self.global_config:
+            self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = n_head
+
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
+            # process vision tensors
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+            return [(self.map_tensor_name(name), data_torch)]
+
+        if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
+            logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
+            # for pixtral model, we need to extract the [IMG_BREAK] token embedding
+            img_break_embd = data_torch[self.img_break_tok_id]
+            name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
+            return [(self.map_tensor_name(name), img_break_embd)]
+
+        return [] # skip other tensors
+
+
+@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
+class SmolVLMModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.hparams["model_type"] == "smolvlm_vision":
+            # fix for SmolVLM2, missing some keys in config.json
+            # default values are taken from transformers code
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims # unused
+        if ".embeddings." in name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+
+        if is_vision_tensor:
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return [] # skip other tensors
+
+
+@ModelBase.register("Llama4ForConditionalGeneration")
+class Llama4Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA4
+    undo_permute = False
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
+        self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
+        self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+        self.gguf_writer.add_add_bos_token(True)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
+        # split the gate_up into gate and up
+        if "gate_up_proj" in name:
+            name_up = name.replace("gate_up_proj", "up_proj.weight")
+            name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+            dim_half = data_torch.shape[-1] // 2
+            gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
+            return [
+                (self.map_tensor_name(name_gate), gate_proj_weight),
+                (self.map_tensor_name(name_up), up_proj_weight)
+            ]
+
+        if name.endswith("down_proj"):
+            name += ".weight"
+            data_torch = data_torch.transpose(-1, -2)
+
+        if "multi_modal_projector" in name or "vision_model" in name:
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Llama4ForConditionalGeneration")
+class Llama4VisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
+        assert self.hparams["hidden_act"] == "gelu"
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        if "multi_modal_projector" in name or "vision_model" in name:
+            # process vision tensors
+            if "positional_embedding_vlm" in name and ".weight" not in name:
+                name += ".weight"
+            if "multi_modal_projector.linear_1" in name:
+                # despite the name with number postfix, this is a single fully connected layer
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
+            return [(self.map_tensor_name(name), data_torch)]
+        return []
+
+
+@ModelBase.register("Mistral3ForConditionalGeneration")
+class Mistral3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("language_model.", "")
+        if "multi_modal_projector" in name or "vision_tower" in name:
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("DeciLMForCausalLM")
+class DeciModel(TextModel):
     model_arch = gguf.MODEL_ARCH.DECI

     @staticmethod
@@ -1743,6 +2250,9 @@ class DeciModel(Model):
|
|
1743
2250
|
# if n_heads_in_group is not None, then
|
1744
2251
|
# _num_kv_heads[il] is num_attention_head // n_heads_in_group and
|
1745
2252
|
# _num_heads[il] is num_attention_head
|
2253
|
+
# ***dummy layer*** for nemotron 253B
|
2254
|
+
# if n_heads_in_group is None and ffn_mult is None
|
2255
|
+
# then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0
|
1746
2256
|
for il in range(len(_block_configs)):
|
1747
2257
|
if _block_configs[il]["attention"]["n_heads_in_group"] is None:
|
1748
2258
|
if _block_configs[il]["attention"]["replace_with_linear"] is True:
|
@@ -1754,7 +2264,10 @@ class DeciModel(Model):
|
|
1754
2264
|
else:
|
1755
2265
|
self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
|
1756
2266
|
self._num_heads.append(self.hparams["num_attention_heads"])
|
1757
|
-
|
2267
|
+
if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer
|
2268
|
+
_ffn_multipliers.append(0.0)
|
2269
|
+
else:
|
2270
|
+
_ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
|
1758
2271
|
assert self.block_count == len(self._num_kv_heads)
|
1759
2272
|
assert self.block_count == len(self._num_heads)
|
1760
2273
|
assert self.block_count == len(_ffn_multipliers)
|
@@ -1814,10 +2327,10 @@ class DeciModel(Model):
|
|
1814
2327
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
1815
2328
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
1816
2329
|
|
1817
|
-
|
1818
|
-
|
1819
|
-
|
1820
|
-
|
2330
|
+
rope_scaling = self.hparams.get("rope_scaling") or {}
|
2331
|
+
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
2332
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
2333
|
+
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
1821
2334
|
|
1822
2335
|
@staticmethod
|
1823
2336
|
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
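Illustrative sketch (separate from the diff, hypothetical helper name): the rope_scaling block introduced in the hunk above, and repeated for several models further down, follows one pattern: read the optional `rope_scaling` mapping from `config.json`, accept the newer `rope_type` key with the legacy `type` key as a fallback, and only emit scaling metadata when a `factor` is present.

```python
# illustrative only; read_linear_rope_scaling is not part of the converter
def read_linear_rope_scaling(hparams: dict) -> float | None:
    rope_scaling = hparams.get("rope_scaling") or {}
    # newer configs use "rope_type", older ones use "type"
    if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
        return float(rope_scaling["factor"])
    return None

print(read_linear_rope_scaling({"rope_scaling": {"type": "linear", "factor": 4.0}}))  # 4.0
print(read_linear_rope_scaling({}))                                                   # None
```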
@@ -1879,8 +2392,8 @@ class DeciModel(Model):
         super().prepare_tensors()


-@
-class BitnetModel(
+@ModelBase.register("BitnetForCausalLM")
+class BitnetModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BITNET

     def set_vocab(self):
@@ -1920,8 +2433,8 @@ class BitnetModel(Model):
             yield (new_name, data_torch)


-@
-class GrokModel(
+@ModelBase.register("GrokForCausalLM")
+class GrokModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GROK

     def set_vocab(self):
@@ -1973,8 +2486,8 @@ class GrokModel(Model):
         return [(self.map_tensor_name(name), data_torch)]


-@
-class DbrxModel(
+@ModelBase.register("DbrxForCausalLM")
+class DbrxModel(TextModel):
     model_arch = gguf.MODEL_ARCH.DBRX

     def set_gguf_parameters(self):
@@ -2042,8 +2555,8 @@ class DbrxModel(Model):
         return n_dims > 1


-@
-class MiniCPMModel(
+@ModelBase.register("MiniCPMForCausalLM")
+class MiniCPMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MINICPM

     def set_gguf_parameters(self):
@@ -2057,10 +2570,10 @@ class MiniCPMModel(Model):
         logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
         self.gguf_writer.add_logit_scale(logit_scale)
         logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
-
-
-
-
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+            logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")

     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
@@ -2097,8 +2610,8 @@ class MiniCPMModel(Model):
         return [(self.map_tensor_name(name), data_torch)]


-@
-class MiniCPM3Model(
+@ModelBase.register("MiniCPM3ForCausalLM")
+class MiniCPM3Model(TextModel):
     model_arch = gguf.MODEL_ARCH.MINICPM3

     def set_gguf_parameters(self):
@@ -2150,8 +2663,8 @@ class MiniCPM3Model(Model):
             )


-@
-class QwenModel(
+@ModelBase.register("QWenLMHeadModel")
+class QwenModel(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN

     @staticmethod
@@ -2192,8 +2705,8 @@ class QwenModel(Model):
         self.gguf_writer.add_file_type(self.ftype)


-@
-class Qwen2Model(
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
+class Qwen2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2

     def set_vocab(self):
@@ -2204,15 +2717,32 @@ class Qwen2Model(Model):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-
-
-
-
-
+        self._try_set_pooling_type()
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if self.hf_arch == "Qwen2Model":
+            name = f"model.{name}" # map to Qwen2ForCausalLM tensors
+        if "language_model." in name:
+            name = name.replace("language_model.", "") # for InternVL
+        if name.startswith("mlp") or name.startswith("multi_modal_projector") \
+                or name.startswith("vision_model") or name.startswith("audio_tower"):
+            # skip vision and audio tensors
+            return []
+        yield from super().modify_tensors(data_torch, name, bid)


-@
-
+@ModelBase.register(
+    "Qwen2VLModel",
+    "Qwen2VLForConditionalGeneration",
+    "Qwen2_5_VLForConditionalGeneration",
+    "Qwen2_5OmniModel",
+)
+class Qwen2VLModel(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2VL

     def set_gguf_parameters(self):
@@ -2227,15 +2757,217 @@ class Qwen2VLModel(Model):
         except FileNotFoundError:
             self._set_vocab_gpt2()

-    def
-
-
-
-
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        if name.startswith("thinker."):
+            name = name.replace("thinker.", "")
+        if name.startswith("visual") or name.startswith("audio") or \
+                name.startswith("talker") or name.startswith("token2wav"):
+            # skip multimodal tensors
+            return []
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+class Qwen2VLVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
+        # rename config.json values
+        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
+        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
+        if "embed_dim" in self.hparams_vision: # qwen2vl
+            self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
+            self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+        hparams = self.hparams_vision
+        model_type = self.global_config['model_type']
+        if model_type == 'qwen2_vl':
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
+        elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
+            if model_type == 'qwen2_5_omni':
+                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
+            else:
+                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
+            self.gguf_writer.add_vision_use_silu(True)
+            # find n_wa_pattern (window attention pattern)
+            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
+            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
+            n_wa_pattern = fullatt_block_indexes[0] + 1
+            # validate n_wa_pattern
+            for i in range(1, len(fullatt_block_indexes)):
+                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
+                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
+            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
+        else:
+            raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
+        # default values below are taken from HF tranformers code
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, name, n_dims # unused
+        if ".patch_embd." in new_name:
+            return gguf.GGMLQuantizationType.F16
+        if ".position_embd." in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        if name.startswith("visual."):
+            # process visual tensors
+            # split QKV tensors if needed
+            if ".qkv." in name:
+                if data_torch.ndim == 2: # weight
+                    c3, _ = data_torch.shape
+                else: # bias
+                    c3 = data_torch.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                return [
+                    (self.map_tensor_name(name.replace("qkv", "q")), wq),
+                    (self.map_tensor_name(name.replace("qkv", "k")), wk),
+                    (self.map_tensor_name(name.replace("qkv", "v")), wv),
+                ]
+            elif 'patch_embed.proj.weight' in name:
+                # split Conv3D into Conv2Ds
+                c1, c2, kt, kh, kw = data_torch.shape
+                del c1, c2, kh, kw # unused
+                assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
+                return [
+                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]),
+                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
+                ]
+            else:
+                return [(self.map_tensor_name(name), data_torch)]
+        return [] # skip other tensors
+
+
+@ModelBase.register("Qwen2_5OmniModel")
+class Qwen25OmniModel(Qwen2VLVisionModel):
+    has_vision_encoder = True
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_audio is not None
+        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
+        self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_audio is not None
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("vision_config")
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("audio_config")
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        # SinusoidsPositionEmbedding
+        assert self.hparams_audio is not None
+        max_timescale = 10000
+        length = 1500
+        channels = self.hparams_audio["hidden_size"]
+        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
+        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+        pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
+        yield ("audio_tower.embed_positions.weight", pos_embd)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims # unused
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F16
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("thinker."):
+            name = name.replace("thinker.", "")
+
+        if name.startswith("audio_tower"):
+            # process audio tensors
+            if "conv1.bias" in name or "conv2.bias" in name:
+                # transpose conv1 and conv2 bias
+                data_torch = data_torch.unsqueeze(-1)
+            if "audio_bos_eos_token" in name:
+                # this tensor is left unused in transformers code
+                # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
+                return []
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("InternVisionModel")
+class InternVisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
+        # hidden_act
+        if hparams["hidden_act"] == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+        elif hparams["hidden_act"] == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        else:
+            raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
+        # downsample_ratio
+        downsample_ratio = self.global_config.get("downsample_ratio")
+        assert downsample_ratio is not None
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, name, n_dims # unused
+        if ".patch_embd." in new_name:
+            return gguf.GGMLQuantizationType.F16
+        if ".position_embd." in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        if name.startswith("vision_model") or name.startswith("mlp"):
+            # process visual tensors
+            # correct name
+            if name.startswith("vision_model"):
+                name = "vision_tower." + name
+            if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
+                name += ".weight"
+            # split QKV tensors if needed
+            if ".qkv." in name:
+                if data_torch.ndim == 2: # weight
+                    c3, _ = data_torch.shape
+                else: # bias
+                    c3 = data_torch.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                return [
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq),
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk),
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv),
+                ]
+            return [(self.map_tensor_name(name), data_torch)]
+        return [] # skip other tensors


-@
-class WavTokenizerDecModel(
+@ModelBase.register("WavTokenizerDec")
+class WavTokenizerDecModel(TextModel):
     model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
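Illustrative sketch (assumed shapes, hypothetical function name): the vision converters in the hunk above split a fused QKV tensor by slicing its first dimension into three equal parts, which works the same way for 2-D weights and 1-D biases.

```python
import torch

def split_fused_qkv(qkv: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    # the first dimension holds q, k and v stacked together
    c3 = qkv.shape[0]
    assert c3 % 3 == 0
    c = c3 // 3
    return qkv[:c], qkv[c:c * 2], qkv[c * 2:]

q, k, v = split_fused_qkv(torch.randn(3 * 64, 128))   # each of shape (64, 128)
bq, bk, bv = split_fused_qkv(torch.randn(3 * 64))     # biases split the same way
```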
@@ -2272,8 +3004,8 @@ class WavTokenizerDecModel(Model):
         self.gguf_writer.add_causal_attention(False)


-@
-class Qwen2MoeModel(
+@ModelBase.register("Qwen2MoeForCausalLM")
+class Qwen2MoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2MOE

     def set_gguf_parameters(self):
@@ -2286,6 +3018,13 @@ class Qwen2MoeModel(Model):
         if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
             self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
             logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

     _experts: list[dict[str, Tensor]] | None = None

@@ -2335,18 +3074,18 @@ class Qwen2MoeModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")


-@
+@ModelBase.register("Qwen3ForCausalLM")
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3


-@
+@ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3MOE


-@
-class GPT2Model(
+@ModelBase.register("GPT2LMHeadModel")
+class GPT2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT2

     def set_gguf_parameters(self):
@@ -2374,15 +3113,11 @@ class GPT2Model(Model):

         tensors.append((new_name, data_torch))

-        # note: GPT2 output is tied to (same as) wte in original model
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors


-@
-class Phi2Model(
+@ModelBase.register("PhiForCausalLM")
+class Phi2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.PHI2

     def set_gguf_parameters(self):
@@ -2405,8 +3140,8 @@ class Phi2Model(Model):
         self.gguf_writer.add_add_bos_token(False)


-@
-class Phi3MiniModel(
+@ModelBase.register("Phi3ForCausalLM")
+class Phi3MiniModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PHI3

     def set_vocab(self):
@@ -2522,7 +3257,8 @@ class Phi3MiniModel(Model):
         rms_eps = self.find_hparam(["rms_norm_eps"])
         max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head

         self.gguf_writer.add_context_length(max_pos_embds)
         self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
@@ -2546,7 +3282,8 @@ class Phi3MiniModel(Model):
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
         max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        rope_dims = int(rot_pct * n_embd) // n_head
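Worked example (values assumed for illustration) of the rope_dims change in the two hunks above: when a model applies RoPE to only part of each head, the rotary dimension shrinks by the partial rotary factor instead of equalling the full head dimension.

```python
n_embd, n_head = 3072, 32
partial_rotary_factor = 0.75  # assumed value; read from config.json when present

head_dim = n_embd // n_head                                 # 96
rope_dims = int(partial_rotary_factor * n_embd) // n_head   # 72
print(head_dim, rope_dims)
```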
@@ -2555,7 +3292,7 @@ class Phi3MiniModel(Model):

         scale = max_pos_embds / orig_max_pos_embds

-        rope_scaling_type = rope_scaling.get('type', '').lower()
+        rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
         if len(rope_scaling_type) == 0:
             raise KeyError('Missing the required key rope_scaling.type')

@@ -2575,13 +3312,13 @@ class Phi3MiniModel(Model):
             raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')

         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))


-@
+@ModelBase.register("PhiMoEForCausalLM")
 class PhiMoeModel(Phi3MiniModel):
     model_arch = gguf.MODEL_ARCH.PHIMOE

@@ -2638,8 +3375,8 @@ class PhiMoeModel(Phi3MiniModel):
             raise ValueError(f"Unprocessed experts: {experts}")


-@
-class PlamoModel(
+@ModelBase.register("PlamoForCausalLM")
+class PlamoModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PLAMO

     def set_vocab(self):
@@ -2686,8 +3423,8 @@ class PlamoModel(Model):
         return [(new_name, data_torch)]


-@
-class CodeShellModel(
+@ModelBase.register("CodeShellForCausalLM")
+class CodeShellModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CODESHELL

     def set_gguf_parameters(self):
@@ -2705,25 +3442,30 @@ class CodeShellModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)

+    _has_tok_embd = False
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused

-
-
-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)

-
-        assert self.tensor_names is not None
+        new_name = self.map_tensor_name(name)

-
-
-
+        # assuming token_embd.weight is seen before output.weight
+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
+                self.tensor_names.remove("transformer.wte.weight")
+        elif new_name == tok_embd_name:
+            self._has_tok_embd = True

-        return
+        return [(new_name, data_torch)]


-@
-class InternLM2Model(
+@ModelBase.register("InternLM2ForCausalLM")
+class InternLM2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.INTERNLM2

     def set_vocab(self):
@@ -2862,10 +3604,10 @@ class InternLM2Model(Model):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
-
-
-
-
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
@@ -2875,6 +3617,11 @@ class InternLM2Model(Model):
         head_dim = n_embd // num_heads
         num_groups = num_heads // q_per_kv

+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
+
         if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
             qkv = data_torch

@@ -2895,8 +3642,8 @@ class InternLM2Model(Model):
         return [(self.map_tensor_name(name), data_torch)]


-@
-class InternLM3Model(
+@ModelBase.register("InternLM3ForCausalLM")
+class InternLM3Model(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA

     def set_vocab(self):
@@ -2940,14 +3687,18 @@ class InternLM3Model(Model):
         rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

-
-
-
-
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
         if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight", "k_proj.bias")):
@@ -2955,40 +3706,27 @@ class InternLM3Model(Model):
         return [(self.map_tensor_name(name), data_torch)]


-@
-class BertModel(
+@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
+class BertModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BERT

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.vocab_size = None

+        if cls_out_labels := self.hparams.get("id2label"):
+            if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0":
+                # Remove dummy labels added by AutoConfig
+                cls_out_labels = None
+        self.cls_out_labels = cls_out_labels
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_causal_attention(False)
+        self._try_set_pooling_type()

-
-
-        module_path = self.dir_model / "modules.json"
-        if module_path.is_file():
-            with open(module_path, encoding="utf-8") as f:
-                modules = json.load(f)
-            for mod in modules:
-                if mod["type"] == "sentence_transformers.models.Pooling":
-                    pooling_path = mod["path"]
-                    break
-
-        # get pooling type
-        if pooling_path is not None:
-            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
-                pooling = json.load(f)
-            if pooling["pooling_mode_mean_tokens"]:
-                pooling_type = gguf.PoolingType.MEAN
-            elif pooling["pooling_mode_cls_token"]:
-                pooling_type = gguf.PoolingType.CLS
-            else:
-                raise NotImplementedError("Only MEAN and CLS pooling types supported")
-            self.gguf_writer.add_pooling_type(pooling_type)
+        if self.cls_out_labels:
+            self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])

     def set_vocab(self):
         tokens, toktypes, tokpre = self.get_vocab_base()
@@ -3040,16 +3778,17 @@ class BertModel(Model):
         if name.startswith("cls.seq_relationship"):
             return []

-
-
+        if self.cls_out_labels:
+            # For BertForSequenceClassification (direct projection layer)
+            if name == "classifier.weight":
+                name = "classifier.out_proj.weight"

-
-
-    model_arch = gguf.MODEL_ARCH.BERT
+            if name == "classifier.bias":
+                name = "classifier.out_proj.bias"

-
-        super().__init__(*args, **kwargs)
+        return [(self.map_tensor_name(name), data_torch)]

+    def _xlmroberta_tokenizer_init(self) -> None:
         # we need the pad_token_id to know how to chop down position_embd matrix
         if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
             self._position_offset = 1 + pad_token_id
@@ -3058,68 +3797,160 @@ class RobertaModel(BertModel):
         else:
             self._position_offset = None

-    def
-
-
-
-
-
-        self.gguf_writer.add_add_eos_token(True)
+    def _xlmroberta_set_vocab(self) -> None:
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model

-
-        # though currently we are passing all zeros to the token_type embeddings
-        # "Sequence A" or "Sequence B"
-        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'

-
-
+        tokenizer_json = {}
+        tokenizer_config_json = {}
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'tokenizer.json'
+            tokenizer_config_path = self.dir_model / 'tokenizer_config.json'

-
-
-        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
-        if name.startswith("roberta."):
-            name = name[8:]
+            if not tokenizer_path.is_file():
+                raise FileNotFoundError(f"File not found: {tokenizer_path}")

-
-
-
-            data_torch = data_torch[self._position_offset:,:]
+            from base64 import b64decode
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model)

-
+            with open(tokenizer_path, "r", encoding="utf-8") as fp:
+                tokenizer_json = json.load(fp)

+            if tokenizer_config_path.is_file():
+                with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
+                    tokenizer_config_json = json.load(fp)

-
-
-
+            add_prefix = tokenizer.add_prefix_space
+            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+            precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])

-
-
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
+        else:
+            sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
+            sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+            assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM

-
-
+            add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+            remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+            precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            tokenizer = SentencePieceProcessor()
+            tokenizer.LoadFromFile(str(tokenizer_path))
+
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        if isinstance(tokenizer, SentencePieceProcessor):
+            for token_id in range(tokenizer.vocab_size()):
+                piece = tokenizer.IdToPiece(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer.GetScore(token_id)
+
+                toktype = SentencePieceTokenTypes.NORMAL
+                if tokenizer.IsUnknown(token_id):
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                elif tokenizer.IsControl(token_id):
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif tokenizer.IsUnused(token_id):
+                    toktype = SentencePieceTokenTypes.UNUSED
+                elif tokenizer.IsByte(token_id):
+                    toktype = SentencePieceTokenTypes.BYTE
+
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
+        else:
+            added_vocab = tokenizer.get_added_vocab()
+            unk_token = tokenizer_config_json.get("unk_token")
+            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
+
+            for token_id in range(tokenizer.vocab_size):
+                piece = tokenizer._convert_id_to_token(token_id)
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
+
+        if isinstance(tokenizer, SentencePieceProcessor):
+            # realign tokens (see HF tokenizer code)
+            tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+            scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+            toktypes = [
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.CONTROL,
+                SentencePieceTokenTypes.UNKNOWN,
+            ] + toktypes[3:-1]
+
+        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+            # Add mask token missing from sentencepiece.bpe.model
+            tokens[250001] = b'<mask>'
+            scores[250001] = 0.0
+            toktypes[250001] = SentencePieceTokenTypes.CONTROL
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+
+@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
+class DistilBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT

     def set_gguf_parameters(self):
+        self.gguf_writer.add_layer_norm_eps(1e-12)
+        logger.info("gguf: layer norm epsilon = 1e-12")
         super().set_gguf_parameters()
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("distilbert."):
+            name = name[11:]

-
-
+        # These layers act as MLM head, so we don't need them
+        if name.startswith("vocab_"):
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
+class RobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT

     def __init__(self, *args, **kwargs):
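Illustrative sketch (toy vocabulary, not real data) of the token realignment performed in `_xlmroberta_set_vocab` above: the first SentencePiece ids are replaced by `<s>`, `<pad>`, `</s>`, `<unk>`, the remaining pieces shift up by one id, and the trailing piece is dropped.

```python
tokens = [b"<unk>", b"<s>", b"</s>", b"a", b"b", b"c"]   # toy SentencePiece order
scores = [0.0, 0.0, 0.0, -1.0, -2.0, -3.0]

tokens = [b"<s>", b"<pad>", b"</s>", b"<unk>"] + tokens[3:-1]
scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]

print(tokens)  # [b'<s>', b'<pad>', b'</s>', b'<unk>', b'a', b'b']
```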
@@ -3134,86 +3965,127 @@ class XLMRobertaModel(BertModel):
             self._position_offset = None

     def set_vocab(self):
-
-
-
-
-
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)

-
-
-
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))

-
-
-        assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+        else:
+            return super().set_vocab()

-
-
-
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]

-
-
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]

-
+        return super().modify_tensors(data_torch, name, bid)

-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

-
-
-
-            score = tokenizer.GetScore(token_id)
+@ModelBase.register("NomicBertModel")
+class NomicBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT

-
-
-
-
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            hparams = ModelBase.load_hparams(dir_model)

-
-
-            toktypes[token_id] = toktype
+        self.is_moe = bool(hparams.get("moe_every_n_layers"))
+        self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT

-
-        pad_count = vocab_size - len(tokens)
-        logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-        for i in range(1, pad_count + 1):
-            tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-            scores.append(-1000.0)
-            toktypes.append(SentencePieceTokenTypes.UNUSED)
+        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)

-
-
-
-        toktypes = [
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.CONTROL,
-            SentencePieceTokenTypes.UNKNOWN,
-        ] + toktypes[3:-1]
+        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
+        if self._tokenizer_is_xlmroberta:
+            self._xlmroberta_tokenizer_init()

-        self.
-
-
-
-
-
-
-        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
-        if precompiled_charsmap:
-            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+        npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048)
+        if npos == 8192 and mtp == 2048:
+            self.hparams["n_positions"] = 2048 # nomic-embed-text v1 and v1.5 are trained for 2048 tokens.
+        elif npos == 2048 and mtp == 2048:
+            self.hparams["n_positions"] = 512 # nomic-embed-text-v2-moe is trained for 512 tokens.
+        else:
+            raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}")

-
-        special_vocab.add_to_gguf(self.gguf_writer)
+        assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu"

-
-        self.
+        # this doesn't do anything in the HF version
+        assert self.hparams["causal"] is False
+        # no bias tensors unless MoE
+        assert self.hparams["qkv_proj_bias"] == self.is_moe
+        assert self.hparams["mlp_fc1_bias"] == self.is_moe
+        assert self.hparams["mlp_fc2_bias"] == self.is_moe
+
+        # norm at end of layer
+        assert self.hparams["prenorm"] is False
+        # standard RoPE
+        assert self.hparams["rotary_emb_fraction"] == 1.0
+        assert self.hparams["rotary_emb_interleaved"] is False
+        assert self.hparams["rotary_emb_scale_base"] is None
+
+    def set_vocab(self) -> None:
+        if self._tokenizer_is_xlmroberta:
+            return self._xlmroberta_set_vocab()
+        return super().set_vocab()
+
+    def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
+        # If the tensor is an experts bias tensor, skip it by returning an empty list.
+        if "mlp.experts.bias" in name:
+            return [] # Explicitly return an empty list.
+
+        if "mlp.experts.mlp.w1" in name:
+            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
+            name += ".weight"
+
+        if "mlp.experts.mlp.w2" in name:
+            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
+            data_torch = data_torch.transpose(1, 2)
+            name += ".weight"
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+        if self.is_moe:
+            self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
+            self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+            self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
+
+    def _is_tokenizer_xlmroberta(self) -> bool:
+        with open(self.dir_model / "tokenizer.json") as f:
+            tokenizer_json = json.load(f)
+        toktyp = tokenizer_json["model"]["type"]
+        if toktyp == "Unigram":
+            return True
+        if toktyp == "WordPiece":
+            return False
+        raise ValueError(f"unknown tokenizer: {toktyp}")
+
+
+@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
+class XLMRobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._xlmroberta_tokenizer_init()
+
+    def set_vocab(self):
+        self._xlmroberta_set_vocab()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # if name starts with "roberta.", remove the prefix
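Illustrative sketch (sizes assumed for illustration) of the expert-weight reshape in the NomicBertModel hunk above: each MoE projection is viewed as `(num_experts, n_inner, n_embd)`, and `w2` is additionally transposed so every expert's matrix has the orientation GGUF expects.

```python
import torch

num_experts, n_inner, n_embd = 8, 3072, 768  # assumed sizes
w1 = torch.randn(num_experts * n_inner * n_embd).view(num_experts, n_inner, n_embd)
w2 = torch.randn(num_experts * n_inner * n_embd).view(num_experts, n_inner, n_embd).transpose(1, 2)
print(w1.shape, w2.shape)  # torch.Size([8, 3072, 768]) torch.Size([8, 768, 3072])
```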
@@ -3229,8 +4101,8 @@ class XLMRobertaModel(BertModel):
|
|
3229
4101
|
return super().modify_tensors(data_torch, name, bid)
|
3230
4102
|
|
3231
4103
|
|
3232
|
-
@
|
3233
|
-
class GemmaModel(
|
4104
|
+
@ModelBase.register("GemmaForCausalLM")
|
4105
|
+
class GemmaModel(TextModel):
|
3234
4106
|
model_arch = gguf.MODEL_ARCH.GEMMA
|
3235
4107
|
|
3236
4108
|
def set_vocab(self):
|
@@ -3280,8 +4152,8 @@ class GemmaModel(Model):
|
|
3280
4152
|
return [(self.map_tensor_name(name), data_torch)]
|
3281
4153
|
|
3282
4154
|
|
3283
|
-
@
|
3284
|
-
class Gemma2Model(
|
4155
|
+
@ModelBase.register("Gemma2ForCausalLM")
|
4156
|
+
class Gemma2Model(TextModel):
|
3285
4157
|
model_arch = gguf.MODEL_ARCH.GEMMA2
|
3286
4158
|
|
3287
4159
|
def set_vocab(self):
|
@@ -3327,48 +4199,128 @@ class Gemma2Model(Model):
|
|
3327
4199
|
return [(self.map_tensor_name(name), data_torch)]
|
3328
4200
|
|
3329
4201
|
|
3330
|
-
@
|
3331
|
-
class
|
3332
|
-
model_arch = gguf.MODEL_ARCH.
|
4202
|
+
@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
|
4203
|
+
class Gemma3Model(TextModel):
|
4204
|
+
model_arch = gguf.MODEL_ARCH.GEMMA3
|
3333
4205
|
|
4206
|
+
def set_vocab(self):
|
4207
|
+
self._set_vocab_sentencepiece()
|
3334
 
-
-class Rwkv6Model(Model):
-    model_arch = gguf.MODEL_ARCH.RWKV6
+        self.gguf_writer.add_add_space_prefix(False)
 
-    def
-
-
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
 
-
-
+        # some default values are not specified in the hparams
+        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
+        self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
+        self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
+        # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
+        assert hparams.get("attn_logit_softcapping") is None
+        assert hparams.get("final_logit_softcapping") is None
+        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+        self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
+        if hparams.get("rope_scaling") is not None:
+            assert hparams["rope_scaling"]["rope_type"] == "linear"
+            # important: this rope_scaling is only applied for global layers, and not used by 1B model
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
 
-
-
-        for line in lines:
-            parts = line.split(' ')
-            assert len(parts) >= 3
-            token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
-            token = token.encode("utf-8") if isinstance(token, str) else token
-            assert isinstance(token, bytes)
-            assert len(token) == token_len
-            token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
-            tokens.append(token_text.encode("utf-8"))
-            toktypes.append(gguf.TokenType.NORMAL)
-        remainder = vocab_size - len(tokens)
-        assert remainder >= 0
-        for i in range(len(tokens), vocab_size):
-            tokens.append(f"[PAD{i}]".encode("utf-8"))
-            toktypes.append(gguf.TokenType.UNUSED)
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
 
-
-
-
-
-
-
-
-
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
+        elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
+                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
+            return [] # skip vision tensors
+
+        # remove OOV (out-of-vocabulary) rows in token_embd
+        if "embed_tokens.weight" in name:
+            vocab = self._create_vocab_sentencepiece()
+            tokens = vocab[0]
+            data_torch = data_torch[:len(tokens)]
+
+        # ref code in Gemma3RMSNorm
+        # output = output * (1.0 + self.weight.float())
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
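The `data_torch + 1` adjustment above reflects the Gemma-style RMSNorm, which scales by (1.0 + weight), while the norm applied on the GGUF side scales by the weight directly; a minimal sketch of the equivalence, using assumed toy shapes:

import torch

def gemma_rmsnorm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    x_hat = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return x_hat * (1.0 + weight)   # HF Gemma3RMSNorm convention

def plain_rmsnorm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    x_hat = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return x_hat * weight           # convention assumed by the GGUF consumer

x, w = torch.randn(2, 8), torch.randn(8)
# folding the +1 into the exported weight keeps the two conventions numerically identical
assert torch.allclose(gemma_rmsnorm(x, w), plain_rmsnorm(x, w + 1))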
+
+
+@ModelBase.register("Gemma3ForConditionalGeneration")
+class Gemma3VisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
+        # default values below are taken from HF tranformers code
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
+        # calculate proj_scale_factor (used by tinygemma3 test model)
+        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
+        n_per_side = int(image_seq_length ** 0.5)
+        image_size = self.hparams["image_size"]
+        patch_size = self.hparams["patch_size"]
+        proj_scale_factor = (image_size // patch_size) // n_per_side
+        if proj_scale_factor > 0 and proj_scale_factor != 4:
+            # we only need to write this if it's not the default value
+            # in this case, we are converting a test model
+            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims # unused
+        # related to https://github.com/ggml-org/llama.cpp/issues/13025
+        if "input_projection" in name:
+            return gguf.GGMLQuantizationType.F16
+        if ".embeddings." in name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if "vision_model.head." in name:
+            return [] # skip redundant tensors for tinygemma3
+
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
+                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
+            # process vision tensors
+            name = name.replace("_weight", ".weight")
+
+            # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector
+            # the other norm values are part of SigLIP model, and they are already correct
+            # ref code: Gemma3RMSNorm
+            if "soft_emb_norm.weight" in name:
+                logger.info(f"Correcting norm value for '{name}'")
+                data_torch = data_torch + 1
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return [] # skip other tensors
+
+
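For reference, the proj_scale_factor arithmetic above works out as follows with assumed, typical Gemma 3 vision settings (896px images, 14px patches, 256 image tokens); only non-default results are written:

image_seq_length = 256                       # assumed tokens produced per image
n_per_side = int(image_seq_length ** 0.5)    # 16
image_size, patch_size = 896, 14             # assumed SigLIP config values
proj_scale_factor = (image_size // patch_size) // n_per_side
print(proj_scale_factor)                     # 4 -> the default, so nothing is exported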
+@ModelBase.register("Starcoder2ForCausalLM")
+class StarCoder2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.STARCODER2
+
+
+@ModelBase.register("Rwkv6ForCausalLM")
+class Rwkv6Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        self._set_vocab_rwkv_world()
 
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
@@ -3429,16 +4381,189 @@ class Rwkv6Model(Model):
                 self.lerp_weights[bid] = {new_name: data_torch}
             if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
                 new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
-                data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
+                data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
+                yield (new_name, data)
+            return
+
+        yield (new_name, data_torch)
+
+
+@ModelBase.register("RWKV6Qwen2ForCausalLM")
+class RWKV6Qwen2Model(Rwkv6Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        num_key_value_heads = self.hparams["num_key_value_heads"]
+        hidden_size = self.hparams["hidden_size"]
+        head_size = hidden_size // num_attention_heads
+        rms_norm_eps = self.hparams["rms_norm_eps"]
+        intermediate_size = self.hparams["intermediate_size"]
+        time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
+        time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # special parameters for time_mixing in RWKV6QWEN2
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_token_shift_count(1)
+        # RWKV6QWEN2 use grouped key/value like GQA
+        self.gguf_writer.add_head_count_kv(num_key_value_heads)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        for new_name, data in super().modify_tensors(data_torch, name, bid):
+            if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
+                data = data.view(5, -1, data.shape[-1])
+                # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
+                # permute them here to avoid code changes
+                data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
+                if "w2" in new_name:
+                    data = data.view(5, -1, data.shape[-1])
+                yield (new_name, data)
+                continue
+            yield (new_name, data)
+
+
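The permutation above reorders the five stacked low-rank time-mix blocks from the checkpoint's r,k,v,w,g layout into the w,k,v,r,g layout expected downstream; a sketch of the same index mapping on labelled dummy data:

import torch

src_order = ["r", "k", "v", "w", "g"]        # order in the RWKV6Qwen2 checkpoint (assumed from the comment)
dst_order = ["w", "k", "v", "r", "g"]        # order expected after conversion
perm = [src_order.index(n) for n in dst_order]
assert perm == [3, 1, 2, 0, 4]               # matches data[3], data[1], data[2], data[0], data[4]

blocks = torch.arange(5.0).view(5, 1).expand(5, 4)            # stand-in for the 5 stacked blocks
reordered = torch.stack([blocks[i] for i in perm], dim=0)     # same reordering as modify_tensors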
+@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM")
+class Rwkv7Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.RWKV7
+
+    def set_vocab(self):
+        self._set_vocab_rwkv_world()
+
+    def calc_lora_rank(self, hidden_size, exponent, multiplier):
+        return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32
+
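calc_lora_rank rounds hidden_size ** exponent * multiplier to the nearest multiple of 32 (minimum 32); a quick sketch with assumed hidden sizes:

def calc_lora_rank(hidden_size, exponent, multiplier):
    return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32

for hidden_size in (768, 2048, 4096):                 # assumed example sizes
    decay = calc_lora_rank(hidden_size, 0.5, 1.8)     # default used for the decay / iclr ranks
    gate = calc_lora_rank(hidden_size, 0.8, 0.6)      # default used for the gate rank
    print(hidden_size, decay, gate)                   # e.g. 4096 -> decay rank 128, always a multiple of 32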
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        try:
+            head_size = self.hparams["head_size"]
+            layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        except KeyError:
+            head_size = self.hparams["head_dim"]
+            layer_norm_eps = self.hparams["norm_eps"]
+        hidden_size = self.hparams["hidden_size"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4)
+
+        # ICLR: In-Context-Learning-Rate
+        try:
+            lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
+            lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
+        except KeyError:
+            lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
+            lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
+        self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
+        self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
+        self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    lerp_weights: dict[int, dict[str, Tensor]] = {}
+    lora_needs_transpose: bool = True
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # unify tensor names here to make life easier
+        name = name.replace("blocks", "layers").replace("ffn", "feed_forward")
+        name = name.replace("self_attn", "attention").replace("attn", "attention")
+        name = name.replace("time_mixer.", "")
+        # lora layer names in fla-hub's impl
+        if "_lora.lora" in name:
+            self.lora_needs_transpose = False
+            name = name.replace("_lora.lora.0.weight", "1.weight")
+            name = name.replace("_lora.lora.2.weight", "2.weight")
+            name = name.replace("_lora.lora.2.bias", "0.weight")
+
+        name = name.replace("feed_forward_norm", "ln2")
+        name = name.replace("g_norm", "ln_x")
+
+        if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0:
+            # some models have dummy v0/v1/v2 on first layer while others don't
+            # ignore them all since they are not used
+            return
+
+        wkv_has_gate = self.hparams.get("wkv_has_gate", True)
+        lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"]
+
+        if bid is not None and "attention.x_" in name:
+            if "attention.x_x" in name:
+                # already concatenated
+                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                data = data_torch.reshape(len(lerp_list), 1, 1, -1)
                 yield (new_name, data)
+            else:
+                try:
+                    self.lerp_weights[bid][name] = data_torch
+                except KeyError:
+                    self.lerp_weights[bid] = {name: data_torch}
+                if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list):
+                    new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                    data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0)
+                    yield (new_name, data)
             return
+        else:
+            data_torch = data_torch.squeeze()
+            new_name = self.map_tensor_name(name)
 
-
+            if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+                new_name += ".weight"
 
+            if self.lora_needs_transpose and any(
+                new_name.endswith(t) for t in [
+                    "time_mix_w1.weight", "time_mix_w2.weight",
+                    "time_mix_a1.weight", "time_mix_a2.weight",
+                    "time_mix_v1.weight", "time_mix_v2.weight",
+                    "time_mix_g1.weight", "time_mix_g2.weight",
+                ]
+            ):
+                data_torch = data_torch.transpose(0, 1)
 
-
-
-
+            if 'r_k' in new_name:
+                data_torch = data_torch.flatten()
+
+            if bid == 0 and "time_mix_a" in new_name:
+                # dummy v0/v1/v2 on first layer
+                # easist way to make llama happy
+                yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)
+
+            yield (new_name, data_torch)
+
+
+@ModelBase.register("RwkvHybridForCausalLM")
+class ARwkv7Model(Rwkv7Model):
+    model_arch = gguf.MODEL_ARCH.ARWKV7
 
     def set_vocab(self):
         try:
@@ -3448,50 +4573,39 @@ class RWKV6Qwen2Model(Rwkv6Model):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
-        num_attention_heads = self.hparams["num_attention_heads"]
-        num_key_value_heads = self.hparams["num_key_value_heads"]
         hidden_size = self.hparams["hidden_size"]
-        head_size =
+        head_size = self.hparams["head_size"]
         rms_norm_eps = self.hparams["rms_norm_eps"]
         intermediate_size = self.hparams["intermediate_size"]
-
-
+        wkv_has_gate = self.hparams["wkv_has_gate"]
+        assert self.hparams["wkv_version"] == 7
+
+        # ICLR: In-Context-Learning-Rate
+        lora_rank_decay = 64
+        lora_rank_iclr = 64
+        lora_rank_value_residual_mix = 32
+        lora_rank_gate = 128 if wkv_has_gate else 0
 
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
         self.gguf_writer.add_embedding_length(hidden_size)
         self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
         self.gguf_writer.add_wkv_head_size(head_size)
-        self.gguf_writer.
-        self.gguf_writer.
+        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
+        self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
+        self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
+        self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
         self.gguf_writer.add_feed_forward_length(intermediate_size)
         self.gguf_writer.add_file_type(self.ftype)
-
-        # special parameters for time_mixing in RWKV6QWEN2
-        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
         self.gguf_writer.add_token_shift_count(1)
-        # RWKV6QWEN2 use grouped key/value like GQA
-        self.gguf_writer.add_head_count_kv(num_key_value_heads)
 
         # required by llama.cpp, unused
         self.gguf_writer.add_head_count(0)
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        for new_name, data in super().modify_tensors(data_torch, name, bid):
-            if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
-                data = data.view(5, -1, data.shape[-1])
-                # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
-                # permute them here to avoid code changes
-                data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
-                if "w2" in new_name:
-                    data = data.view(5, -1, data.shape[-1])
-                yield (new_name, data)
-                continue
-            yield (new_name, data)
-
 
-@
-class MambaModel(
+@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
+class MambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MAMBA
 
     def set_vocab(self):
@@ -3544,8 +4658,6 @@ class MambaModel(Model):
     _tok_embd = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid # unused
-
         output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
         tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
 
@@ -3555,6 +4667,10 @@ class MambaModel(Model):
             logger.debug("A_log --> A ==> " + new_name)
             data_torch = -torch.exp(data_torch)
 
+        # [4 1 8192 1] -> [4 8192 1 1]
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+
         # assuming token_embd.weight is seen before output.weight
         if self._tok_embd is not None and new_name == output_name:
             if torch.equal(self._tok_embd, data_torch):
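The SSM_CONV1D squeeze added in this hunk drops the singleton axes of the HF Conv1d weight before it is written; a sketch with assumed sizes matching the shape comment:

import torch

w = torch.zeros(8192, 1, 4)      # assumed HF Conv1d weight layout: (channels, 1, kernel)
print(list(w.squeeze().shape))   # [8192, 4]; in GGML's reversed ne notation this is [4, 8192, 1, 1]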
@@ -3566,8 +4682,8 @@ class MambaModel(Model):
         return [(new_name, data_torch)]
 
 
-@
-class CommandR2Model(
+@ModelBase.register("CohereForCausalLM")
+class CommandR2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.COMMAND_R
 
     def __init__(self, *args, **kwargs):
@@ -3584,8 +4700,8 @@ class CommandR2Model(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 
 
-@
-class Cohere2Model(
+@ModelBase.register("Cohere2ForCausalLM")
+class Cohere2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.COHERE2
 
     def set_gguf_parameters(self):
@@ -3602,9 +4718,9 @@ class Cohere2Model(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 
 
-@
-@
-class OlmoModel(
+@ModelBase.register("OlmoForCausalLM")
+@ModelBase.register("OLMoForCausalLM")
+class OlmoModel(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMO
 
     def set_gguf_parameters(self):
@@ -3630,13 +4746,13 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@
-class Olmo2Model(
+@ModelBase.register("Olmo2ForCausalLM")
+class Olmo2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMO2
 
 
-@
-class OlmoeModel(
+@ModelBase.register("OlmoeForCausalLM")
+class OlmoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMOE
 
     def set_gguf_parameters(self):
@@ -3695,29 +4811,10 @@ class OlmoeModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@
+@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM")
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.intermediate_size = self.hparams["intermediate_size"]
-
-    def get_tensors(self):
-        for name, data in super().get_tensors():
-            if 'gated_layer' in name:
-                d1 = data[:self.intermediate_size, :]
-                name1 = name.replace('gated_layers', 'gated_layers_w')
-                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
-                d2 = data[self.intermediate_size:, :]
-                name2 = name.replace('gated_layers', 'gated_layers_v')
-                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
-                yield name1, d1
-                yield name2, d2
-                continue
-
-            yield name, data
-
     def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
@@ -3733,17 +4830,9 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "bert.", remove the prefix
-        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-        if name.startswith("bert."):
-            name = name[5:]
-
-        return super().modify_tensors(data_torch, name, bid)
-
 
-@
-class OpenELMModel(
+@ModelBase.register("OpenELMForCausalLM")
+class OpenELMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.OPENELM
 
     @staticmethod
@@ -3817,8 +4906,8 @@ class OpenELMModel(Model):
         yield (self.map_tensor_name(name), data_torch)
 
 
-@
-class ArcticModel(
+@ModelBase.register("ArcticForCausalLM")
+class ArcticModel(TextModel):
     model_arch = gguf.MODEL_ARCH.ARCTIC
 
     def set_vocab(self):
@@ -3968,8 +5057,8 @@ class ArcticModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@
-class DeepseekModel(
+@ModelBase.register("DeepseekForCausalLM")
+class DeepseekModel(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK
 
     def set_vocab(self):
@@ -4059,15 +5148,19 @@ class DeepseekModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@
-@
-class DeepseekV2Model(
+@ModelBase.register("DeepseekV2ForCausalLM")
+@ModelBase.register("DeepseekV3ForCausalLM")
+class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
     def set_vocab(self):
         self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
+
+        # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
+        self.hparams["num_key_value_heads"] = 1
+
         super().set_gguf_parameters()
         hparams = self.hparams
 
@@ -4076,8 +5169,13 @@ class DeepseekV2Model(Model):
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
         self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
-
-
+
+        # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
+
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
@@ -4093,12 +5191,12 @@ class DeepseekV2Model(Model):
 
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
-
-
-
-
-
-
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
 
     _experts: list[dict[str, Tensor]] | None = None
 
@@ -4146,6 +5244,26 @@ class DeepseekV2Model(Model):
             else:
                 return []
 
+        # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
+        if name.endswith("kv_b_proj.weight"):
+            name_kb = name.replace("kv_b_proj", "k_b_proj")
+            name_vb = name.replace("kv_b_proj", "v_b_proj")
+
+            n_head_kv = self.hparams["num_key_value_heads"]
+            v_head_dim = self.hparams["v_head_dim"]
+            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
+
+            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
+
+            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
+            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
+            k_b = k_b.transpose(1, 2)
+
+            return [
+                (self.map_tensor_name(name_kb), k_b),
+                (self.map_tensor_name(name_vb), v_b)
+            ]
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def prepare_tensors(self):
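The kv_b_proj split above feeds the MLA absorption path: k_b is the transposed no-PE key sub-block and v_b the value sub-block; a sketch of the reshape with assumed toy dimensions:

import torch

n_head_kv, qk_nope_head_dim, v_head_dim, kv_lora_rank = 2, 4, 3, 5   # assumed toy sizes
kv_b = torch.randn(n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

kv_b3 = kv_b.view(n_head_kv, qk_nope_head_dim + v_head_dim, kv_lora_rank)
k_b, v_b = torch.split(kv_b3, [qk_nope_head_dim, v_head_dim], dim=1)
k_b = k_b.transpose(1, 2)        # same transpose the converter applies to k_b_proj

print(k_b.shape, v_b.shape)      # torch.Size([2, 5, 4]) torch.Size([2, 3, 5])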
@@ -4158,11 +5276,62 @@ class DeepseekV2Model(Model):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@
-
-
-
-
+@ModelBase.register("Dots1ForCausalLM")
+class Dots1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.DOTS1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        if self.hparams["scoring_func"] == "noaux_tc":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+        if "shared_experts" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("PLMForCausalLM")
+class PLMModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.PLM
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
+@ModelBase.register("T5WithLMHeadModel")
+@ModelBase.register("T5ForConditionalGeneration")
+@ModelBase.register("MT5ForConditionalGeneration")
+@ModelBase.register("UMT5ForConditionalGeneration")
+class T5Model(TextModel):
     model_arch = gguf.MODEL_ARCH.T5
 
     def __init__(self, *args, **kwargs):
@@ -4301,8 +5470,8 @@ class T5Model(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@
-class T5EncoderModel(
+@ModelBase.register("T5EncoderModel")
+class T5EncoderModel(TextModel):
     model_arch = gguf.MODEL_ARCH.T5ENCODER
 
     def __init__(self, *args, **kwargs):
@@ -4440,8 +5609,8 @@ class T5EncoderModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@
-class JaisModel(
+@ModelBase.register("JAISLMHeadModel")
+class JaisModel(TextModel):
     model_arch = gguf.MODEL_ARCH.JAIS
 
     def __init__(self, *args, **kwargs):
@@ -4523,8 +5692,39 @@ class JaisModel(Model):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 
 
-@
-class
+@ModelBase.register("Glm4ForCausalLM")
+class Glm4Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.GLM4
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        rope_dim = self.hparams["head_dim"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+
+@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CHATGLM
 
     def set_vocab_chatglm3(self):
@@ -4678,8 +5878,8 @@ class ChatGLMModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@
-class NemotronModel(
+@ModelBase.register("NemotronForCausalLM")
+class NemotronModel(TextModel):
     model_arch = gguf.MODEL_ARCH.NEMOTRON
 
     def set_vocab(self):
@@ -4719,8 +5919,8 @@ class NemotronModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@
-class ExaoneModel(
+@ModelBase.register("ExaoneForCausalLM")
+class ExaoneModel(TextModel):
     model_arch = gguf.MODEL_ARCH.EXAONE
 
     def set_gguf_parameters(self):
@@ -4753,10 +5953,10 @@ class ExaoneModel(Model):
         rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
         rotary_factor = rotary_factor if rotary_factor is not None else 1.0
         self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
-
-
-
-
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
@@ -4788,7 +5988,7 @@ class ExaoneModel(Model):
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
 
-@
+@ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""
     model_arch = gguf.MODEL_ARCH.GRANITE
@@ -4822,11 +6022,20 @@ class GraniteModel(LlamaModel):
         logger.info("gguf: (granite) logits_scale = %s", logits_scale)
 
 
-@
+@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
 class GraniteMoeModel(GraniteModel):
     """Conversion for IBM's GraniteMoeForCausalLM"""
     model_arch = gguf.MODEL_ARCH.GRANITE_MOE
 
+    def set_gguf_parameters(self):
+        """GraniteMoeShared uses GraniteMoe parameters plus the following:
+        - shared_intermediate_size
+        """
+        super().set_gguf_parameters()
+        if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
+            logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         """In modeling_granitemoe, the JetMoe implementation of parallel experts
         is used. This essentially merges w1 and w3 into a single tensor with 2x
@@ -4837,18 +6046,132 @@ class GraniteMoeModel(GraniteModel):
         if name.endswith("block_sparse_moe.input_linear.weight"):
             ffn_dim = self.hparams["intermediate_size"]
             assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
-            gate, up = data_torch
+            gate, up = data_torch.split(ffn_dim, dim=-2)
             return [
                 (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
                 (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
             ]
 
+        if name.endswith("shared_mlp.input_linear.weight"):
+            ffn_dim = self.hparams["shared_intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
+            gate, up = data_torch.split(ffn_dim, dim=-2)
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+            ]
+
         return super().modify_tensors(data_torch, name, bid)
 
 
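Both branches above recover gate and up from a JetMoe-style merged input_linear tensor by splitting along the second-to-last dimension; a sketch with assumed toy sizes:

import torch

n_experts, ffn_dim, hidden = 4, 6, 8                   # assumed toy dimensions
merged = torch.randn(n_experts, 2 * ffn_dim, hidden)   # merged w1/w3, as stored by GraniteMoe

gate, up = merged.split(ffn_dim, dim=-2)               # the same split used in modify_tensors
assert gate.shape == up.shape == (n_experts, ffn_dim, hidden)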
-@
-
-
+@ModelBase.register("BailingMoeForCausalLM")
+class BailingMoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_embd = self.hparams["hidden_size"]
+        head_dim = self.hparams.get("head_dim") or n_embd // n_head
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+
+        if name.endswith("attention.dense.weight"):
+            return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
+        elif name.endswith("query_key_value.weight"):
+            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
+
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
+            ]
+        elif name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == output_name and self.hparams.get("norm_head"):
+            data_torch = data_torch.float()
+            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("ChameleonForConditionalGeneration")
+@ModelBase.register("ChameleonForCausalLM")  # obsolete
+class ChameleonModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CHAMELEON
 
     def set_gguf_parameters(self):
@@ -4888,8 +6211,68 @@ class ChameleonModel(Model):
         return data_torch
 
 
+@ModelBase.register("UltravoxModel")
+class UltravoxModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA  # dummy
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")
+
+
+@ModelBase.register("Qwen2AudioForConditionalGeneration")
+class WhisperEncoderModel(MmprojModel):
+    has_vision_encoder = False  # no vision encoder
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["hidden_size"] = self.hparams["d_model"]
+        self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+        self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims  # unused
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F16
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.startswith("language_model."):
+            # skip language model tensors
+            return []
+
+        # prevent clash naming with vision tensors
+        if name.startswith("multi_modal_projector"):
+            name = "audio." + name
+
+        if "conv1.bias" in name or "conv2.bias" in name:
+            # transpose conv1 and conv2 bias
+            data_torch = data_torch.unsqueeze(-1)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("UltravoxModel")
+class UltravoxWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False  # no vision encoder
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
+
 ###### CONVERSION LOGIC ######
 
+
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
     _tensor_type = torch.Tensor
@@ -4943,6 +6326,14 @@ class LazyTorchTensor(gguf.LazyBase):
         lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
         return cast(torch.Tensor, lazy)
 
+    @classmethod
+    def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
+        dtype = cls._dtype_str_map[remote_tensor.dtype]
+        shape = remote_tensor.shape
+        meta = cls.meta_with_dtype_and_shape(dtype, shape)
+        lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape))
+        return cast(torch.Tensor, lazy)
+
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
         del types  # unused
@@ -5020,6 +6411,14 @@ def parse_args() -> argparse.Namespace:
         "--print-supported-models", action="store_true",
         help="Print the supported models"
     )
+    parser.add_argument(
+        "--remote", action="store_true",
+        help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.",
+    )
+    parser.add_argument(
+        "--mmproj", action="store_true",
+        help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
+    )
 
     args = parser.parse_args()
     if not args.print_supported_models and args.model is None:
@@ -5045,12 +6444,26 @@ def split_str_to_n_bytes(split_str: str) -> int:
     return n
 
 
+def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
+    # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
+    # maybe we should fallback to text model's arch in that case, since not many models have both
+    text_config = hparams.get("text_config", {})
+    vision_config = hparams.get("vision_config", {})
+    arch = hparams["architectures"][0]
+    # if "architectures" is found in the sub-config, use that instead
+    if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
+        arch = text_config["architectures"][0]
+    elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
+        arch = vision_config["architectures"][0]
+    return arch
+
+
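Sketch of how the new helper is meant to be used (hparams values assumed): it prefers the architecture declared in text_config or vision_config when present, otherwise the top-level one, and the result selects the registered text or mmproj class.

hparams = {
    "architectures": ["Gemma3ForConditionalGeneration"],   # assumed example config
    "text_config": {},
    "vision_config": {},
}
# With no nested "architectures", both calls fall back to the top-level entry:
#   get_model_architecture(hparams, ModelType.TEXT)   -> "Gemma3ForConditionalGeneration"
#   get_model_architecture(hparams, ModelType.MMPROJ) -> "Gemma3ForConditionalGeneration"
# ModelBase.from_model_architecture(arch, model_type=...) then resolves the text model class
# or the mmproj class (e.g. Gemma3VisionModel registered earlier) for the conversion.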
+
|
5048
6461
|
def main() -> None:
|
5049
6462
|
args = parse_args()
|
5050
6463
|
|
5051
6464
|
if args.print_supported_models:
|
5052
6465
|
logger.error("Supported models:")
|
5053
|
-
|
6466
|
+
ModelBase.print_registered_models()
|
5054
6467
|
sys.exit(0)
|
5055
6468
|
|
5056
6469
|
if args.verbose:
|
@@ -5060,6 +6473,14 @@ def main() -> None:
|
|
5060
6473
|
|
5061
6474
|
dir_model = args.model
|
5062
6475
|
|
6476
|
+
if args.remote:
|
6477
|
+
from huggingface_hub import snapshot_download
|
6478
|
+
local_dir = snapshot_download(
|
6479
|
+
repo_id=str(dir_model),
|
6480
|
+
allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
|
6481
|
+
dir_model = Path(local_dir)
|
6482
|
+
logger.info(f"Downloaded config and tokenizer to {local_dir}")
|
6483
|
+
|
5063
6484
|
if not dir_model.is_dir():
|
5064
6485
|
logger.error(f'Error: {args.model} is not a directory')
|
5065
6486
|
sys.exit(1)
|
@@ -5081,30 +6502,38 @@ def main() -> None:
|
|
5081
6502
|
|
5082
6503
|
if args.outfile is not None:
|
5083
6504
|
fname_out = args.outfile
|
6505
|
+
elif args.remote:
|
6506
|
+
# if remote, use the model ID as the output file name
|
6507
|
+
fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
|
5084
6508
|
else:
|
5085
6509
|
fname_out = dir_model
|
5086
6510
|
|
5087
6511
|
logger.info(f"Loading model: {dir_model.name}")
|
5088
6512
|
|
5089
|
-
|
6513
|
+
if args.mmproj:
|
6514
|
+
if "mmproj" not in fname_out.name:
|
6515
|
+
fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
|
5090
6516
|
|
5091
6517
|
with torch.inference_mode():
|
5092
6518
|
output_type = ftype_map[args.outtype]
|
5093
|
-
|
5094
|
-
|
6519
|
+
model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
|
6520
|
+
hparams = ModelBase.load_hparams(dir_model)
|
6521
|
+
model_architecture = get_model_architecture(hparams, model_type)
|
6522
|
+
logger.info(f"Model architecture: {model_architecture}")
|
5095
6523
|
try:
|
5096
|
-
model_class =
|
6524
|
+
model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
|
5097
6525
|
except NotImplementedError:
|
5098
6526
|
logger.error(f"Model {model_architecture} is not supported")
|
5099
6527
|
sys.exit(1)
|
5100
6528
|
|
5101
|
-
model_instance = model_class(dir_model
|
6529
|
+
model_instance = model_class(dir_model, output_type, fname_out,
|
5102
6530
|
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
|
5103
6531
|
eager=args.no_lazy,
|
5104
6532
|
metadata_override=args.metadata, model_name=args.model_name,
|
5105
6533
|
split_max_tensors=args.split_max_tensors,
|
5106
6534
|
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
|
5107
|
-
small_first_shard=args.no_tensor_first_split
|
6535
|
+
small_first_shard=args.no_tensor_first_split,
|
6536
|
+
remote_hf_model_id=str(args.model) if args.remote else None)
|
5108
6537
|
|
5109
6538
|
if args.vocab_only:
|
5110
6539
|
logger.info("Exporting model vocab...")
|