bigdl-core-cpp 2.7.0b20250629__py3-none-win_amd64.whl → 2.7.0b20250701__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. bigdl/cpp/convert_hf_to_gguf.py +1987 -558
  2. bigdl/cpp/convert_hf_to_gguf_update.py +131 -67
  3. bigdl/cpp/convert_lora_to_gguf.py +3 -3
  4. bigdl/cpp/gguf-py/gguf/constants.py +546 -16
  5. bigdl/cpp/gguf-py/gguf/gguf_reader.py +57 -6
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +119 -7
  7. bigdl/cpp/gguf-py/gguf/lazy.py +10 -0
  8. bigdl/cpp/gguf-py/gguf/metadata.py +28 -8
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +461 -48
  10. bigdl/cpp/gguf-py/gguf/utility.py +195 -0
  11. bigdl/cpp/gguf-py/gguf/vocab.py +6 -1
  12. bigdl/cpp/libs/llama_cpp/ggml-base.dll +0 -0
  13. bigdl/cpp/libs/llama_cpp/ggml-cpu.dll +0 -0
  14. bigdl/cpp/libs/llama_cpp/ggml-sycl.dll +0 -0
  15. bigdl/cpp/libs/llama_cpp/ggml.dll +0 -0
  16. bigdl/cpp/libs/llama_cpp/llama-batched.exe +0 -0
  17. bigdl/cpp/libs/llama_cpp/llama-bench.exe +0 -0
  18. bigdl/cpp/libs/llama_cpp/llama-cli.exe +0 -0
  19. bigdl/cpp/libs/llama_cpp/llama-embedding.exe +0 -0
  20. bigdl/cpp/libs/llama_cpp/llama-gemma3-cli.exe +0 -0
  21. bigdl/cpp/libs/llama_cpp/llama-gguf.exe +0 -0
  22. bigdl/cpp/libs/llama_cpp/llama-llava-cli.exe +0 -0
  23. bigdl/cpp/libs/llama_cpp/llama-lookup.exe +0 -0
  24. bigdl/cpp/libs/llama_cpp/llama-ls-sycl-device.exe +0 -0
  25. bigdl/cpp/libs/llama_cpp/llama-minicpmv-cli.exe +0 -0
  26. bigdl/cpp/libs/llama_cpp/llama-perplexity.exe +0 -0
  27. bigdl/cpp/libs/llama_cpp/llama-quantize.exe +0 -0
  28. bigdl/cpp/libs/llama_cpp/llama-server.exe +0 -0
  29. bigdl/cpp/libs/llama_cpp/llama-simple.exe +0 -0
  30. bigdl/cpp/libs/llama_cpp/llama-speculative.exe +0 -0
  31. bigdl/cpp/libs/llama_cpp/llama-tokenize.exe +0 -0
  32. bigdl/cpp/libs/llama_cpp/llama.dll +0 -0
  33. bigdl/cpp/libs/ollama/ggml-base.dll +0 -0
  34. bigdl/cpp/libs/ollama/ggml-cpu.dll +0 -0
  35. bigdl/cpp/libs/ollama/ggml-sycl.dll +0 -0
  36. bigdl/cpp/libs/ollama/ggml.dll +0 -0
  37. bigdl/cpp/libs/ollama/llama.dll +0 -0
  38. bigdl/cpp/libs/ollama/llava_shared.dll +0 -0
  39. bigdl/cpp/libs/ollama/mtmd_shared.dll +0 -0
  40. bigdl/cpp/libs/ollama/ollama-lib.exe +0 -0
  41. bigdl/cpp/libs/ollama/ollama.exe +0 -0
  42. {bigdl_core_cpp-2.7.0b20250629.data → bigdl_core_cpp-2.7.0b20250701.data}/scripts/init-ollama.bat +1 -5
  43. {bigdl_core_cpp-2.7.0b20250629.dist-info → bigdl_core_cpp-2.7.0b20250701.dist-info}/METADATA +1 -1
  44. bigdl_core_cpp-2.7.0b20250701.dist-info/RECORD +56 -0
  45. bigdl/cpp/libs/llama_cpp/llava_shared.dll +0 -0
  46. bigdl_core_cpp-2.7.0b20250629.dist-info/RECORD +0 -56
  47. {bigdl_core_cpp-2.7.0b20250629.data → bigdl_core_cpp-2.7.0b20250701.data}/scripts/init-llama-cpp.bat +0 -0
  48. {bigdl_core_cpp-2.7.0b20250629.data → bigdl_core_cpp-2.7.0b20250701.data}/scripts/init-llama-cpp.ps1 +0 -0
  49. {bigdl_core_cpp-2.7.0b20250629.dist-info → bigdl_core_cpp-2.7.0b20250701.dist-info}/WHEEL +0 -0
  50. {bigdl_core_cpp-2.7.0b20250629.dist-info → bigdl_core_cpp-2.7.0b20250701.dist-info}/top_level.txt +0 -0
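
Most of the churn in this release is in bigdl/cpp/convert_hf_to_gguf.py, which follows upstream llama.cpp in splitting the old `Model` base class into `ModelBase`, `TextModel`, and `MmprojModel`, keyed by a new `ModelType` enum so a text converter and a multimodal-projector (mmproj) converter can be registered under the same Hugging Face architecture name. The following is a simplified, standalone sketch of that registry pattern, not the actual module (the real `register()` dispatches on `model_arch` rather than a `model_type` attribute, and the two converter classes here are hypothetical):

```python
from enum import IntEnum


class ModelType(IntEnum):
    TEXT = 1
    MMPROJ = 2


class ModelBase:
    # one registry per model type, mirroring _model_classes in the diff below
    _model_classes: dict[ModelType, dict[str, type]] = {
        ModelType.TEXT: {},
        ModelType.MMPROJ: {},
    }
    model_type = ModelType.TEXT  # simplified: the real code keys off model_arch

    @classmethod
    def register(cls, *names: str):
        def func(modelcls):
            for name in names:
                cls._model_classes[modelcls.model_type][name] = modelcls
            return modelcls
        return func

    @classmethod
    def from_model_architecture(cls, arch: str, model_type: ModelType = ModelType.TEXT) -> type:
        try:
            return cls._model_classes[model_type][arch]
        except KeyError:
            raise NotImplementedError(f"Architecture {arch!r} not supported!") from None


@ModelBase.register("LlavaForConditionalGeneration")
class LlamaTextModel(ModelBase):      # hypothetical converter for the text tower
    model_type = ModelType.TEXT


@ModelBase.register("LlavaForConditionalGeneration")
class LlavaMmprojModel(ModelBase):    # hypothetical converter for the vision projector
    model_type = ModelType.MMPROJ


# the same HF architecture name now resolves to different converters per model type
assert ModelBase.from_model_architecture("LlavaForConditionalGeneration") is LlamaTextModel
assert ModelBase.from_model_architecture("LlavaForConditionalGeneration", ModelType.MMPROJ) is LlavaMmprojModel
```

The same refactor also threads a new optional `remote_hf_model_id` parameter through `ModelBase.__init__`, which switches tensor loading to lazy remote safetensors instead of local files. The full diff of the converter script follows.
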
@@ -16,6 +16,7 @@ from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
 from itertools import chain
+from transformers import AutoConfig

 import math
 import numpy as np
@@ -42,11 +43,19 @@ class SentencePieceTokenTypes(IntEnum):
     BYTE = 6


-AnyModel = TypeVar("AnyModel", bound="type[Model]")
+class ModelType(IntEnum):
+    TEXT = 1
+    MMPROJ = 2


-class Model:
-    _model_classes: dict[str, type[Model]] = {}
+AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
+
+
+class ModelBase:
+    _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
+        ModelType.TEXT: {},
+        ModelType.MMPROJ: {},
+    }

     dir_model: Path
     ftype: gguf.LlamaFileType
@@ -58,23 +67,28 @@ class Model:
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
-    block_count: int
-    tensor_map: gguf.TensorNameMap
     tensor_names: set[str] | None
     gguf_writer: gguf.GGUFWriter
     model_name: str | None
     metadata_override: Path | None
     dir_model_card: Path
+    remote_hf_model_id: str | None

     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
+    # subclasses should initialize this!
+    block_count: int
+    tensor_map: gguf.TensorNameMap
+
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
-                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
-        if type(self) is Model:
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
+        if type(self) is ModelBase or \
+                type(self) is TextModel or \
+                type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

         self.dir_model = dir_model
@@ -83,14 +97,25 @@ class Model:
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
-        self.lazy = not eager
-        self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
-        self.is_safetensors = len(self.part_names) > 0
-        if not self.is_safetensors:
-            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
-        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        self.lazy = not eager or (remote_hf_model_id is not None)
+        self.remote_hf_model_id = remote_hf_model_id
+        if remote_hf_model_id is not None:
+            self.is_safetensors = True
+
+            def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
+                logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
+                remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
+                self.tensor_names = set(name for name in remote_tensors.keys())
+                for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items():
+                    yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))
+
+            self.get_tensors = get_remote_tensors
+        else:
+            self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors")
+            self.is_safetensors = len(self.part_names) > 0
+            if not self.is_safetensors:
+                self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
+            self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams
         self.tensor_names = None
         self.metadata_override = metadata_override
         self.model_name = model_name
@@ -112,11 +137,10 @@ class Model:
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

     @classmethod
-    def __init_subclass__(cls):
-        # can't use an abstract property, because overriding it without type errors
-        # would require using decorated functions instead of simply defining the property
-        if "model_arch" not in cls.__dict__:
-            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
+    def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path:
+        stem, suffix = path.stem, path.suffix
+        new_name = f"{prefix}{stem}{suffix}"
+        return path.with_name(new_name)

     def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
         key = next((k for k in keys if k in self.hparams), None)
@@ -126,9 +150,6 @@ class Model:
             return None
         raise KeyError(f"could not find any of: {keys}")

-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()

@@ -180,7 +201,8 @@ class Model:
             extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
             missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
             if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                 f"Missing tensors: {missing}")
             else:
                 raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                  f"Missing tensors: {missing}\n"
@@ -215,50 +237,7 @@ class Model:
         return new_name

     def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.block_count)
-
-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
-            self.gguf_writer.add_context_length(n_ctx)
-            logger.info(f"gguf: context length = {n_ctx}")
-
-        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
-            self.gguf_writer.add_embedding_length(n_embd)
-            logger.info(f"gguf: embedding length = {n_embd}")
-
-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
-            self.gguf_writer.add_feed_forward_length(n_ff)
-            logger.info(f"gguf: feed forward length = {n_ff}")
-
-        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
-            self.gguf_writer.add_head_count(n_head)
-            logger.info(f"gguf: head count = {n_head}")
-
-        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
-            self.gguf_writer.add_head_count_kv(n_head_kv)
-            logger.info(f"gguf: key-value head count = {n_head_kv}")
-
-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
-            self.gguf_writer.add_rope_freq_base(rope_theta)
-            logger.info(f"gguf: rope theta = {rope_theta}")
-        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
-            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
-            logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
-        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
-            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
-            logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
-        if (n_experts := self.hparams.get("num_local_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-            logger.info(f"gguf: expert count = {n_experts}")
-        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
-            logger.info(f"gguf: experts used count = {n_experts_used}")
-
-        if (head_dim := self.hparams.get("head_dim")) is not None:
-            self.gguf_writer.add_key_length(head_dim)
-            self.gguf_writer.add_value_length(head_dim)
-
-        self.gguf_writer.add_file_type(self.ftype)
-        logger.info(f"gguf: file type = {self.ftype}")
+        raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -329,6 +308,8 @@ class Model:
                             gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
                             gguf.MODEL_TENSOR.POSNET_NORM1,
                             gguf.MODEL_TENSOR.POSNET_NORM2,
+                            gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
+                            gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
                         )
                     )
                     or not new_name.endswith(".weight")
@@ -392,6 +373,10 @@ class Model:

         self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)

+        # If we are using HF model id, set the metadata name to the model id
+        if self.remote_hf_model_id:
+            self.metadata.name = self.remote_hf_model_id
+
         # Fallback to model directory name if metadata name is still missing
         if self.metadata.name is None:
             self.metadata.name = self.dir_model.name
@@ -400,27 +385,6 @@ class Model:
         if self.metadata.size_label is None and total_params > 0:
             self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)

-        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
-        output_type: str = self.ftype.name.partition("_")[2]
-
-        # Filename Output
-        if self.fname_out.is_dir():
-            # Generate default filename based on model specification and available metadata
-            if not vocab_only:
-                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
-            else:
-                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
-
-            # Use the default filename
-            self.fname_out = self.fname_out / f"{fname_default}.gguf"
-        else:
-            # Output path is a custom defined templated filename
-            # Note: `not is_dir()` is used because `.is_file()` will not detect
-            # file template strings as it doesn't actually exist as a file
-
-            # Process templated file name with the output ftype, useful with the "auto" ftype
-            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
-
         self.set_type()

         logger.info("Set meta model")
@@ -429,12 +393,12 @@ class Model:
         logger.info("Set model parameters")
         self.set_gguf_parameters()

-        logger.info("Set model tokenizer")
-        self.set_vocab()
-
         logger.info("Set model quantization version")
         self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

+    def write_vocab(self):
+        raise NotImplementedError("write_vocab() must be implemented in subclasses")
+
     def write(self):
         self.prepare_tensors()
         self.prepare_metadata(vocab_only=False)
@@ -443,15 +407,6 @@ class Model:
         self.gguf_writer.write_tensors_to_file(progress=True)
         self.gguf_writer.close()

-    def write_vocab(self):
-        if len(self.gguf_writer.tensors) != 1:
-            raise ValueError('Splitting the vocabulary is not supported')
-
-        self.prepare_metadata(vocab_only=True)
-        self.gguf_writer.write_header_to_file(path=self.fname_out)
-        self.gguf_writer.write_kv_data_to_file()
-        self.gguf_writer.close()
-
     @staticmethod
     def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
         part_names: list[str] = []
@@ -465,31 +420,160 @@ class Model:

     @staticmethod
     def load_hparams(dir_model: Path):
-        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-            return json.load(f)
+        try:
+            # for security reason, we don't allow loading remote code by default
+            # if a model need remote code, we will fallback to config.json
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+        except Exception as e:
+            logger.warning(f"Failed to load model config from {dir_model}: {e}")
+            logger.warning("Trying to load config.json instead")
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                config = json.load(f)
+        if "llm_config" in config:
+            # rename for InternVL
+            config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
+        return config

     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
         assert names

         def func(modelcls: AnyModel) -> AnyModel:
+            model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
             for name in names:
-                cls._model_classes[name] = modelcls
+                cls._model_classes[model_type][name] = modelcls
             return modelcls
         return func

     @classmethod
     def print_registered_models(cls):
-        for name in sorted(cls._model_classes.keys()):
-            logger.error(f"- {name}")
+        for model_type, model_classes in cls._model_classes.items():
+            logger.error(f"{model_type.name} models:")
+            for name in sorted(model_classes.keys()):
+                logger.error(f" - {name}")

     @classmethod
-    def from_model_architecture(cls, arch: str) -> type[Model]:
+    def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]:
         try:
-            return cls._model_classes[arch]
+            return cls._model_classes[model_type][arch]
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+
+class TextModel(ModelBase):
+    model_type = ModelType.TEXT
+    hf_arch: str
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hf_arch = get_model_architecture(self.hparams, self.model_type)
+
+        if "text_config" in self.hparams:
+            # move the text_config to the root level
+            self.hparams = {**self.hparams, **self.hparams["text_config"]}
+
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    @classmethod
+    def __init_subclass__(cls):
+        # can't use an abstract property, because overriding it without type errors
+        # would require using decorated functions instead of simply defining the property
+        if "model_arch" not in cls.__dict__:
+            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def prepare_metadata(self, vocab_only: bool):
+        super().prepare_metadata(vocab_only=vocab_only)
+
+        total_params = self.gguf_writer.get_total_parameter_count()[0]
+        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
+        output_type: str = self.ftype.name.partition("_")[2]
+
+        # Filename Output
+        if self.fname_out.is_dir():
+            # Generate default filename based on model specification and available metadata
+            if not vocab_only:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
+            else:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
+
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            # file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+
+        logger.info("Set model tokenizer")
+        self.set_vocab()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.block_count)
+
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
+            self.gguf_writer.add_context_length(n_ctx)
+            logger.info(f"gguf: context length = {n_ctx}")
+
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")
+
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
+            self.gguf_writer.add_feed_forward_length(n_ff)
+            logger.info(f"gguf: feed forward length = {n_ff}")
+
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")
+
+        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
+            self.gguf_writer.add_head_count_kv(n_head_kv)
+            logger.info(f"gguf: key-value head count = {n_head_kv}")
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+            logger.info(f"gguf: rope theta = {rope_theta}")
+        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
+            logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
+        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
+            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+            logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
+        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+            logger.info(f"gguf: expert count = {n_experts}")
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+            logger.info(f"gguf: experts used count = {n_experts_used}")
+
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
+            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
+            if self.hparams.get("model_type") != "deepseek_v3":
+                self.gguf_writer.add_key_length(head_dim)
+                self.gguf_writer.add_value_length(head_dim)
+
+        self.gguf_writer.add_file_type(self.ftype)
+        logger.info(f"gguf: file type = {self.ftype}")
+
+    def write_vocab(self):
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
+
+        self.prepare_metadata(vocab_only=True)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
+        self.gguf_writer.write_kv_data_to_file()
+        self.gguf_writer.close()
+
     def does_token_look_special(self, token: str | bytes) -> bool:
         if isinstance(token, (bytes, bytearray)):
             token_text = token.decode(encoding="utf-8")
@@ -528,6 +612,8 @@ class Model:
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
@@ -537,13 +623,13 @@ class Model:
                 if token in added_vocab:
                     # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
                     # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not tokenizer.added_tokens_decoder[i].normalized:
+                    if not added_tokens_decoder[i].normalized:
                         previous_token = token
                         token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
                         if previous_token != token:
                             logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

-                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
                         # NOTE: this was added for Gemma.
@@ -591,12 +677,12 @@ class Model:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -648,9 +734,6 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -681,9 +764,6 @@ class Model:
         if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
             # ref: https://huggingface.co/facebook/chameleon-7b
             res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-            res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
             res = "roberta-bpe"
@@ -699,6 +779,39 @@ class Model:
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
+        if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
+            # ref: https://huggingface.co/Xenova/gpt-4o
+            res = "gpt-4o"
+        if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
+            # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
+            res = "superbpe"
+        if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
+            # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
+            res = "trillion"
+        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
+            # ref: https://huggingface.co/inclusionAI/Ling-lite
+            res = "bailingmoe"
+        if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
+            # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
+            res = "llama4"
+        if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
+            # ref: https://huggingface.co/mistral-community/pixtral-12b
+            res = "pixtral"
+        if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
+            # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
+            res = "seed-coder"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"

         if res is None:
             logger.warning("\n")
@@ -858,6 +971,9 @@ class Model:
                 for token_id, token_data in added_tokens_decoder.items():
                     token_id = int(token_id)
                     token: str = token_data["content"]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         if tokens[token_id] != token.encode("utf-8"):
                             logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
@@ -902,6 +1018,44 @@ class Model:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_rwkv_world(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
+        special_vocab._set_special_token("eot", 261)
+        # hack: Override these as they have already been set (incorrectly)
+        special_vocab.special_token_ids["bos"] = 0
+        special_vocab.special_token_ids["eos"] = 0
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
         tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
         logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
@@ -947,11 +1101,149 @@ class Model:
         if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
             self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])

+    def _try_set_pooling_type(self) -> None:
+        # get pooling path
+        pooling_path = None
+        module_path = self.dir_model / "modules.json"
+        if module_path.is_file():
+            with open(module_path, encoding="utf-8") as f:
+                modules = json.load(f)
+            for mod in modules:
+                if mod["type"] == "sentence_transformers.models.Pooling":
+                    pooling_path = mod["path"]
+                    break
+
+        # get pooling type
+        if pooling_path is not None:
+            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
+                pooling = json.load(f)
+            if pooling["pooling_mode_mean_tokens"]:
+                pooling_type = gguf.PoolingType.MEAN
+            elif pooling["pooling_mode_cls_token"]:
+                pooling_type = gguf.PoolingType.CLS
+            elif pooling["pooling_mode_lasttoken"]:
+                pooling_type = gguf.PoolingType.LAST
+            else:
+                raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
+            self.gguf_writer.add_pooling_type(pooling_type)
+
+
+class MmprojModel(ModelBase):
+    model_type = ModelType.MMPROJ
+    model_arch = gguf.MODEL_ARCH.MMPROJ
+    preprocessor_config: dict[str, Any]
+    global_config: dict[str, Any]
+
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+
+    has_vision_encoder: bool = True # by default
+    has_audio_encoder: bool = False
+
+    # for models having multiple encoders, we need to separate their hparams
+    hparams_vision: dict[str, Any] | None = None
+    hparams_audio: dict[str, Any] | None = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
+            raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
+
+        # get n_embd of the text model
+        if "text_config" not in self.hparams:
+            self.hparams["text_config"] = {}
+        if "audio_config" not in self.hparams:
+            self.hparams["audio_config"] = {}
+        text_config = {**self.hparams, **self.hparams["text_config"]}
+        self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
+        assert self.n_embd_text > 0, "n_embd not found in hparams"
+
+        # move vision config to the top level, while preserving the original hparams in global_config
+        import copy
+        self.global_config = copy.deepcopy(self.hparams)
+        self.hparams_vision = self.get_vision_config()
+        self.hparams_audio = self.get_audio_config()
+
+        if self.hparams_vision is None and self.hparams_audio is None:
+            raise ValueError("vision_config / audio_config not found in hparams")
+
+        # for compat with vision-only models
+        self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
+
+        # TODO @ngxson : this is a hack to support both vision and audio encoders
+        have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
+        self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
+
+        # load preprocessor config
+        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+            self.preprocessor_config = json.load(f)
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("vision_config")
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("audio_config")
+
+    def set_type(self):
+        self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if self.has_vision_encoder:
+            self.gguf_writer.add_clip_has_vision_encoder(True)
+            self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+
+            # vision config
+            self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
+            self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
+            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
+            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
+            self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
+            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
+
+            # preprocessor config
+            self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+            self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
+
+        if self.has_audio_encoder:
+            self.gguf_writer.add_clip_has_audio_encoder(True)
+            self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
+
+            # audio config
+            self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
+            self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
+            self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
+            self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
+
+        if not self.has_vision_encoder and not self.has_audio_encoder:
+            raise ValueError("MmprojModel must have either vision or audio encoder")
+
+    def write_vocab(self):
+        raise ValueError("MmprojModel does not support vocab writing")
+
+    def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        assert self.hparams_vision is not None
+        return self._find_param(self.hparams_vision, keys, optional)
+
+    def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        assert self.hparams_audio is not None
+        return self._find_param(self.hparams_audio, keys, optional)
+
+    def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in obj), None)
+        if key is not None:
+            return obj[key]
+        if optional:
+            return None
+        raise KeyError(f"could not find any of: {keys}")
+
+
+@ModelBase.register("GPTNeoXForCausalLM")
+class GPTNeoXModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GPTNEOX

-@Model.register("GPTNeoXForCausalLM")
-class GPTNeoXModel(Model):
-    model_arch = gguf.MODEL_ARCH.GPTNEOX
-
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]

@@ -1005,8 +1297,8 @@ class GPTNeoXModel(Model):
         return tensors


-@Model.register("BloomForCausalLM", "BloomModel")
-class BloomModel(Model):
+@ModelBase.register("BloomForCausalLM", "BloomModel")
+class BloomModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BLOOM

     def set_gguf_parameters(self):
@@ -1059,18 +1351,11 @@ class BloomModel(Model):

         tensors.append((self.map_tensor_name(name), data_torch))

-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors


-@Model.register("MPTForCausalLM")
-class MPTModel(Model):
+@ModelBase.register("MPTForCausalLM")
+class MPTModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MPT

     def set_vocab(self):
@@ -1113,8 +1398,8 @@ class MPTModel(Model):
         return [(new_name, data_torch)]


-@Model.register("OrionForCausalLM")
-class OrionModel(Model):
+@ModelBase.register("OrionForCausalLM")
+class OrionModel(TextModel):
     model_arch = gguf.MODEL_ARCH.ORION

     def set_vocab(self):
@@ -1148,8 +1433,8 @@ class OrionModel(Model):
         self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])


-@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
-class BaichuanModel(Model):
+@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
+class BaichuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BAICHUAN

     def set_vocab(self):
@@ -1181,10 +1466,10 @@ class BaichuanModel(Model):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)

-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "linear":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         head_count = self.hparams["num_attention_heads"]
@@ -1228,8 +1513,8 @@ class BaichuanModel(Model):
         return weights[r * n_part:r * n_part + r, ...]


-@Model.register("XverseForCausalLM")
-class XverseModel(Model):
+@ModelBase.register("XverseForCausalLM")
+class XverseModel(TextModel):
     model_arch = gguf.MODEL_ARCH.XVERSE

     def set_vocab(self):
@@ -1305,10 +1590,10 @@ class XverseModel(Model):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)

-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "linear":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -1335,8 +1620,8 @@ class XverseModel(Model):
         )


-@Model.register("FalconForCausalLM", "RWForCausalLM")
-class FalconModel(Model):
+@ModelBase.register("FalconForCausalLM", "RWForCausalLM")
+class FalconModel(TextModel):
     model_arch = gguf.MODEL_ARCH.FALCON

     def set_gguf_parameters(self):
@@ -1389,8 +1674,8 @@ class FalconModel(Model):
         return [(self.map_tensor_name(name), data_torch)]


-@Model.register("GPTBigCodeForCausalLM")
-class StarCoderModel(Model):
+@ModelBase.register("GPTBigCodeForCausalLM")
+class StarCoderModel(TextModel):
     model_arch = gguf.MODEL_ARCH.STARCODER

     def set_gguf_parameters(self):
@@ -1406,8 +1691,8 @@ class StarCoderModel(Model):
         self.gguf_writer.add_file_type(self.ftype)


-@Model.register("GPTRefactForCausalLM")
-class RefactModel(Model):
+@ModelBase.register("GPTRefactForCausalLM")
+class RefactModel(TextModel):
     model_arch = gguf.MODEL_ARCH.REFACT

     def set_vocab(self):
@@ -1470,8 +1755,8 @@ class RefactModel(Model):
         return tensors


-@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
-class StableLMModel(Model):
+@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
+class StableLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.STABLELM

     def set_vocab(self):
@@ -1560,9 +1845,23 @@ class StableLMModel(Model):
            raise ValueError(f"Unprocessed norms: {norms}")


-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
-class LlamaModel(Model):
+@ModelBase.register(
+    "LLaMAForCausalLM",
+    "LlamaForCausalLM",
+    "MistralForCausalLM",
+    "MixtralForCausalLM",
+    "VLlama3ForCausalLM",
+    "LlavaForConditionalGeneration",
+    "LlamaModel")
+class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
+    undo_permute = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing `num_attention_heads` in config.json
+        if self.hf_arch == "VLlama3ForCausalLM":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

     def set_vocab(self):
         try:
@@ -1608,10 +1907,10 @@ class LlamaModel(Model):
         rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "linear":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
@@ -1626,11 +1925,25 @@ class LlamaModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        is_vision_tensor = "vision_tower" in name \
+            or "vision_model" in name \
+            or "model.connector" in name \
+            or "multi_modal_projector" in name
+
+        if is_vision_tensor:
+            return [] # skip vision tensors
+        elif self.hf_arch == "LlamaModel":
+            name = "model." + name
+        elif name.startswith("model.text_model"):
+            name = name.replace("text_model.", "") # for SmolVLM
+        elif name.startswith("language_model."):
+            name = name.replace("language_model.", "") # for the rest
+
+        if self.undo_permute:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
@@ -1682,7 +1995,7 @@ class LlamaModel(Model):

            low_freq_wavelen = old_context_len / low_freq_factor
            high_freq_wavelen = old_context_len / high_freq_factor
-           assert low_freq_wavelen != high_freq_wavelen
+           # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4

            rope_factors = []
            for freq in freqs:
@@ -1707,8 +2020,202 @@ class LlamaModel(Model):
1707
2020
  raise ValueError(f"Unprocessed experts: {experts}")
1708
2021
 
1709
2022
 
1710
- @Model.register("DeciLMForCausalLM")
1711
- class DeciModel(Model):
2023
+ @ModelBase.register("ArceeForCausalLM")
2024
+ class ArceeModel(LlamaModel):
2025
+ model_arch = gguf.MODEL_ARCH.ARCEE
2026
+
2027
+ def set_gguf_parameters(self):
2028
+ super().set_gguf_parameters()
2029
+ self._try_set_pooling_type()
2030
+ rope_scaling = self.hparams.get("rope_scaling") or {}
2031
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
2032
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
2033
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
2034
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
2035
+
2036
+
2037
+ @ModelBase.register(
2038
+ "LlavaForConditionalGeneration", # pixtral
2039
+ "Mistral3ForConditionalGeneration", # mistral small 3.1
2040
+ )
2041
+ class LlavaVisionModel(MmprojModel):
2042
+ img_break_tok_id = -1
2043
+
2044
+ def __init__(self, *args, **kwargs):
2045
+ super().__init__(*args, **kwargs)
2046
+ if self.hparams["model_type"] == "pixtral":
2047
+ # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
2048
+ self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
2049
+ self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
2050
+ logger.info(f"Image break token id: {self.img_break_tok_id}")
2051
+ else:
2052
+ raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
2053
+
2054
+ def get_token_id(self, token: str) -> int:
2055
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2056
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2057
+ added_tokens_decoder = json.load(f)['added_tokens_decoder']
2058
+ for id_, token_data in added_tokens_decoder.items():
2059
+ if token_data["content"] == token:
2060
+ return int(id_)
2061
+ raise ValueError(f"Token '{token}' not found in tokenizer config.")
2062
+
2063
+ def set_gguf_parameters(self):
2064
+ super().set_gguf_parameters()
2065
+ hparams = self.hparams
2066
+ if hparams["model_type"] == "pixtral":
2067
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
2068
+ self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
2069
+
2070
+ # hidden_act
2071
+ if hparams["hidden_act"] == "silu":
2072
+ self.gguf_writer.add_vision_use_silu(True)
2073
+ elif hparams["hidden_act"] == "gelu":
2074
+ self.gguf_writer.add_vision_use_gelu(True)
2075
+ else:
2076
+ raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
2077
+
2078
+ # spatial_merge_size
2079
+ if "spatial_merge_size" in self.global_config:
2080
+ self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"])
2081
+
2082
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2083
+ del bid # unused
2084
+ n_head = self.hparams["num_attention_heads"]
2085
+ n_kv_head = n_head
2086
+
2087
+ if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
2088
+ # process vision tensors
2089
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
2090
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
2091
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
2092
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
2093
+ return [(self.map_tensor_name(name), data_torch)]
2094
+
2095
+ if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
2096
+ logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
2097
+ # for pixtral model, we need to extract the [IMG_BREAK] token embedding
2098
+ img_break_embd = data_torch[self.img_break_tok_id]
2099
+ name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
2100
+ return [(self.map_tensor_name(name), img_break_embd)]
2101
+
2102
+ return [] # skip other tensors
2103
+
2104
+
2105
+ @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
2106
+ class SmolVLMModel(MmprojModel):
2107
+ def __init__(self, *args, **kwargs):
2108
+ super().__init__(*args, **kwargs)
2109
+ if self.hparams["model_type"] == "smolvlm_vision":
2110
+ # fix for SmolVLM2, missing some keys in config.json
2111
+ # default values are taken from transformers code
2112
+ self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
2113
+ self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
2114
+ self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
2115
+
2116
+ def set_gguf_parameters(self):
2117
+ super().set_gguf_parameters()
2118
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
2119
+ self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
2120
+ self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
2121
+ self.gguf_writer.add_vision_use_gelu(True)
2122
+
2123
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
2124
+ del bid, new_name, n_dims # unused
2125
+ if ".embeddings." in name:
2126
+ return gguf.GGMLQuantizationType.F32
2127
+ return False
2128
+
2129
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2130
+ del bid # unused
2131
+ is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
2132
+
2133
+ if is_vision_tensor:
2134
+ return [(self.map_tensor_name(name), data_torch)]
2135
+
2136
+ return [] # skip other tensors
2137
+
2138
+
2139
+ @ModelBase.register("Llama4ForConditionalGeneration")
2140
+ class Llama4Model(LlamaModel):
2141
+ model_arch = gguf.MODEL_ARCH.LLAMA4
2142
+ undo_permute = False
2143
+
2144
+ def __init__(self, *args, **kwargs):
2145
+ super().__init__(*args, **kwargs)
2146
+ # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
2147
+ self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
2148
+ self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
2149
+
2150
+ def set_vocab(self):
2151
+ self._set_vocab_gpt2()
2152
+ self.gguf_writer.add_add_bos_token(True)
2153
+
2154
+ def set_gguf_parameters(self):
2155
+ super().set_gguf_parameters()
2156
+ self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
2157
+ self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
2158
+
2159
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
2160
+ if name.startswith("language_model."):
2161
+ name = name.replace("language_model.", "")
2162
+
2163
+ # split the gate_up into gate and up
2164
+ if "gate_up_proj" in name:
2165
+ name_up = name.replace("gate_up_proj", "up_proj.weight")
2166
+ name_gate = name.replace("gate_up_proj", "gate_proj.weight")
2167
+ dim_half = data_torch.shape[-1] // 2
2168
+ gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
2169
+ return [
2170
+ (self.map_tensor_name(name_gate), gate_proj_weight),
2171
+ (self.map_tensor_name(name_up), up_proj_weight)
2172
+ ]
2173
+
2174
+ if name.endswith("down_proj"):
2175
+ name += ".weight"
2176
+ data_torch = data_torch.transpose(-1, -2)
2177
+
2178
+ if "multi_modal_projector" in name or "vision_model" in name:
2179
+ return []
2180
+ return super().modify_tensors(data_torch, name, bid)
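
The gate_up_proj handling above packs the gate and up projections along the last dimension and splits them after a transpose; a minimal standalone sketch of the same split, using made-up toy sizes rather than real Llama 4 shapes:

    import torch

    # assumed toy shapes: 4 experts, hidden size 8, FFN size 16 (not real model dims)
    fused = torch.randn(4, 8, 2 * 16)                  # gate and up packed along the last dim
    half = fused.shape[-1] // 2
    gate, up = fused.transpose(-1, -2).split(half, dim=-2)
    assert gate.shape == (4, 16, 8) and up.shape == (4, 16, 8)
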
2181
+
2182
+
2183
+ @ModelBase.register("Llama4ForConditionalGeneration")
2184
+ class Llama4VisionModel(MmprojModel):
2185
+ def set_gguf_parameters(self):
2186
+ super().set_gguf_parameters()
2187
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
2188
+ self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
2189
+ self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
2190
+ assert self.hparams["hidden_act"] == "gelu"
2191
+ self.gguf_writer.add_vision_use_gelu(True)
2192
+
2193
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2194
+ del bid # unused
2195
+ if "multi_modal_projector" in name or "vision_model" in name:
2196
+ # process vision tensors
2197
+ if "positional_embedding_vlm" in name and ".weight" not in name:
2198
+ name += ".weight"
2199
+ if "multi_modal_projector.linear_1" in name:
2200
+ # despite the name with number postfix, this is a single fully connected layer
2201
+ return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
2202
+ return [(self.map_tensor_name(name), data_torch)]
2203
+ return []
2204
+
2205
+
2206
+ @ModelBase.register("Mistral3ForConditionalGeneration")
2207
+ class Mistral3Model(LlamaModel):
2208
+ model_arch = gguf.MODEL_ARCH.LLAMA
2209
+
2210
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
2211
+ name = name.replace("language_model.", "")
2212
+ if "multi_modal_projector" in name or "vision_tower" in name:
2213
+ return []
2214
+ return super().modify_tensors(data_torch, name, bid)
2215
+
2216
+
2217
+ @ModelBase.register("DeciLMForCausalLM")
2218
+ class DeciModel(TextModel):
1712
2219
  model_arch = gguf.MODEL_ARCH.DECI
1713
2220
 
1714
2221
  @staticmethod
@@ -1743,6 +2250,9 @@ class DeciModel(Model):
1743
2250
  # if n_heads_in_group is not None, then
1744
2251
  # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
1745
2252
  # _num_heads[il] is num_attention_head
2253
+ # ***dummy layer*** for nemotron 253B
2254
+ # if n_heads_in_group is None and ffn_mult is None
2255
+ # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0
1746
2256
  for il in range(len(_block_configs)):
1747
2257
  if _block_configs[il]["attention"]["n_heads_in_group"] is None:
1748
2258
  if _block_configs[il]["attention"]["replace_with_linear"] is True:
@@ -1754,7 +2264,10 @@ class DeciModel(Model):
1754
2264
  else:
1755
2265
  self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
1756
2266
  self._num_heads.append(self.hparams["num_attention_heads"])
1757
- _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
2267
+ if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer
2268
+ _ffn_multipliers.append(0.0)
2269
+ else:
2270
+ _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
1758
2271
  assert self.block_count == len(self._num_kv_heads)
1759
2272
  assert self.block_count == len(self._num_heads)
1760
2273
  assert self.block_count == len(_ffn_multipliers)
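
A small sketch of the dummy-layer handling added above, using a pair of hypothetical block configs; layers whose ffn_mult is None simply contribute 0.0 to the multiplier list:

    # hypothetical block configs, not taken from a real checkpoint
    block_configs = [{"ffn": {"ffn_mult": 2.5}}, {"ffn": {"ffn_mult": None}}]
    ffn_multipliers = [
        cfg["ffn"]["ffn_mult"] if cfg["ffn"]["ffn_mult"] is not None else 0.0  # dummy layer -> 0.0
        for cfg in block_configs
    ]
    assert ffn_multipliers == [2.5, 0.0]
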
@@ -1814,10 +2327,10 @@ class DeciModel(Model):
1814
2327
  rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
1815
2328
  self.gguf_writer.add_rope_dimension_count(rope_dim)
1816
2329
 
1817
- if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
1818
- if self.hparams["rope_scaling"].get("type") == "linear":
1819
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1820
- self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
2330
+ rope_scaling = self.hparams.get("rope_scaling") or {}
2331
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
2332
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2333
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
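
The rope_scaling lookup above (and in several converters further down) reads both the newer "rope_type" key and the older "type" key from the HF config; a minimal sketch with made-up config fragments:

    # made-up config fragments; real values come from config.json
    for rope_scaling in ({"type": "linear", "factor": 2.0},
                         {"rope_type": "linear", "factor": 2.0},
                         {}):
        kind = rope_scaling.get("rope_type", rope_scaling.get("type"))
        if kind == "linear" and "factor" in rope_scaling:
            print("linear rope scaling, factor =", rope_scaling["factor"])
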
1821
2334
 
1822
2335
  @staticmethod
1823
2336
  def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
@@ -1879,8 +2392,8 @@ class DeciModel(Model):
1879
2392
  super().prepare_tensors()
1880
2393
 
1881
2394
 
1882
- @Model.register("BitnetForCausalLM")
1883
- class BitnetModel(Model):
2395
+ @ModelBase.register("BitnetForCausalLM")
2396
+ class BitnetModel(TextModel):
1884
2397
  model_arch = gguf.MODEL_ARCH.BITNET
1885
2398
 
1886
2399
  def set_vocab(self):
@@ -1920,8 +2433,8 @@ class BitnetModel(Model):
1920
2433
  yield (new_name, data_torch)
1921
2434
 
1922
2435
 
1923
- @Model.register("GrokForCausalLM")
1924
- class GrokModel(Model):
2436
+ @ModelBase.register("GrokForCausalLM")
2437
+ class GrokModel(TextModel):
1925
2438
  model_arch = gguf.MODEL_ARCH.GROK
1926
2439
 
1927
2440
  def set_vocab(self):
@@ -1973,8 +2486,8 @@ class GrokModel(Model):
1973
2486
  return [(self.map_tensor_name(name), data_torch)]
1974
2487
 
1975
2488
 
1976
- @Model.register("DbrxForCausalLM")
1977
- class DbrxModel(Model):
2489
+ @ModelBase.register("DbrxForCausalLM")
2490
+ class DbrxModel(TextModel):
1978
2491
  model_arch = gguf.MODEL_ARCH.DBRX
1979
2492
 
1980
2493
  def set_gguf_parameters(self):
@@ -2042,8 +2555,8 @@ class DbrxModel(Model):
2042
2555
  return n_dims > 1
2043
2556
 
2044
2557
 
2045
- @Model.register("MiniCPMForCausalLM")
2046
- class MiniCPMModel(Model):
2558
+ @ModelBase.register("MiniCPMForCausalLM")
2559
+ class MiniCPMModel(TextModel):
2047
2560
  model_arch = gguf.MODEL_ARCH.MINICPM
2048
2561
 
2049
2562
  def set_gguf_parameters(self):
@@ -2057,10 +2570,10 @@ class MiniCPMModel(Model):
2057
2570
  logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
2058
2571
  self.gguf_writer.add_logit_scale(logit_scale)
2059
2572
  logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
2060
- if self.hparams.get("rope_scaling") is not None:
2061
- if self.hparams["rope_scaling"].get("type") == "longrope":
2062
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
2063
- logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
2573
+ rope_scaling = self.hparams.get("rope_scaling") or {}
2574
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
2575
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
2576
+ logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
2064
2577
 
2065
2578
  def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
2066
2579
  rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
@@ -2097,8 +2610,8 @@ class MiniCPMModel(Model):
2097
2610
  return [(self.map_tensor_name(name), data_torch)]
2098
2611
 
2099
2612
 
2100
- @Model.register("MiniCPM3ForCausalLM")
2101
- class MiniCPM3Model(Model):
2613
+ @ModelBase.register("MiniCPM3ForCausalLM")
2614
+ class MiniCPM3Model(TextModel):
2102
2615
  model_arch = gguf.MODEL_ARCH.MINICPM3
2103
2616
 
2104
2617
  def set_gguf_parameters(self):
@@ -2150,8 +2663,8 @@ class MiniCPM3Model(Model):
2150
2663
  )
2151
2664
 
2152
2665
 
2153
- @Model.register("QWenLMHeadModel")
2154
- class QwenModel(Model):
2666
+ @ModelBase.register("QWenLMHeadModel")
2667
+ class QwenModel(TextModel):
2155
2668
  model_arch = gguf.MODEL_ARCH.QWEN
2156
2669
 
2157
2670
  @staticmethod
@@ -2192,8 +2705,8 @@ class QwenModel(Model):
2192
2705
  self.gguf_writer.add_file_type(self.ftype)
2193
2706
 
2194
2707
 
2195
- @Model.register("Qwen2ForCausalLM")
2196
- class Qwen2Model(Model):
2708
+ @ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
2709
+ class Qwen2Model(TextModel):
2197
2710
  model_arch = gguf.MODEL_ARCH.QWEN2
2198
2711
 
2199
2712
  def set_vocab(self):
@@ -2204,15 +2717,32 @@ class Qwen2Model(Model):
2204
2717
 
2205
2718
  def set_gguf_parameters(self):
2206
2719
  super().set_gguf_parameters()
2207
- if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
2208
- if self.hparams["rope_scaling"].get("type") == "yarn":
2209
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
2210
- self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
2211
- self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
2720
+ self._try_set_pooling_type()
2721
+ rope_scaling = self.hparams.get("rope_scaling") or {}
2722
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
2723
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
2724
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
2725
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
2726
+
2727
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2728
+ if self.hf_arch == "Qwen2Model":
2729
+ name = f"model.{name}" # map to Qwen2ForCausalLM tensors
2730
+ if "language_model." in name:
2731
+ name = name.replace("language_model.", "") # for InternVL
2732
+ if name.startswith("mlp") or name.startswith("multi_modal_projector") \
2733
+ or name.startswith("vision_model") or name.startswith("audio_tower"):
2734
+ # skip vision and audio tensors
2735
+ return []
2736
+ yield from super().modify_tensors(data_torch, name, bid)
2212
2737
 
2213
2738
 
2214
- @Model.register("Qwen2VLForConditionalGeneration")
2215
- class Qwen2VLModel(Model):
2739
+ @ModelBase.register(
2740
+ "Qwen2VLModel",
2741
+ "Qwen2VLForConditionalGeneration",
2742
+ "Qwen2_5_VLForConditionalGeneration",
2743
+ "Qwen2_5OmniModel",
2744
+ )
2745
+ class Qwen2VLModel(TextModel):
2216
2746
  model_arch = gguf.MODEL_ARCH.QWEN2VL
2217
2747
 
2218
2748
  def set_gguf_parameters(self):
@@ -2227,15 +2757,217 @@ class Qwen2VLModel(Model):
2227
2757
  except FileNotFoundError:
2228
2758
  self._set_vocab_gpt2()
2229
2759
 
2230
- def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
2231
- for name, data in super().get_tensors():
2232
- if name.startswith("visual."):
2233
- continue
2234
- yield name, data
2760
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2761
+ del bid # unused
2762
+ if name.startswith("thinker."):
2763
+ name = name.replace("thinker.", "")
2764
+ if name.startswith("visual") or name.startswith("audio") or \
2765
+ name.startswith("talker") or name.startswith("token2wav"):
2766
+ # skip multimodal tensors
2767
+ return []
2768
+ return [(self.map_tensor_name(name), data_torch)]
2769
+
2770
+
2771
+ @ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
2772
+ class Qwen2VLVisionModel(MmprojModel):
2773
+ def __init__(self, *args, **kwargs):
2774
+ super().__init__(*args, **kwargs)
2775
+ assert self.hparams_vision is not None
2776
+ self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
2777
+ # rename config.json values
2778
+ self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
2779
+ self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
2780
+ if "embed_dim" in self.hparams_vision: # qwen2vl
2781
+ self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
2782
+ self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
2783
+
2784
+ def set_gguf_parameters(self):
2785
+ super().set_gguf_parameters()
2786
+ assert self.hparams_vision is not None
2787
+ hparams = self.hparams_vision
2788
+ model_type = self.global_config['model_type']
2789
+ if model_type == 'qwen2_vl':
2790
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
2791
+ elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
2792
+ if model_type == 'qwen2_5_omni':
2793
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
2794
+ else:
2795
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
2796
+ self.gguf_writer.add_vision_use_silu(True)
2797
+ # find n_wa_pattern (window attention pattern)
2798
+ fullatt_block_indexes = hparams.get("fullatt_block_indexes")
2799
+ assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
2800
+ n_wa_pattern = fullatt_block_indexes[0] + 1
2801
+ # validate n_wa_pattern
2802
+ for i in range(1, len(fullatt_block_indexes)):
2803
+ if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
2804
+ raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
2805
+ self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
2806
+ else:
2807
+ raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
2808
+ # default values below are taken from HF transformers code
2809
+ self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
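
For the window-attention pattern derived above, fullatt_block_indexes lists the layers that use full attention; the pattern length is one more than the first index and must match the spacing between consecutive entries. A worked example with assumed indexes:

    # assumed example: every 8th layer uses full attention
    fullatt_block_indexes = [7, 15, 23, 31]
    n_wa_pattern = fullatt_block_indexes[0] + 1        # -> 8
    assert all(b - a == n_wa_pattern
               for a, b in zip(fullatt_block_indexes, fullatt_block_indexes[1:]))
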
2810
+
2811
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
2812
+ del bid, name, n_dims # unused
2813
+ if ".patch_embd." in new_name:
2814
+ return gguf.GGMLQuantizationType.F16
2815
+ if ".position_embd." in new_name:
2816
+ return gguf.GGMLQuantizationType.F32
2817
+ return False
2818
+
2819
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2820
+ del bid # unused
2821
+ if name.startswith("visual."):
2822
+ # process visual tensors
2823
+ # split QKV tensors if needed
2824
+ if ".qkv." in name:
2825
+ if data_torch.ndim == 2: # weight
2826
+ c3, _ = data_torch.shape
2827
+ else: # bias
2828
+ c3 = data_torch.shape[0]
2829
+ assert c3 % 3 == 0
2830
+ c = c3 // 3
2831
+ wq = data_torch[:c]
2832
+ wk = data_torch[c: c * 2]
2833
+ wv = data_torch[c * 2:]
2834
+ return [
2835
+ (self.map_tensor_name(name.replace("qkv", "q")), wq),
2836
+ (self.map_tensor_name(name.replace("qkv", "k")), wk),
2837
+ (self.map_tensor_name(name.replace("qkv", "v")), wv),
2838
+ ]
2839
+ elif 'patch_embed.proj.weight' in name:
2840
+ # split Conv3D into Conv2Ds
2841
+ c1, c2, kt, kh, kw = data_torch.shape
2842
+ del c1, c2, kh, kw # unused
2843
+ assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
2844
+ return [
2845
+ (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]),
2846
+ (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
2847
+ ]
2848
+ else:
2849
+ return [(self.map_tensor_name(name), data_torch)]
2850
+ return [] # skip other tensors
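
A standalone sketch of the fused-QKV split performed above, with made-up sizes: a weight of shape (3*c, n) or a bias of shape (3*c,) is cut into equal Q/K/V thirds along the first dimension:

    import torch

    qkv_w = torch.randn(3 * 4, 7)        # toy fused weight, c = 4
    c = qkv_w.shape[0] // 3
    q, k, v = qkv_w[:c], qkv_w[c:2 * c], qkv_w[2 * c:]
    assert q.shape == k.shape == v.shape == (4, 7)
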
2851
+
2852
+
2853
+ @ModelBase.register("Qwen2_5OmniModel")
2854
+ class Qwen25OmniModel(Qwen2VLVisionModel):
2855
+ has_vision_encoder = True
2856
+ has_audio_encoder = True
2857
+
2858
+ def __init__(self, *args, **kwargs):
2859
+ super().__init__(*args, **kwargs)
2860
+ assert self.hparams_audio is not None
2861
+ self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
2862
+ self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
2863
+ self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
2864
+
2865
+ def set_gguf_parameters(self):
2866
+ super().set_gguf_parameters()
2867
+ assert self.hparams_audio is not None
2868
+ self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
2869
+ self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
2870
+
2871
+ def get_vision_config(self) -> dict[str, Any] | None:
2872
+ return self.global_config["thinker_config"].get("vision_config")
2873
+
2874
+ def get_audio_config(self) -> dict[str, Any] | None:
2875
+ return self.global_config["thinker_config"].get("audio_config")
2876
+
2877
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
2878
+ # SinusoidsPositionEmbedding
2879
+ assert self.hparams_audio is not None
2880
+ max_timescale = 10000
2881
+ length = 1500
2882
+ channels = self.hparams_audio["hidden_size"]
2883
+ log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
2884
+ inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
2885
+ scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
2886
+ pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
2887
+ yield ("audio_tower.embed_positions.weight", pos_embd)
2888
+
2889
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
2890
+ del bid, new_name, n_dims # unused
2891
+ if ".conv" in name and ".weight" in name:
2892
+ return gguf.GGMLQuantizationType.F16
2893
+ return False
2894
+
2895
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2896
+ if name.startswith("thinker."):
2897
+ name = name.replace("thinker.", "")
2898
+
2899
+ if name.startswith("audio_tower"):
2900
+ # process audio tensors
2901
+ if "conv1.bias" in name or "conv2.bias" in name:
2902
+ # transpose conv1 and conv2 bias
2903
+ data_torch = data_torch.unsqueeze(-1)
2904
+ if "audio_bos_eos_token" in name:
2905
+ # this tensor is left unused in transformers code
2906
+ # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
2907
+ return []
2908
+ return [(self.map_tensor_name(name), data_torch)]
2909
+
2910
+ return super().modify_tensors(data_torch, name, bid)
2911
+
2912
+
2913
+ @ModelBase.register("InternVisionModel")
2914
+ class InternVisionModel(MmprojModel):
2915
+ def set_gguf_parameters(self):
2916
+ super().set_gguf_parameters()
2917
+ hparams = self.hparams
2918
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
2919
+ self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
2920
+ # hidden_act
2921
+ if hparams["hidden_act"] == "silu":
2922
+ self.gguf_writer.add_vision_use_silu(True)
2923
+ elif hparams["hidden_act"] == "gelu":
2924
+ self.gguf_writer.add_vision_use_gelu(True)
2925
+ else:
2926
+ raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
2927
+ # downsample_ratio
2928
+ downsample_ratio = self.global_config.get("downsample_ratio")
2929
+ assert downsample_ratio is not None
2930
+ self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
2931
+
2932
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
2933
+ del bid, name, n_dims # unused
2934
+ if ".patch_embd." in new_name:
2935
+ return gguf.GGMLQuantizationType.F16
2936
+ if ".position_embd." in new_name:
2937
+ return gguf.GGMLQuantizationType.F32
2938
+ return False
2939
+
2940
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2941
+ del bid # unused
2942
+ if name.startswith("vision_model") or name.startswith("mlp"):
2943
+ # process visual tensors
2944
+ # correct name
2945
+ if name.startswith("vision_model"):
2946
+ name = "vision_tower." + name
2947
+ if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
2948
+ name += ".weight"
2949
+ # split QKV tensors if needed
2950
+ if ".qkv." in name:
2951
+ if data_torch.ndim == 2: # weight
2952
+ c3, _ = data_torch.shape
2953
+ else: # bias
2954
+ c3 = data_torch.shape[0]
2955
+ assert c3 % 3 == 0
2956
+ c = c3 // 3
2957
+ wq = data_torch[:c]
2958
+ wk = data_torch[c: c * 2]
2959
+ wv = data_torch[c * 2:]
2960
+ return [
2961
+ (self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq),
2962
+ (self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk),
2963
+ (self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv),
2964
+ ]
2965
+ return [(self.map_tensor_name(name), data_torch)]
2966
+ return [] # skip other tensors
2235
2967
 
2236
2968
 
2237
- @Model.register("WavTokenizerDec")
2238
- class WavTokenizerDecModel(Model):
2969
+ @ModelBase.register("WavTokenizerDec")
2970
+ class WavTokenizerDecModel(TextModel):
2239
2971
  model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
2240
2972
 
2241
2973
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -2272,8 +3004,8 @@ class WavTokenizerDecModel(Model):
2272
3004
  self.gguf_writer.add_causal_attention(False)
2273
3005
 
2274
3006
 
2275
- @Model.register("Qwen2MoeForCausalLM")
2276
- class Qwen2MoeModel(Model):
3007
+ @ModelBase.register("Qwen2MoeForCausalLM")
3008
+ class Qwen2MoeModel(TextModel):
2277
3009
  model_arch = gguf.MODEL_ARCH.QWEN2MOE
2278
3010
 
2279
3011
  def set_gguf_parameters(self):
@@ -2286,6 +3018,13 @@ class Qwen2MoeModel(Model):
2286
3018
  if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
2287
3019
  self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
2288
3020
  logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
3021
+ # YaRN is not enabled by default
3022
+ # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
3023
+ rope_scaling = self.hparams.get("rope_scaling") or {}
3024
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
3025
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
3026
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
3027
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
2289
3028
 
2290
3029
  _experts: list[dict[str, Tensor]] | None = None
2291
3030
 
@@ -2335,18 +3074,18 @@ class Qwen2MoeModel(Model):
2335
3074
  raise ValueError(f"Unprocessed experts: {experts}")
2336
3075
 
2337
3076
 
2338
- @Model.register("Qwen3ForCausalLM")
3077
+ @ModelBase.register("Qwen3ForCausalLM")
2339
3078
  class Qwen3Model(Qwen2Model):
2340
3079
  model_arch = gguf.MODEL_ARCH.QWEN3
2341
3080
 
2342
3081
 
2343
- @Model.register("Qwen3MoeForCausalLM")
3082
+ @ModelBase.register("Qwen3MoeForCausalLM")
2344
3083
  class Qwen3MoeModel(Qwen2MoeModel):
2345
3084
  model_arch = gguf.MODEL_ARCH.QWEN3MOE
2346
3085
 
2347
3086
 
2348
- @Model.register("GPT2LMHeadModel")
2349
- class GPT2Model(Model):
3087
+ @ModelBase.register("GPT2LMHeadModel")
3088
+ class GPT2Model(TextModel):
2350
3089
  model_arch = gguf.MODEL_ARCH.GPT2
2351
3090
 
2352
3091
  def set_gguf_parameters(self):
@@ -2374,15 +3113,11 @@ class GPT2Model(Model):
2374
3113
 
2375
3114
  tensors.append((new_name, data_torch))
2376
3115
 
2377
- # note: GPT2 output is tied to (same as) wte in original model
2378
- if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
2379
- tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
2380
-
2381
3116
  return tensors
2382
3117
 
2383
3118
 
2384
- @Model.register("PhiForCausalLM")
2385
- class Phi2Model(Model):
3119
+ @ModelBase.register("PhiForCausalLM")
3120
+ class Phi2Model(TextModel):
2386
3121
  model_arch = gguf.MODEL_ARCH.PHI2
2387
3122
 
2388
3123
  def set_gguf_parameters(self):
@@ -2405,8 +3140,8 @@ class Phi2Model(Model):
2405
3140
  self.gguf_writer.add_add_bos_token(False)
2406
3141
 
2407
3142
 
2408
- @Model.register("Phi3ForCausalLM")
2409
- class Phi3MiniModel(Model):
3143
+ @ModelBase.register("Phi3ForCausalLM")
3144
+ class Phi3MiniModel(TextModel):
2410
3145
  model_arch = gguf.MODEL_ARCH.PHI3
2411
3146
 
2412
3147
  def set_vocab(self):
@@ -2522,7 +3257,8 @@ class Phi3MiniModel(Model):
2522
3257
  rms_eps = self.find_hparam(["rms_norm_eps"])
2523
3258
  max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
2524
3259
  orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
2525
- rope_dims = n_embd // n_head
3260
+ rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
3261
+ rope_dims = int(rot_pct * n_embd) // n_head
2526
3262
 
2527
3263
  self.gguf_writer.add_context_length(max_pos_embds)
2528
3264
  self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
@@ -2546,7 +3282,8 @@ class Phi3MiniModel(Model):
2546
3282
  n_head = self.find_hparam(["num_attention_heads", "n_head"])
2547
3283
  max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
2548
3284
  orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
2549
- rope_dims = n_embd // n_head
3285
+ rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
3286
+ rope_dims = int(rot_pct * n_embd) // n_head
2550
3287
 
2551
3288
  # write rope scaling for long context (128k) model
2552
3289
  rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2555,7 +3292,7 @@ class Phi3MiniModel(Model):
2555
3292
 
2556
3293
  scale = max_pos_embds / orig_max_pos_embds
2557
3294
 
2558
- rope_scaling_type = rope_scaling.get('type', '').lower()
3295
+ rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
2559
3296
  if len(rope_scaling_type) == 0:
2560
3297
  raise KeyError('Missing the required key rope_scaling.type')
2561
3298
 
@@ -2575,13 +3312,13 @@ class Phi3MiniModel(Model):
2575
3312
  raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
2576
3313
 
2577
3314
  if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
2578
- raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
3315
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
2579
3316
 
2580
3317
  yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
2581
3318
  yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
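
With the partial_rotary_factor change above, the rope factor arrays must each hold rope_dims / 2 entries; a worked example with assumed Phi-style hparams:

    # assumed hparams for illustration only
    n_embd, n_head, rot_pct = 3072, 32, 0.75
    rope_dims = int(rot_pct * n_embd) // n_head
    assert rope_dims == 72 and rope_dims / 2 == 36.0
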
2582
3319
 
2583
3320
 
2584
- @Model.register("PhiMoEForCausalLM")
3321
+ @ModelBase.register("PhiMoEForCausalLM")
2585
3322
  class PhiMoeModel(Phi3MiniModel):
2586
3323
  model_arch = gguf.MODEL_ARCH.PHIMOE
2587
3324
 
@@ -2638,8 +3375,8 @@ class PhiMoeModel(Phi3MiniModel):
2638
3375
  raise ValueError(f"Unprocessed experts: {experts}")
2639
3376
 
2640
3377
 
2641
- @Model.register("PlamoForCausalLM")
2642
- class PlamoModel(Model):
3378
+ @ModelBase.register("PlamoForCausalLM")
3379
+ class PlamoModel(TextModel):
2643
3380
  model_arch = gguf.MODEL_ARCH.PLAMO
2644
3381
 
2645
3382
  def set_vocab(self):
@@ -2686,8 +3423,8 @@ class PlamoModel(Model):
2686
3423
  return [(new_name, data_torch)]
2687
3424
 
2688
3425
 
2689
- @Model.register("CodeShellForCausalLM")
2690
- class CodeShellModel(Model):
3426
+ @ModelBase.register("CodeShellForCausalLM")
3427
+ class CodeShellModel(TextModel):
2691
3428
  model_arch = gguf.MODEL_ARCH.CODESHELL
2692
3429
 
2693
3430
  def set_gguf_parameters(self):
@@ -2705,25 +3442,30 @@ class CodeShellModel(Model):
2705
3442
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2706
3443
  self.gguf_writer.add_rope_scaling_factor(1.0)
2707
3444
 
3445
+ _has_tok_embd = False
3446
+
2708
3447
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2709
3448
  del bid # unused
2710
3449
 
2711
- new_name = self.map_tensor_name(name)
2712
-
2713
- tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
3450
+ output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
3451
+ tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
2714
3452
 
2715
- if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
2716
- assert self.tensor_names is not None
3453
+ new_name = self.map_tensor_name(name)
2717
3454
 
2718
- if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
2719
- # copy tok_embd.weight to output.weight
2720
- tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
3455
+ # assuming token_embd.weight is seen before output.weight
3456
+ if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
3457
+ # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
3458
+ if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
3459
+ logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
3460
+ self.tensor_names.remove("transformer.wte.weight")
3461
+ elif new_name == tok_embd_name:
3462
+ self._has_tok_embd = True
2721
3463
 
2722
- return tensors
3464
+ return [(new_name, data_torch)]
2723
3465
 
2724
3466
 
2725
- @Model.register("InternLM2ForCausalLM")
2726
- class InternLM2Model(Model):
3467
+ @ModelBase.register("InternLM2ForCausalLM")
3468
+ class InternLM2Model(TextModel):
2727
3469
  model_arch = gguf.MODEL_ARCH.INTERNLM2
2728
3470
 
2729
3471
  def set_vocab(self):
@@ -2862,10 +3604,10 @@ class InternLM2Model(Model):
2862
3604
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
2863
3605
  self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
2864
3606
  self.gguf_writer.add_file_type(self.ftype)
2865
- if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
2866
- if self.hparams["rope_scaling"].get("type") == "linear":
2867
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2868
- self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
3607
+ rope_scaling = self.hparams.get("rope_scaling") or {}
3608
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
3609
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
3610
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
2869
3611
 
2870
3612
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2871
3613
  num_heads = self.hparams["num_attention_heads"]
@@ -2875,6 +3617,11 @@ class InternLM2Model(Model):
2875
3617
  head_dim = n_embd // num_heads
2876
3618
  num_groups = num_heads // q_per_kv
2877
3619
 
3620
+ name = name.replace("language_model.", "") # InternVL
3621
+ if name.startswith("mlp") or name.startswith("vision_model"):
3622
+ # skip visual tensors
3623
+ return []
3624
+
2878
3625
  if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
2879
3626
  qkv = data_torch
2880
3627
 
@@ -2895,8 +3642,8 @@ class InternLM2Model(Model):
2895
3642
  return [(self.map_tensor_name(name), data_torch)]
2896
3643
 
2897
3644
 
2898
- @Model.register("InternLM3ForCausalLM")
2899
- class InternLM3Model(Model):
3645
+ @ModelBase.register("InternLM3ForCausalLM")
3646
+ class InternLM3Model(TextModel):
2900
3647
  model_arch = gguf.MODEL_ARCH.LLAMA
2901
3648
 
2902
3649
  def set_vocab(self):
@@ -2940,14 +3687,18 @@ class InternLM3Model(Model):
2940
3687
  rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
2941
3688
  self.gguf_writer.add_rope_dimension_count(rope_dim)
2942
3689
 
2943
- if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
2944
- if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
2945
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2946
- self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
3690
+ rope_scaling = self.hparams.get("rope_scaling") or {}
3691
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
3692
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
3693
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
2947
3694
 
2948
3695
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2949
3696
  n_head = self.hparams["num_attention_heads"]
2950
3697
  n_kv_head = self.hparams.get("num_key_value_heads")
3698
+ name = name.replace("language_model.", "") # InternVL
3699
+ if name.startswith("mlp") or name.startswith("vision_model"):
3700
+ # skip visual tensors
3701
+ return []
2951
3702
  if name.endswith(("q_proj.weight", "q_proj.bias")):
2952
3703
  data_torch = LlamaModel.permute(data_torch, n_head, n_head)
2953
3704
  if name.endswith(("k_proj.weight", "k_proj.bias")):
@@ -2955,40 +3706,27 @@ class InternLM3Model(Model):
2955
3706
  return [(self.map_tensor_name(name), data_torch)]
2956
3707
 
2957
3708
 
2958
- @Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
2959
- class BertModel(Model):
3709
+ @ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
3710
+ class BertModel(TextModel):
2960
3711
  model_arch = gguf.MODEL_ARCH.BERT
2961
3712
 
2962
3713
  def __init__(self, *args, **kwargs):
2963
3714
  super().__init__(*args, **kwargs)
2964
3715
  self.vocab_size = None
2965
3716
 
3717
+ if cls_out_labels := self.hparams.get("id2label"):
3718
+ if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0":
3719
+ # Remove dummy labels added by AutoConfig
3720
+ cls_out_labels = None
3721
+ self.cls_out_labels = cls_out_labels
3722
+
2966
3723
  def set_gguf_parameters(self):
2967
3724
  super().set_gguf_parameters()
2968
3725
  self.gguf_writer.add_causal_attention(False)
3726
+ self._try_set_pooling_type()
2969
3727
 
2970
- # get pooling path
2971
- pooling_path = None
2972
- module_path = self.dir_model / "modules.json"
2973
- if module_path.is_file():
2974
- with open(module_path, encoding="utf-8") as f:
2975
- modules = json.load(f)
2976
- for mod in modules:
2977
- if mod["type"] == "sentence_transformers.models.Pooling":
2978
- pooling_path = mod["path"]
2979
- break
2980
-
2981
- # get pooling type
2982
- if pooling_path is not None:
2983
- with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
2984
- pooling = json.load(f)
2985
- if pooling["pooling_mode_mean_tokens"]:
2986
- pooling_type = gguf.PoolingType.MEAN
2987
- elif pooling["pooling_mode_cls_token"]:
2988
- pooling_type = gguf.PoolingType.CLS
2989
- else:
2990
- raise NotImplementedError("Only MEAN and CLS pooling types supported")
2991
- self.gguf_writer.add_pooling_type(pooling_type)
3728
+ if self.cls_out_labels:
3729
+ self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
2992
3730
 
2993
3731
  def set_vocab(self):
2994
3732
  tokens, toktypes, tokpre = self.get_vocab_base()
@@ -3040,16 +3778,17 @@ class BertModel(Model):
3040
3778
  if name.startswith("cls.seq_relationship"):
3041
3779
  return []
3042
3780
 
3043
- return [(self.map_tensor_name(name), data_torch)]
3044
-
3781
+ if self.cls_out_labels:
3782
+ # For BertForSequenceClassification (direct projection layer)
3783
+ if name == "classifier.weight":
3784
+ name = "classifier.out_proj.weight"
3045
3785
 
3046
- @Model.register("RobertaModel")
3047
- class RobertaModel(BertModel):
3048
- model_arch = gguf.MODEL_ARCH.BERT
3786
+ if name == "classifier.bias":
3787
+ name = "classifier.out_proj.bias"
3049
3788
 
3050
- def __init__(self, *args, **kwargs):
3051
- super().__init__(*args, **kwargs)
3789
+ return [(self.map_tensor_name(name), data_torch)]
3052
3790
 
3791
+ def _xlmroberta_tokenizer_init(self) -> None:
3053
3792
  # we need the pad_token_id to know how to chop down position_embd matrix
3054
3793
  if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
3055
3794
  self._position_offset = 1 + pad_token_id
@@ -3058,68 +3797,160 @@ class RobertaModel(BertModel):
3058
3797
  else:
3059
3798
  self._position_offset = None
3060
3799
 
3061
- def set_vocab(self):
3062
- """Support BPE tokenizers for roberta models"""
3063
- bpe_tok_path = self.dir_model / "tokenizer.json"
3064
- if bpe_tok_path.exists():
3065
- self._set_vocab_gpt2()
3066
- self.gguf_writer.add_add_bos_token(True)
3067
- self.gguf_writer.add_add_eos_token(True)
3800
+ def _xlmroberta_set_vocab(self) -> None:
3801
+ # to avoid TypeError: Descriptors cannot be created directly
3802
+ # exception when importing sentencepiece_model_pb2
3803
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
3804
+ from sentencepiece import SentencePieceProcessor
3805
+ from sentencepiece import sentencepiece_model_pb2 as model
3068
3806
 
3069
- # we need this to validate the size of the token_type embeddings
3070
- # though currently we are passing all zeros to the token_type embeddings
3071
- # "Sequence A" or "Sequence B"
3072
- self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
3807
+ tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
3073
3808
 
3074
- else:
3075
- return super().set_vocab()
3809
+ tokenizer_json = {}
3810
+ tokenizer_config_json = {}
3811
+ if not tokenizer_path.is_file():
3812
+ tokenizer_path = self.dir_model / 'tokenizer.json'
3813
+ tokenizer_config_path = self.dir_model / 'tokenizer_config.json'
3076
3814
 
3077
- def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3078
- # if name starts with "roberta.", remove the prefix
3079
- # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
3080
- if name.startswith("roberta."):
3081
- name = name[8:]
3815
+ if not tokenizer_path.is_file():
3816
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
3082
3817
 
3083
- # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
3084
- if name == "embeddings.position_embeddings.weight":
3085
- if self._position_offset is not None:
3086
- data_torch = data_torch[self._position_offset:,:]
3818
+ from base64 import b64decode
3819
+ from transformers import AutoTokenizer
3820
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
3087
3821
 
3088
- return super().modify_tensors(data_torch, name, bid)
3822
+ with open(tokenizer_path, "r", encoding="utf-8") as fp:
3823
+ tokenizer_json = json.load(fp)
3089
3824
 
3825
+ if tokenizer_config_path.is_file():
3826
+ with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
3827
+ tokenizer_config_json = json.load(fp)
3090
3828
 
3091
- @Model.register("NomicBertModel")
3092
- class NomicBertModel(BertModel):
3093
- model_arch = gguf.MODEL_ARCH.NOMIC_BERT
3829
+ add_prefix = tokenizer.add_prefix_space
3830
+ remove_whitespaces = tokenizer.clean_up_tokenization_spaces
3831
+ precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
3094
3832
 
3095
- def __init__(self, *args, **kwargs):
3096
- super().__init__(*args, **kwargs)
3833
+ vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
3834
+ else:
3835
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
3836
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
3837
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
3097
3838
 
3098
- # the HF config claims n_ctx=8192, but it uses RoPE scaling
3099
- self.hparams["n_ctx"] = 2048
3839
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
3840
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
3841
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
3100
3842
 
3101
- # SwigLU activation
3102
- assert self.hparams["activation_function"] == "swiglu"
3103
- # this doesn't do anything in the HF version
3104
- assert self.hparams["causal"] is False
3105
- # no bias tensors
3106
- assert self.hparams["qkv_proj_bias"] is False
3107
- assert self.hparams["mlp_fc1_bias"] is False
3108
- assert self.hparams["mlp_fc2_bias"] is False
3109
- # norm at end of layer
3110
- assert self.hparams["prenorm"] is False
3111
- # standard RoPE
3112
- assert self.hparams["rotary_emb_fraction"] == 1.0
3113
- assert self.hparams["rotary_emb_interleaved"] is False
3114
- assert self.hparams["rotary_emb_scale_base"] is None
3843
+ tokenizer = SentencePieceProcessor()
3844
+ tokenizer.LoadFromFile(str(tokenizer_path))
3845
+
3846
+ vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
3847
+
3848
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
3849
+ scores: list[float] = [-10000.0] * vocab_size
3850
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
3851
+
3852
+ if isinstance(tokenizer, SentencePieceProcessor):
3853
+ for token_id in range(tokenizer.vocab_size()):
3854
+ piece = tokenizer.IdToPiece(token_id)
3855
+ text = piece.encode("utf-8")
3856
+ score = tokenizer.GetScore(token_id)
3857
+
3858
+ toktype = SentencePieceTokenTypes.NORMAL
3859
+ if tokenizer.IsUnknown(token_id):
3860
+ toktype = SentencePieceTokenTypes.UNKNOWN
3861
+ elif tokenizer.IsControl(token_id):
3862
+ toktype = SentencePieceTokenTypes.CONTROL
3863
+ elif tokenizer.IsUnused(token_id):
3864
+ toktype = SentencePieceTokenTypes.UNUSED
3865
+ elif tokenizer.IsByte(token_id):
3866
+ toktype = SentencePieceTokenTypes.BYTE
3867
+
3868
+ tokens[token_id] = text
3869
+ scores[token_id] = score
3870
+ toktypes[token_id] = toktype
3871
+ else:
3872
+ added_vocab = tokenizer.get_added_vocab()
3873
+ unk_token = tokenizer_config_json.get("unk_token")
3874
+ unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
3875
+
3876
+ for token_id in range(tokenizer.vocab_size):
3877
+ piece = tokenizer._convert_id_to_token(token_id)
3878
+ if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
3879
+ text = piece.encode("utf-8")
3880
+ score = tokenizer_json["model"]["vocab"][token_id][1]
3881
+
3882
+ toktype = SentencePieceTokenTypes.NORMAL
3883
+ if token_id == unk_token_id:
3884
+ toktype = SentencePieceTokenTypes.UNKNOWN
3885
+ elif token_id in tokenizer.all_special_ids:
3886
+ toktype = SentencePieceTokenTypes.CONTROL
3887
+ elif token_id in added_vocab.values():
3888
+ toktype = SentencePieceTokenTypes.USER_DEFINED
3889
+ # No reliable way to detect this, but jina doesn't have any
3890
+ # elif tokenizer.IsByte(token_id):
3891
+ # toktype = SentencePieceTokenTypes.BYTE
3892
+
3893
+ tokens[token_id] = text
3894
+ scores[token_id] = score
3895
+ toktypes[token_id] = toktype
3896
+
3897
+ if isinstance(tokenizer, SentencePieceProcessor):
3898
+ # realign tokens (see HF tokenizer code)
3899
+ tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
3900
+ scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
3901
+ toktypes = [
3902
+ SentencePieceTokenTypes.CONTROL,
3903
+ SentencePieceTokenTypes.CONTROL,
3904
+ SentencePieceTokenTypes.CONTROL,
3905
+ SentencePieceTokenTypes.UNKNOWN,
3906
+ ] + toktypes[3:-1]
3907
+
3908
+ if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
3909
+ # Add mask token missing from sentencepiece.bpe.model
3910
+ tokens[250001] = b'<mask>'
3911
+ scores[250001] = 0.0
3912
+ toktypes[250001] = SentencePieceTokenTypes.CONTROL
3913
+
3914
+ self.gguf_writer.add_tokenizer_model("t5")
3915
+ self.gguf_writer.add_tokenizer_pre("default")
3916
+ self.gguf_writer.add_token_list(tokens)
3917
+ self.gguf_writer.add_token_scores(scores)
3918
+ self.gguf_writer.add_token_types(toktypes)
3919
+ self.gguf_writer.add_add_space_prefix(add_prefix)
3920
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
3921
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
3922
+ if precompiled_charsmap:
3923
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
3924
+
3925
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3926
+ special_vocab.add_to_gguf(self.gguf_writer)
3927
+
3928
+ self.gguf_writer.add_add_bos_token(True)
3929
+ self.gguf_writer.add_add_eos_token(True)
3930
+
3931
+
3932
+ @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
3933
+ class DistilBertModel(BertModel):
3934
+ model_arch = gguf.MODEL_ARCH.BERT
3115
3935
 
3116
3936
  def set_gguf_parameters(self):
3937
+ self.gguf_writer.add_layer_norm_eps(1e-12)
3938
+ logger.info("gguf: layer norm epsilon = 1e-12")
3117
3939
  super().set_gguf_parameters()
3118
- self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
3119
3940
 
3941
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3942
+ if name.startswith("distilbert."):
3943
+ name = name[11:]
3120
3944
 
3121
- @Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
3122
- class XLMRobertaModel(BertModel):
3945
+ # These layers act as MLM head, so we don't need them
3946
+ if name.startswith("vocab_"):
3947
+ return []
3948
+
3949
+ return super().modify_tensors(data_torch, name, bid)
3950
+
3951
+
3952
+ @ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
3953
+ class RobertaModel(BertModel):
3123
3954
  model_arch = gguf.MODEL_ARCH.BERT
3124
3955
 
3125
3956
  def __init__(self, *args, **kwargs):
@@ -3134,86 +3965,127 @@ class XLMRobertaModel(BertModel):
3134
3965
  self._position_offset = None
3135
3966
 
3136
3967
  def set_vocab(self):
3137
- # to avoid TypeError: Descriptors cannot be created directly
3138
- # exception when importing sentencepiece_model_pb2
3139
- os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
3140
- from sentencepiece import SentencePieceProcessor
3141
- from sentencepiece import sentencepiece_model_pb2 as model
3968
+ """Support BPE tokenizers for roberta models"""
3969
+ bpe_tok_path = self.dir_model / "tokenizer.json"
3970
+ if bpe_tok_path.exists():
3971
+ self._set_vocab_gpt2()
3972
+ self.gguf_writer.add_add_bos_token(True)
3973
+ self.gguf_writer.add_add_eos_token(True)
3142
3974
 
3143
- tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
3144
- if not tokenizer_path.is_file():
3145
- raise FileNotFoundError(f"File not found: {tokenizer_path}")
3975
+ # we need this to validate the size of the token_type embeddings
3976
+ # though currently we are passing all zeros to the token_type embeddings
3977
+ # "Sequence A" or "Sequence B"
3978
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
3146
3979
 
3147
- sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
3148
- sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
3149
- assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
3980
+ else:
3981
+ return super().set_vocab()
3150
3982
 
3151
- add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
3152
- remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
3153
- precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
3983
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3984
+ # if name starts with "roberta.", remove the prefix
3985
+ # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
3986
+ if name.startswith("roberta."):
3987
+ name = name[8:]
3154
3988
 
3155
- tokenizer = SentencePieceProcessor()
3156
- tokenizer.LoadFromFile(str(tokenizer_path))
3989
+ # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
3990
+ if name == "embeddings.position_embeddings.weight":
3991
+ if self._position_offset is not None:
3992
+ data_torch = data_torch[self._position_offset:,:]
3157
3993
 
3158
- vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
3994
+ return super().modify_tensors(data_torch, name, bid)
3159
3995
 
3160
- tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
3161
- scores: list[float] = [-10000.0] * vocab_size
3162
- toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
3163
3996
 
3164
- for token_id in range(tokenizer.vocab_size()):
3165
- piece = tokenizer.IdToPiece(token_id)
3166
- text = piece.encode("utf-8")
3167
- score = tokenizer.GetScore(token_id)
3997
+ @ModelBase.register("NomicBertModel")
3998
+ class NomicBertModel(BertModel):
3999
+ model_arch = gguf.MODEL_ARCH.BERT
3168
4000
 
3169
- toktype = SentencePieceTokenTypes.NORMAL
3170
- if tokenizer.IsUnknown(token_id):
3171
- toktype = SentencePieceTokenTypes.UNKNOWN
3172
- elif tokenizer.IsControl(token_id):
3173
- toktype = SentencePieceTokenTypes.CONTROL
3174
- elif tokenizer.IsUnused(token_id):
3175
- toktype = SentencePieceTokenTypes.UNUSED
3176
- elif tokenizer.IsByte(token_id):
3177
- toktype = SentencePieceTokenTypes.BYTE
4001
+ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
4002
+ hparams = kwargs.pop("hparams", None)
4003
+ if hparams is None:
4004
+ hparams = ModelBase.load_hparams(dir_model)
3178
4005
 
3179
- tokens[token_id] = text
3180
- scores[token_id] = score
3181
- toktypes[token_id] = toktype
4006
+ self.is_moe = bool(hparams.get("moe_every_n_layers"))
4007
+ self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
3182
4008
 
3183
- if vocab_size > len(tokens):
3184
- pad_count = vocab_size - len(tokens)
3185
- logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
3186
- for i in range(1, pad_count + 1):
3187
- tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
3188
- scores.append(-1000.0)
3189
- toktypes.append(SentencePieceTokenTypes.UNUSED)
4009
+ super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
3190
4010
 
3191
- # realign tokens (see HF tokenizer code)
3192
- tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
3193
- scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
3194
- toktypes = [
3195
- SentencePieceTokenTypes.CONTROL,
3196
- SentencePieceTokenTypes.CONTROL,
3197
- SentencePieceTokenTypes.CONTROL,
3198
- SentencePieceTokenTypes.UNKNOWN,
3199
- ] + toktypes[3:-1]
4011
+ self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
4012
+ if self._tokenizer_is_xlmroberta:
4013
+ self._xlmroberta_tokenizer_init()
3200
4014
 
3201
- self.gguf_writer.add_tokenizer_model("t5")
3202
- self.gguf_writer.add_tokenizer_pre("default")
3203
- self.gguf_writer.add_token_list(tokens)
3204
- self.gguf_writer.add_token_scores(scores)
3205
- self.gguf_writer.add_token_types(toktypes)
3206
- self.gguf_writer.add_add_space_prefix(add_prefix)
3207
- self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
3208
- self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
3209
- if precompiled_charsmap:
3210
- self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
4015
+ npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048)
4016
+ if npos == 8192 and mtp == 2048:
4017
+ self.hparams["n_positions"] = 2048 # nomic-embed-text v1 and v1.5 are trained for 2048 tokens.
4018
+ elif npos == 2048 and mtp == 2048:
4019
+ self.hparams["n_positions"] = 512 # nomic-embed-text-v2-moe is trained for 512 tokens.
4020
+ else:
4021
+ raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}")
3211
4022
 
3212
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3213
- special_vocab.add_to_gguf(self.gguf_writer)
4023
+ assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu"
3214
4024
 
3215
- self.gguf_writer.add_add_bos_token(True)
3216
- self.gguf_writer.add_add_eos_token(True)
4025
+ # this doesn't do anything in the HF version
4026
+ assert self.hparams["causal"] is False
4027
+ # no bias tensors unless MoE
4028
+ assert self.hparams["qkv_proj_bias"] == self.is_moe
4029
+ assert self.hparams["mlp_fc1_bias"] == self.is_moe
4030
+ assert self.hparams["mlp_fc2_bias"] == self.is_moe
4031
+
4032
+ # norm at end of layer
4033
+ assert self.hparams["prenorm"] is False
4034
+ # standard RoPE
4035
+ assert self.hparams["rotary_emb_fraction"] == 1.0
4036
+ assert self.hparams["rotary_emb_interleaved"] is False
4037
+ assert self.hparams["rotary_emb_scale_base"] is None
4038
+
4039
+ def set_vocab(self) -> None:
4040
+ if self._tokenizer_is_xlmroberta:
4041
+ return self._xlmroberta_set_vocab()
4042
+ return super().set_vocab()
4043
+
4044
+ def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
4045
+ # If the tensor is an experts bias tensor, skip it by returning an empty list.
4046
+ if "mlp.experts.bias" in name:
4047
+ return [] # Explicitly return an empty list.
4048
+
4049
+ if "mlp.experts.mlp.w1" in name:
4050
+ data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
4051
+ name += ".weight"
4052
+
4053
+ if "mlp.experts.mlp.w2" in name:
4054
+ data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
4055
+ data_torch = data_torch.transpose(1, 2)
4056
+ name += ".weight"
4057
+
4058
+ return [(self.map_tensor_name(name), data_torch)]
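
A minimal sketch of the MoE expert-weight reshape above, with made-up sizes: both w1 and w2 are viewed as (num_experts, n_inner, n_embd), and w2 additionally swaps its last two dimensions:

    import torch

    num_experts, n_inner, n_embd = 2, 6, 4             # assumed toy sizes
    w1 = torch.randn(num_experts * n_inner, n_embd).view(num_experts, n_inner, n_embd)
    w2 = torch.randn(num_experts * n_inner, n_embd).view(num_experts, n_inner, n_embd).transpose(1, 2)
    assert w1.shape == (2, 6, 4) and w2.shape == (2, 4, 6)
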
4059
+
4060
+ def set_gguf_parameters(self):
4061
+ super().set_gguf_parameters()
4062
+ self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
4063
+ if self.is_moe:
4064
+ self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
4065
+ self.gguf_writer.add_expert_count(self.hparams["num_experts"])
4066
+ self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
4067
+
4068
+ def _is_tokenizer_xlmroberta(self) -> bool:
4069
+ with open(self.dir_model / "tokenizer.json") as f:
4070
+ tokenizer_json = json.load(f)
4071
+ toktyp = tokenizer_json["model"]["type"]
4072
+ if toktyp == "Unigram":
4073
+ return True
4074
+ if toktyp == "WordPiece":
4075
+ return False
4076
+ raise ValueError(f"unknown tokenizer: {toktyp}")
4077
+
4078
+
4079
+ @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
4080
+ class XLMRobertaModel(BertModel):
4081
+ model_arch = gguf.MODEL_ARCH.BERT
4082
+
4083
+ def __init__(self, *args, **kwargs):
4084
+ super().__init__(*args, **kwargs)
4085
+ self._xlmroberta_tokenizer_init()
4086
+
4087
+ def set_vocab(self):
4088
+ self._xlmroberta_set_vocab()
3217
4089
 
3218
4090
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3219
4091
  # if name starts with "roberta.", remove the prefix
@@ -3229,8 +4101,8 @@ class XLMRobertaModel(BertModel):
3229
4101
  return super().modify_tensors(data_torch, name, bid)
3230
4102
 
3231
4103
 
3232
- @Model.register("GemmaForCausalLM")
3233
- class GemmaModel(Model):
4104
+ @ModelBase.register("GemmaForCausalLM")
4105
+ class GemmaModel(TextModel):
3234
4106
  model_arch = gguf.MODEL_ARCH.GEMMA
3235
4107
 
3236
4108
  def set_vocab(self):
@@ -3280,8 +4152,8 @@ class GemmaModel(Model):
3280
4152
  return [(self.map_tensor_name(name), data_torch)]
3281
4153
 
3282
4154
 
3283
- @Model.register("Gemma2ForCausalLM")
3284
- class Gemma2Model(Model):
4155
+ @ModelBase.register("Gemma2ForCausalLM")
4156
+ class Gemma2Model(TextModel):
3285
4157
  model_arch = gguf.MODEL_ARCH.GEMMA2
3286
4158
 
3287
4159
  def set_vocab(self):
@@ -3327,48 +4199,128 @@ class Gemma2Model(Model):
3327
4199
  return [(self.map_tensor_name(name), data_torch)]
3328
4200
 
3329
4201
 
3330
- @Model.register("Starcoder2ForCausalLM")
3331
- class StarCoder2Model(Model):
3332
- model_arch = gguf.MODEL_ARCH.STARCODER2
4202
+ @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
4203
+ class Gemma3Model(TextModel):
4204
+ model_arch = gguf.MODEL_ARCH.GEMMA3
3333
4205
 
4206
+ def set_vocab(self):
4207
+ self._set_vocab_sentencepiece()
3334
4208
 
3335
- @Model.register("Rwkv6ForCausalLM")
3336
- class Rwkv6Model(Model):
3337
- model_arch = gguf.MODEL_ARCH.RWKV6
4209
+ self.gguf_writer.add_add_space_prefix(False)
3338
4210
 
3339
- def set_vocab(self):
3340
- assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
3341
- vocab_size = self.hparams.get("vocab_size", 65536)
4211
+ def set_gguf_parameters(self):
4212
+ hparams = self.hparams
4213
+ block_count = hparams["num_hidden_layers"]
3342
4214
 
3343
- tokens: list[bytes] = ['<s>'.encode("utf-8")]
3344
- toktypes: list[int] = [gguf.TokenType.CONTROL]
4215
+ # some default values are not specified in the hparams
4216
+ self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
4217
+ self.gguf_writer.add_embedding_length(hparams["hidden_size"])
4218
+ self.gguf_writer.add_block_count(block_count)
4219
+ self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
4220
+ self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
4221
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
4222
+ self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
4223
+ self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
4224
+ self.gguf_writer.add_file_type(self.ftype)
4225
+ self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
4226
+ # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
4227
+ assert hparams.get("attn_logit_softcapping") is None
4228
+ assert hparams.get("final_logit_softcapping") is None
4229
+ self.gguf_writer.add_sliding_window(hparams["sliding_window"])
4230
+ self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
4231
+ if hparams.get("rope_scaling") is not None:
4232
+ assert hparams["rope_scaling"]["rope_type"] == "linear"
4233
+ # important: this rope_scaling is only applied to global layers and is not used by the 1B model
4234
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
4235
+ self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
3345
4236
 
3346
- with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
3347
- lines = f.readlines()
3348
- for line in lines:
3349
- parts = line.split(' ')
3350
- assert len(parts) >= 3
3351
- token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
3352
- token = token.encode("utf-8") if isinstance(token, str) else token
3353
- assert isinstance(token, bytes)
3354
- assert len(token) == token_len
3355
- token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
3356
- tokens.append(token_text.encode("utf-8"))
3357
- toktypes.append(gguf.TokenType.NORMAL)
3358
- remainder = vocab_size - len(tokens)
3359
- assert remainder >= 0
3360
- for i in range(len(tokens), vocab_size):
3361
- tokens.append(f"[PAD{i}]".encode("utf-8"))
3362
- toktypes.append(gguf.TokenType.UNUSED)
4237
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4238
+ del bid # unused
3363
4239
 
3364
- self.gguf_writer.add_tokenizer_model("rwkv")
3365
- self.gguf_writer.add_token_list(tokens)
3366
- self.gguf_writer.add_token_types(toktypes)
3367
- special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
3368
- special_vocab.chat_template = "rwkv-world"
3369
- # hack: Add '\n\n' as the EOT token to make it chat normally
3370
- special_vocab._set_special_token("eot", 261)
3371
- special_vocab.add_to_gguf(self.gguf_writer)
4240
+ if name.startswith("language_model."):
4241
+ name = name.replace("language_model.", "")
4242
+
4243
+ elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
4244
+ or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
4245
+ return [] # skip vision tensors
4246
+
4247
+ # remove OOV (out-of-vocabulary) rows in token_embd
4248
+ if "embed_tokens.weight" in name:
4249
+ vocab = self._create_vocab_sentencepiece()
4250
+ tokens = vocab[0]
4251
+ data_torch = data_torch[:len(tokens)]
4252
+
4253
+ # ref code in Gemma3RMSNorm
4254
+ # output = output * (1.0 + self.weight.float())
4255
+ if name.endswith("norm.weight"):
4256
+ data_torch = data_torch + 1
4257
+
4258
+ return [(self.map_tensor_name(name), data_torch)]
4259
+
4260
+
4261
+ @ModelBase.register("Gemma3ForConditionalGeneration")
4262
+ class Gemma3VisionModel(MmprojModel):
4263
+ def set_gguf_parameters(self):
4264
+ super().set_gguf_parameters()
4265
+ hparams = self.hparams
4266
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
4267
+ # default values below are taken from HF transformers code
4268
+ self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
4269
+ self.gguf_writer.add_vision_use_gelu(True)
4270
+ # calculate proj_scale_factor (used by tinygemma3 test model)
4271
+ image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
4272
+ n_per_side = int(image_seq_length ** 0.5)
4273
+ image_size = self.hparams["image_size"]
4274
+ patch_size = self.hparams["patch_size"]
4275
+ proj_scale_factor = (image_size // patch_size) // n_per_side
4276
+ if proj_scale_factor > 0 and proj_scale_factor != 4:
4277
+ # we only need to write this if it's not the default value
4278
+ # in this case, we are converting a test model
4279
+ self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
4280
+
4281
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
4282
+ del bid, new_name, n_dims # unused
4283
+ # related to https://github.com/ggml-org/llama.cpp/issues/13025
4284
+ if "input_projection" in name:
4285
+ return gguf.GGMLQuantizationType.F16
4286
+ if ".embeddings." in name:
4287
+ return gguf.GGMLQuantizationType.F32
4288
+ return False
4289
+
4290
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4291
+ del bid # unused
4292
+
4293
+ if "vision_model.head." in name:
4294
+ return [] # skip redundant tensors for tinygemma3
4295
+
4296
+ if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
4297
+ or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
4298
+ # process vision tensors
4299
+ name = name.replace("_weight", ".weight")
4300
+
4301
+ # correct norm value; only this "soft_emb_norm" needs to be corrected as it's part of the Gemma projector
4302
+ # the other norm values are part of the SigLIP model, and they are already correct
4303
+ # ref code: Gemma3RMSNorm
4304
+ if "soft_emb_norm.weight" in name:
4305
+ logger.info(f"Correcting norm value for '{name}'")
4306
+ data_torch = data_torch + 1
4307
+
4308
+ return [(self.map_tensor_name(name), data_torch)]
4309
+
4310
+ return [] # skip other tensors
4311
+
4312
+
4313
+ @ModelBase.register("Starcoder2ForCausalLM")
4314
+ class StarCoder2Model(TextModel):
4315
+ model_arch = gguf.MODEL_ARCH.STARCODER2
4316
+
4317
+
4318
+ @ModelBase.register("Rwkv6ForCausalLM")
4319
+ class Rwkv6Model(TextModel):
4320
+ model_arch = gguf.MODEL_ARCH.RWKV6
4321
+
4322
+ def set_vocab(self):
4323
+ self._set_vocab_rwkv_world()
3372
4324
 
3373
4325
  def set_gguf_parameters(self):
3374
4326
  block_count = self.hparams["num_hidden_layers"]
@@ -3429,16 +4381,189 @@ class Rwkv6Model(Model):
3429
4381
  self.lerp_weights[bid] = {new_name: data_torch}
3430
4382
  if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
3431
4383
  new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
3432
- data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
4384
+ data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
4385
+ yield (new_name, data)
4386
+ return
4387
+
4388
+ yield (new_name, data_torch)
4389
+
4390
+
4391
+ @ModelBase.register("RWKV6Qwen2ForCausalLM")
4392
+ class RWKV6Qwen2Model(Rwkv6Model):
4393
+ model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
4394
+
4395
+ def set_vocab(self):
4396
+ try:
4397
+ self._set_vocab_sentencepiece()
4398
+ except FileNotFoundError:
4399
+ self._set_vocab_gpt2()
4400
+
4401
+ def set_gguf_parameters(self):
4402
+ block_count = self.hparams["num_hidden_layers"]
4403
+ num_attention_heads = self.hparams["num_attention_heads"]
4404
+ num_key_value_heads = self.hparams["num_key_value_heads"]
4405
+ hidden_size = self.hparams["hidden_size"]
4406
+ head_size = hidden_size // num_attention_heads
4407
+ rms_norm_eps = self.hparams["rms_norm_eps"]
4408
+ intermediate_size = self.hparams["intermediate_size"]
4409
+ time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
4410
+ time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)
4411
+
4412
+ # RWKV isn't context limited
4413
+ self.gguf_writer.add_context_length(1048576)
4414
+ self.gguf_writer.add_embedding_length(hidden_size)
4415
+ self.gguf_writer.add_block_count(block_count)
4416
+ self.gguf_writer.add_wkv_head_size(head_size)
4417
+ self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
4418
+ self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
4419
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
4420
+ self.gguf_writer.add_file_type(self.ftype)
4421
+
4422
+ # special parameters for time_mixing in RWKV6QWEN2
4423
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
4424
+ self.gguf_writer.add_token_shift_count(1)
4425
+ # RWKV6QWEN2 uses grouped key/value like GQA
4426
+ self.gguf_writer.add_head_count_kv(num_key_value_heads)
4427
+
4428
+ # required by llama.cpp, unused
4429
+ self.gguf_writer.add_head_count(0)
4430
+
4431
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4432
+ for new_name, data in super().modify_tensors(data_torch, name, bid):
4433
+ if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
4434
+ data = data.view(5, -1, data.shape[-1])
4435
+ # rwkv6qwen2 uses a different order (rkvwg) instead of the original wkvrg
4436
+ # permute them here to avoid code changes
4437
+ data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
4438
+ if "w2" in new_name:
4439
+ data = data.view(5, -1, data.shape[-1])
4440
+ yield (new_name, data)
4441
+ continue
4442
+ yield (new_name, data)
4443
+
4444
+
4445
+ @ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM")
4446
+ class Rwkv7Model(TextModel):
4447
+ model_arch = gguf.MODEL_ARCH.RWKV7
4448
+
4449
+ def set_vocab(self):
4450
+ self._set_vocab_rwkv_world()
4451
+
4452
+ def calc_lora_rank(self, hidden_size, exponent, multiplier):
4453
+ return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32
4454
+
4455
+ def set_gguf_parameters(self):
4456
+ block_count = self.hparams["num_hidden_layers"]
4457
+ try:
4458
+ head_size = self.hparams["head_size"]
4459
+ layer_norm_eps = self.hparams["layer_norm_epsilon"]
4460
+ except KeyError:
4461
+ head_size = self.hparams["head_dim"]
4462
+ layer_norm_eps = self.hparams["norm_eps"]
4463
+ hidden_size = self.hparams["hidden_size"]
4464
+ intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4)
4465
+
4466
+ # ICLR: In-Context-Learning-Rate
4467
+ try:
4468
+ lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
4469
+ lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
4470
+ lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
4471
+ lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
4472
+ except KeyError:
4473
+ lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
4474
+ lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
4475
+ lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
4476
+ lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
4477
+
4478
+ # RWKV isn't context limited
4479
+ self.gguf_writer.add_context_length(1048576)
4480
+ self.gguf_writer.add_embedding_length(hidden_size)
4481
+ self.gguf_writer.add_block_count(block_count)
4482
+ self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
4483
+ self.gguf_writer.add_wkv_head_size(head_size)
4484
+ self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
4485
+ self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
4486
+ self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
4487
+ self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
4488
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
4489
+ self.gguf_writer.add_file_type(self.ftype)
4490
+
4491
+ # required by llama.cpp, unused
4492
+ self.gguf_writer.add_head_count(0)
4493
+
4494
+ lerp_weights: dict[int, dict[str, Tensor]] = {}
4495
+ lora_needs_transpose: bool = True
4496
+
4497
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4498
+ # unify tensor names here to make life easier
4499
+ name = name.replace("blocks", "layers").replace("ffn", "feed_forward")
4500
+ name = name.replace("self_attn", "attention").replace("attn", "attention")
4501
+ name = name.replace("time_mixer.", "")
4502
+ # lora layer names in fla-hub's impl
4503
+ if "_lora.lora" in name:
4504
+ self.lora_needs_transpose = False
4505
+ name = name.replace("_lora.lora.0.weight", "1.weight")
4506
+ name = name.replace("_lora.lora.2.weight", "2.weight")
4507
+ name = name.replace("_lora.lora.2.bias", "0.weight")
4508
+
4509
+ name = name.replace("feed_forward_norm", "ln2")
4510
+ name = name.replace("g_norm", "ln_x")
4511
+
4512
+ if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0:
4513
+ # some models have dummy v0/v1/v2 on first layer while others don't
4514
+ # ignore them all since they are not used
4515
+ return
4516
+
4517
+ wkv_has_gate = self.hparams.get("wkv_has_gate", True)
4518
+ lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"]
4519
+
4520
+ if bid is not None and "attention.x_" in name:
4521
+ if "attention.x_x" in name:
4522
+ # already concatenated
4523
+ new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
4524
+ data = data_torch.reshape(len(lerp_list), 1, 1, -1)
3433
4525
  yield (new_name, data)
4526
+ else:
4527
+ try:
4528
+ self.lerp_weights[bid][name] = data_torch
4529
+ except KeyError:
4530
+ self.lerp_weights[bid] = {name: data_torch}
4531
+ if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list):
4532
+ new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
4533
+ data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0)
4534
+ yield (new_name, data)
3434
4535
  return
4536
+ else:
4537
+ data_torch = data_torch.squeeze()
4538
+ new_name = self.map_tensor_name(name)
3435
4539
 
3436
- yield (new_name, data_torch)
4540
+ if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
4541
+ new_name += ".weight"
3437
4542
 
4543
+ if self.lora_needs_transpose and any(
4544
+ new_name.endswith(t) for t in [
4545
+ "time_mix_w1.weight", "time_mix_w2.weight",
4546
+ "time_mix_a1.weight", "time_mix_a2.weight",
4547
+ "time_mix_v1.weight", "time_mix_v2.weight",
4548
+ "time_mix_g1.weight", "time_mix_g2.weight",
4549
+ ]
4550
+ ):
4551
+ data_torch = data_torch.transpose(0, 1)
3438
4552
 
3439
- @Model.register("RWKV6Qwen2ForCausalLM")
3440
- class RWKV6Qwen2Model(Rwkv6Model):
3441
- model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
4553
+ if 'r_k' in new_name:
4554
+ data_torch = data_torch.flatten()
4555
+
4556
+ if bid == 0 and "time_mix_a" in new_name:
4557
+ # dummy v0/v1/v2 on first layer
4558
+ # easiest way to make llama.cpp happy
4559
+ yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)
4560
+
4561
+ yield (new_name, data_torch)
4562
+
4563
+
4564
+ @ModelBase.register("RwkvHybridForCausalLM")
4565
+ class ARwkv7Model(Rwkv7Model):
4566
+ model_arch = gguf.MODEL_ARCH.ARWKV7
3442
4567
 
3443
4568
  def set_vocab(self):
3444
4569
  try:
@@ -3448,50 +4573,39 @@ class RWKV6Qwen2Model(Rwkv6Model):
3448
4573
 
3449
4574
  def set_gguf_parameters(self):
3450
4575
  block_count = self.hparams["num_hidden_layers"]
3451
- num_attention_heads = self.hparams["num_attention_heads"]
3452
- num_key_value_heads = self.hparams["num_key_value_heads"]
3453
4576
  hidden_size = self.hparams["hidden_size"]
3454
- head_size = hidden_size // num_attention_heads
4577
+ head_size = self.hparams["head_size"]
3455
4578
  rms_norm_eps = self.hparams["rms_norm_eps"]
3456
4579
  intermediate_size = self.hparams["intermediate_size"]
3457
- time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
3458
- time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
4580
+ wkv_has_gate = self.hparams["wkv_has_gate"]
4581
+ assert self.hparams["wkv_version"] == 7
4582
+
4583
+ # ICLR: In-Context-Learning-Rate
4584
+ lora_rank_decay = 64
4585
+ lora_rank_iclr = 64
4586
+ lora_rank_value_residual_mix = 32
4587
+ lora_rank_gate = 128 if wkv_has_gate else 0
3459
4588
 
3460
4589
  # RWKV isn't context limited
3461
4590
  self.gguf_writer.add_context_length(1048576)
3462
4591
  self.gguf_writer.add_embedding_length(hidden_size)
3463
4592
  self.gguf_writer.add_block_count(block_count)
4593
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
3464
4594
  self.gguf_writer.add_wkv_head_size(head_size)
3465
- self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
3466
- self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
4595
+ self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
4596
+ self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
4597
+ self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
4598
+ self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
3467
4599
  self.gguf_writer.add_feed_forward_length(intermediate_size)
3468
4600
  self.gguf_writer.add_file_type(self.ftype)
3469
-
3470
- # special parameters for time_mixing in RWKV6QWEN2
3471
- self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
3472
4601
  self.gguf_writer.add_token_shift_count(1)
3473
- # RWKV6QWEN2 use grouped key/value like GQA
3474
- self.gguf_writer.add_head_count_kv(num_key_value_heads)
3475
4602
 
3476
4603
  # required by llama.cpp, unused
3477
4604
  self.gguf_writer.add_head_count(0)
3478
4605
 
3479
- def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3480
- for new_name, data in super().modify_tensors(data_torch, name, bid):
3481
- if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
3482
- data = data.view(5, -1, data.shape[-1])
3483
- # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
3484
- # permute them here to avoid code changes
3485
- data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
3486
- if "w2" in new_name:
3487
- data = data.view(5, -1, data.shape[-1])
3488
- yield (new_name, data)
3489
- continue
3490
- yield (new_name, data)
3491
-
3492
4606
 
3493
- @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
3494
- class MambaModel(Model):
4607
+ @ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
4608
+ class MambaModel(TextModel):
3495
4609
  model_arch = gguf.MODEL_ARCH.MAMBA
3496
4610
 
3497
4611
  def set_vocab(self):
@@ -3544,8 +4658,6 @@ class MambaModel(Model):
3544
4658
  _tok_embd = None
3545
4659
 
3546
4660
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3547
- del bid # unused
3548
-
3549
4661
  output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
3550
4662
  tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
3551
4663
 
@@ -3555,6 +4667,10 @@ class MambaModel(Model):
3555
4667
  logger.debug("A_log --> A ==> " + new_name)
3556
4668
  data_torch = -torch.exp(data_torch)
3557
4669
 
4670
+ # [4 1 8192 1] -> [4 8192 1 1]
4671
+ if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
4672
+ data_torch = data_torch.squeeze()
4673
+
3558
4674
  # assuming token_embd.weight is seen before output.weight
3559
4675
  if self._tok_embd is not None and new_name == output_name:
3560
4676
  if torch.equal(self._tok_embd, data_torch):
@@ -3566,8 +4682,8 @@ class MambaModel(Model):
3566
4682
  return [(new_name, data_torch)]
3567
4683
 
3568
4684
 
3569
- @Model.register("CohereForCausalLM")
3570
- class CommandR2Model(Model):
4685
+ @ModelBase.register("CohereForCausalLM")
4686
+ class CommandR2Model(TextModel):
3571
4687
  model_arch = gguf.MODEL_ARCH.COMMAND_R
3572
4688
 
3573
4689
  def __init__(self, *args, **kwargs):
@@ -3584,8 +4700,8 @@ class CommandR2Model(Model):
3584
4700
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
3585
4701
 
3586
4702
 
3587
- @Model.register("Cohere2ForCausalLM")
3588
- class Cohere2Model(Model):
4703
+ @ModelBase.register("Cohere2ForCausalLM")
4704
+ class Cohere2Model(TextModel):
3589
4705
  model_arch = gguf.MODEL_ARCH.COHERE2
3590
4706
 
3591
4707
  def set_gguf_parameters(self):
@@ -3602,9 +4718,9 @@ class Cohere2Model(Model):
3602
4718
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
3603
4719
 
3604
4720
 
3605
- @Model.register("OlmoForCausalLM")
3606
- @Model.register("OLMoForCausalLM")
3607
- class OlmoModel(Model):
4721
+ @ModelBase.register("OlmoForCausalLM")
4722
+ @ModelBase.register("OLMoForCausalLM")
4723
+ class OlmoModel(TextModel):
3608
4724
  model_arch = gguf.MODEL_ARCH.OLMO
3609
4725
 
3610
4726
  def set_gguf_parameters(self):
@@ -3630,13 +4746,13 @@ class OlmoModel(Model):
3630
4746
  return [(self.map_tensor_name(name), data_torch)]
3631
4747
 
3632
4748
 
3633
- @Model.register("Olmo2ForCausalLM")
3634
- class Olmo2Model(Model):
4749
+ @ModelBase.register("Olmo2ForCausalLM")
4750
+ class Olmo2Model(TextModel):
3635
4751
  model_arch = gguf.MODEL_ARCH.OLMO2
3636
4752
 
3637
4753
 
3638
- @Model.register("OlmoeForCausalLM")
3639
- class OlmoeModel(Model):
4754
+ @ModelBase.register("OlmoeForCausalLM")
4755
+ class OlmoeModel(TextModel):
3640
4756
  model_arch = gguf.MODEL_ARCH.OLMOE
3641
4757
 
3642
4758
  def set_gguf_parameters(self):
@@ -3695,29 +4811,10 @@ class OlmoeModel(Model):
3695
4811
  raise ValueError(f"Unprocessed experts: {experts}")
3696
4812
 
3697
4813
 
3698
- @Model.register("JinaBertModel", "JinaBertForMaskedLM")
4814
+ @ModelBase.register("JinaBertModel", "JinaBertForMaskedLM")
3699
4815
  class JinaBertV2Model(BertModel):
3700
4816
  model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
3701
4817
 
3702
- def __init__(self, *args, **kwargs):
3703
- super().__init__(*args, **kwargs)
3704
- self.intermediate_size = self.hparams["intermediate_size"]
3705
-
3706
- def get_tensors(self):
3707
- for name, data in super().get_tensors():
3708
- if 'gated_layer' in name:
3709
- d1 = data[:self.intermediate_size, :]
3710
- name1 = name.replace('gated_layers', 'gated_layers_w')
3711
- name1 = name1.replace('up_gated_layer', 'gated_layers_v')
3712
- d2 = data[self.intermediate_size:, :]
3713
- name2 = name.replace('gated_layers', 'gated_layers_v')
3714
- name2 = name2.replace('up_gated_layer', 'gated_layers_w')
3715
- yield name1, d1
3716
- yield name2, d2
3717
- continue
3718
-
3719
- yield name, data
3720
-
3721
4818
  def set_vocab(self):
3722
4819
  tokenizer_class = 'BertTokenizer'
3723
4820
  with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
@@ -3733,17 +4830,9 @@ class JinaBertV2Model(BertModel):
3733
4830
  self.gguf_writer.add_add_bos_token(True)
3734
4831
  self.gguf_writer.add_add_eos_token(True)
3735
4832
 
3736
- def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3737
- # if name starts with "bert.", remove the prefix
3738
- # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
3739
- if name.startswith("bert."):
3740
- name = name[5:]
3741
-
3742
- return super().modify_tensors(data_torch, name, bid)
3743
-
3744
4833
 
3745
- @Model.register("OpenELMForCausalLM")
3746
- class OpenELMModel(Model):
4834
+ @ModelBase.register("OpenELMForCausalLM")
4835
+ class OpenELMModel(TextModel):
3747
4836
  model_arch = gguf.MODEL_ARCH.OPENELM
3748
4837
 
3749
4838
  @staticmethod
@@ -3817,8 +4906,8 @@ class OpenELMModel(Model):
3817
4906
  yield (self.map_tensor_name(name), data_torch)
3818
4907
 
3819
4908
 
3820
- @Model.register("ArcticForCausalLM")
3821
- class ArcticModel(Model):
4909
+ @ModelBase.register("ArcticForCausalLM")
4910
+ class ArcticModel(TextModel):
3822
4911
  model_arch = gguf.MODEL_ARCH.ARCTIC
3823
4912
 
3824
4913
  def set_vocab(self):
@@ -3968,8 +5057,8 @@ class ArcticModel(Model):
3968
5057
  raise ValueError(f"Unprocessed experts: {experts}")
3969
5058
 
3970
5059
 
3971
- @Model.register("DeepseekForCausalLM")
3972
- class DeepseekModel(Model):
5060
+ @ModelBase.register("DeepseekForCausalLM")
5061
+ class DeepseekModel(TextModel):
3973
5062
  model_arch = gguf.MODEL_ARCH.DEEPSEEK
3974
5063
 
3975
5064
  def set_vocab(self):
@@ -4059,15 +5148,19 @@ class DeepseekModel(Model):
4059
5148
  raise ValueError(f"Unprocessed experts: {experts}")
4060
5149
 
4061
5150
 
4062
- @Model.register("DeepseekV2ForCausalLM")
4063
- @Model.register("DeepseekV3ForCausalLM")
4064
- class DeepseekV2Model(Model):
5151
+ @ModelBase.register("DeepseekV2ForCausalLM")
5152
+ @ModelBase.register("DeepseekV3ForCausalLM")
5153
+ class DeepseekV2Model(TextModel):
4065
5154
  model_arch = gguf.MODEL_ARCH.DEEPSEEK2
4066
5155
 
4067
5156
  def set_vocab(self):
4068
5157
  self._set_vocab_gpt2()
4069
5158
 
4070
5159
  def set_gguf_parameters(self):
5160
+
5161
+ # note: deepseek2 using MLA converts into MQA (i.e. GQA with 1 group)
5162
+ self.hparams["num_key_value_heads"] = 1
5163
+
4071
5164
  super().set_gguf_parameters()
4072
5165
  hparams = self.hparams
4073
5166
 
@@ -4076,8 +5169,13 @@ class DeepseekV2Model(Model):
4076
5169
  if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
4077
5170
  self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
4078
5171
  self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
4079
- self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
4080
- self.gguf_writer.add_value_length(hparams["v_head_dim"])
5172
+
5173
+ # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
5174
+ self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
5175
+ self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
5176
+ self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
5177
+ self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
5178
+
4081
5179
  self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
4082
5180
  self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
4083
5181
  self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
@@ -4093,12 +5191,12 @@ class DeepseekV2Model(Model):
4093
5191
 
4094
5192
  self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
4095
5193
 
4096
- if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
4097
- if self.hparams["rope_scaling"].get("type") == "yarn":
4098
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
4099
- self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
4100
- self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
4101
- self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
5194
+ rope_scaling = self.hparams.get("rope_scaling") or {}
5195
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
5196
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
5197
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
5198
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
5199
+ self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
4102
5200
 
4103
5201
  _experts: list[dict[str, Tensor]] | None = None
4104
5202
 
@@ -4146,6 +5244,26 @@ class DeepseekV2Model(Model):
4146
5244
  else:
4147
5245
  return []
4148
5246
 
5247
+ # note: MLA with the absorption optimization needs kv_b_proj split into k_b_proj and v_b_proj, with k_b_proj transposed
5248
+ if name.endswith("kv_b_proj.weight"):
5249
+ name_kb = name.replace("kv_b_proj", "k_b_proj")
5250
+ name_vb = name.replace("kv_b_proj", "v_b_proj")
5251
+
5252
+ n_head_kv = self.hparams["num_key_value_heads"]
5253
+ v_head_dim = self.hparams["v_head_dim"]
5254
+ qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
5255
+
5256
+ assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
5257
+
5258
+ kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
5259
+ k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
5260
+ k_b = k_b.transpose(1, 2)
5261
+
5262
+ return [
5263
+ (self.map_tensor_name(name_kb), k_b),
5264
+ (self.map_tensor_name(name_vb), v_b)
5265
+ ]
5266
+
4149
5267
  return [(self.map_tensor_name(name), data_torch)]
4150
5268
 
4151
5269
  def prepare_tensors(self):
@@ -4158,11 +5276,62 @@ class DeepseekV2Model(Model):
4158
5276
  raise ValueError(f"Unprocessed experts: {experts}")
4159
5277
 
4160
5278
 
4161
- @Model.register("T5WithLMHeadModel")
4162
- @Model.register("T5ForConditionalGeneration")
4163
- @Model.register("MT5ForConditionalGeneration")
4164
- @Model.register("UMT5ForConditionalGeneration")
4165
- class T5Model(Model):
5279
+ @ModelBase.register("Dots1ForCausalLM")
5280
+ class Dots1Model(Qwen2MoeModel):
5281
+ model_arch = gguf.MODEL_ARCH.DOTS1
5282
+
5283
+ def __init__(self, *args, **kwargs):
5284
+ super().__init__(*args, **kwargs)
5285
+ self.hparams["num_experts"] = self.hparams["n_routed_experts"]
5286
+
5287
+ def set_gguf_parameters(self):
5288
+ super().set_gguf_parameters()
5289
+ self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
5290
+ self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
5291
+ self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
5292
+ self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
5293
+
5294
+ if self.hparams["scoring_func"] == "noaux_tc":
5295
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
5296
+ else:
5297
+ raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
5298
+
5299
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
5300
+ if name.endswith("e_score_correction_bias"):
5301
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
5302
+ if "shared_experts" in name:
5303
+ return [(self.map_tensor_name(name), data_torch)]
5304
+ return super().modify_tensors(data_torch, name, bid)
5305
+
5306
+
5307
+ @ModelBase.register("PLMForCausalLM")
5308
+ class PLMModel(TextModel):
5309
+ model_arch = gguf.MODEL_ARCH.PLM
5310
+
5311
+ def set_vocab(self):
5312
+ self._set_vocab_gpt2()
5313
+
5314
+ def set_gguf_parameters(self):
5315
+ super().set_gguf_parameters()
5316
+ hparams = self.hparams
5317
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
5318
+ self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
5319
+ self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
5320
+ self.gguf_writer.add_value_length(hparams["v_head_dim"])
5321
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
5322
+
5323
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5324
+ return [(self.map_tensor_name(name), data_torch)]
5325
+
5326
+ def prepare_tensors(self):
5327
+ super().prepare_tensors()
5328
+
5329
+
5330
+ @ModelBase.register("T5WithLMHeadModel")
5331
+ @ModelBase.register("T5ForConditionalGeneration")
5332
+ @ModelBase.register("MT5ForConditionalGeneration")
5333
+ @ModelBase.register("UMT5ForConditionalGeneration")
5334
+ class T5Model(TextModel):
4166
5335
  model_arch = gguf.MODEL_ARCH.T5
4167
5336
 
4168
5337
  def __init__(self, *args, **kwargs):
@@ -4301,8 +5470,8 @@ class T5Model(Model):
4301
5470
  return [(self.map_tensor_name(name), data_torch)]
4302
5471
 
4303
5472
 
4304
- @Model.register("T5EncoderModel")
4305
- class T5EncoderModel(Model):
5473
+ @ModelBase.register("T5EncoderModel")
5474
+ class T5EncoderModel(TextModel):
4306
5475
  model_arch = gguf.MODEL_ARCH.T5ENCODER
4307
5476
 
4308
5477
  def __init__(self, *args, **kwargs):
@@ -4440,8 +5609,8 @@ class T5EncoderModel(Model):
4440
5609
  return [(self.map_tensor_name(name), data_torch)]
4441
5610
 
4442
5611
 
4443
- @Model.register("JAISLMHeadModel")
4444
- class JaisModel(Model):
5612
+ @ModelBase.register("JAISLMHeadModel")
5613
+ class JaisModel(TextModel):
4445
5614
  model_arch = gguf.MODEL_ARCH.JAIS
4446
5615
 
4447
5616
  def __init__(self, *args, **kwargs):
@@ -4523,8 +5692,39 @@ class JaisModel(Model):
4523
5692
  self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
4524
5693
 
4525
5694
 
4526
- @Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
4527
- class ChatGLMModel(Model):
5695
+ @ModelBase.register("Glm4ForCausalLM")
5696
+ class Glm4Model(TextModel):
5697
+ model_arch = gguf.MODEL_ARCH.GLM4
5698
+
5699
+ def set_vocab(self):
5700
+ from transformers import AutoTokenizer
5701
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
5702
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
5703
+ tokens, toktypes, tokpre = self.get_vocab_base()
5704
+ self.gguf_writer.add_tokenizer_model("gpt2")
5705
+ self.gguf_writer.add_tokenizer_pre(tokpre)
5706
+ self.gguf_writer.add_token_list(tokens)
5707
+ self.gguf_writer.add_token_types(toktypes)
5708
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
5709
+ special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
5710
+ special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
5711
+ special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
5712
+ special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
5713
+ special_vocab.add_to_gguf(self.gguf_writer)
5714
+
5715
+ def set_gguf_parameters(self):
5716
+ super().set_gguf_parameters()
5717
+ rope_dim = self.hparams["head_dim"]
5718
+ self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
5719
+ rope_scaling = self.hparams.get("rope_scaling") or {}
5720
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
5721
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
5722
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
5723
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
5724
+
5725
+
5726
+ @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
5727
+ class ChatGLMModel(TextModel):
4528
5728
  model_arch = gguf.MODEL_ARCH.CHATGLM
4529
5729
 
4530
5730
  def set_vocab_chatglm3(self):
@@ -4678,8 +5878,8 @@ class ChatGLMModel(Model):
4678
5878
  return [(self.map_tensor_name(name), data_torch)]
4679
5879
 
4680
5880
 
4681
- @Model.register("NemotronForCausalLM")
4682
- class NemotronModel(Model):
5881
+ @ModelBase.register("NemotronForCausalLM")
5882
+ class NemotronModel(TextModel):
4683
5883
  model_arch = gguf.MODEL_ARCH.NEMOTRON
4684
5884
 
4685
5885
  def set_vocab(self):
@@ -4719,8 +5919,8 @@ class NemotronModel(Model):
4719
5919
  return [(self.map_tensor_name(name), data_torch)]
4720
5920
 
4721
5921
 
4722
- @Model.register("ExaoneForCausalLM")
4723
- class ExaoneModel(Model):
5922
+ @ModelBase.register("ExaoneForCausalLM")
5923
+ class ExaoneModel(TextModel):
4724
5924
  model_arch = gguf.MODEL_ARCH.EXAONE
4725
5925
 
4726
5926
  def set_gguf_parameters(self):
@@ -4753,10 +5953,10 @@ class ExaoneModel(Model):
4753
5953
  rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
4754
5954
  rotary_factor = rotary_factor if rotary_factor is not None else 1.0
4755
5955
  self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
4756
- if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
4757
- if hparams["rope_scaling"].get("type") == "linear":
4758
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
4759
- self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
5956
+ rope_scaling = self.hparams.get("rope_scaling") or {}
5957
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
5958
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
5959
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
4760
5960
 
4761
5961
  def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
4762
5962
  if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
@@ -4788,7 +5988,7 @@ class ExaoneModel(Model):
4788
5988
  yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
4789
5989
 
4790
5990
 
4791
- @Model.register("GraniteForCausalLM")
5991
+ @ModelBase.register("GraniteForCausalLM")
4792
5992
  class GraniteModel(LlamaModel):
4793
5993
  """Conversion for IBM's GraniteForCausalLM"""
4794
5994
  model_arch = gguf.MODEL_ARCH.GRANITE
@@ -4822,11 +6022,20 @@ class GraniteModel(LlamaModel):
4822
6022
  logger.info("gguf: (granite) logits_scale = %s", logits_scale)
4823
6023
 
4824
6024
 
4825
- @Model.register("GraniteMoeForCausalLM")
6025
+ @ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
4826
6026
  class GraniteMoeModel(GraniteModel):
4827
6027
  """Conversion for IBM's GraniteMoeForCausalLM"""
4828
6028
  model_arch = gguf.MODEL_ARCH.GRANITE_MOE
4829
6029
 
6030
+ def set_gguf_parameters(self):
6031
+ """GraniteMoeShared uses GraniteMoe parameters plus the following:
6032
+ - shared_intermediate_size
6033
+ """
6034
+ super().set_gguf_parameters()
6035
+ if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
6036
+ self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
6037
+ logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
6038
+
4830
6039
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4831
6040
  """In modeling_granitemoe, the JetMoe implementation of parallel experts
4832
6041
  is used. This essentially merges w1 and w3 into a single tensor with 2x
@@ -4837,18 +6046,132 @@ class GraniteMoeModel(GraniteModel):
4837
6046
  if name.endswith("block_sparse_moe.input_linear.weight"):
4838
6047
  ffn_dim = self.hparams["intermediate_size"]
4839
6048
  assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
4840
- gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
6049
+ gate, up = data_torch.split(ffn_dim, dim=-2)
4841
6050
  return [
4842
6051
  (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
4843
6052
  (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
4844
6053
  ]
4845
6054
 
6055
+ if name.endswith("shared_mlp.input_linear.weight"):
6056
+ ffn_dim = self.hparams["shared_intermediate_size"]
6057
+ assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
6058
+ gate, up = data_torch.split(ffn_dim, dim=-2)
6059
+ return [
6060
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
6061
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
6062
+ ]
6063
+
4846
6064
  return super().modify_tensors(data_torch, name, bid)
4847
6065
 
4848
6066
 
4849
- @Model.register("ChameleonForConditionalGeneration")
4850
- @Model.register("ChameleonForCausalLM") # obsolete
4851
- class ChameleonModel(Model):
6067
+ @ModelBase.register("BailingMoeForCausalLM")
6068
+ class BailingMoeModel(TextModel):
6069
+ model_arch = gguf.MODEL_ARCH.BAILINGMOE
6070
+
6071
+ def set_vocab(self):
6072
+ self._set_vocab_gpt2()
6073
+
6074
+ def set_gguf_parameters(self):
6075
+ super().set_gguf_parameters()
6076
+ hparams = self.hparams
6077
+ rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
6078
+
6079
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
6080
+ rope_scaling = self.hparams.get("rope_scaling") or {}
6081
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
6082
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
6083
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
6084
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
6085
+ else:
6086
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
6087
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
6088
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
6089
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
6090
+ self.gguf_writer.add_expert_weights_scale(1.0)
6091
+ self.gguf_writer.add_expert_count(hparams["num_experts"])
6092
+ self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
6093
+ self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
6094
+
6095
+ _experts: list[dict[str, Tensor]] | None = None
6096
+
6097
+ @staticmethod
6098
+ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
6099
+ if n_head_kv is not None and n_head != n_head_kv:
6100
+ n_head = n_head_kv
6101
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
6102
+ .swapaxes(1, 2)
6103
+ .reshape(weights.shape))
6104
+
6105
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6106
+ n_head = self.hparams["num_attention_heads"]
6107
+ n_kv_head = self.hparams.get("num_key_value_heads")
6108
+ n_embd = self.hparams["hidden_size"]
6109
+ head_dim = self.hparams.get("head_dim") or n_embd // n_head
6110
+
6111
+ output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
6112
+
6113
+ if name.endswith("attention.dense.weight"):
6114
+ return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
6115
+ elif name.endswith("query_key_value.weight"):
6116
+ q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
6117
+
6118
+ return [
6119
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
6120
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
6121
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
6122
+ ]
6123
+ elif name.find("mlp.experts") != -1:
6124
+ n_experts = self.hparams["num_experts"]
6125
+ assert bid is not None
6126
+
6127
+ tensors: list[tuple[str, Tensor]] = []
6128
+
6129
+ if self._experts is None:
6130
+ self._experts = [{} for _ in range(self.block_count)]
6131
+
6132
+ self._experts[bid][name] = data_torch
6133
+
6134
+ if len(self._experts[bid]) >= n_experts * 3:
6135
+ # merge the experts into a single 3d tensor
6136
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
6137
+ datas: list[Tensor] = []
6138
+
6139
+ for xid in range(n_experts):
6140
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
6141
+ datas.append(self._experts[bid][ename])
6142
+ del self._experts[bid][ename]
6143
+
6144
+ data_torch = torch.stack(datas, dim=0)
6145
+
6146
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
6147
+
6148
+ new_name = self.map_tensor_name(merged_name)
6149
+
6150
+ tensors.append((new_name, data_torch))
6151
+
6152
+ return tensors
6153
+
6154
+ new_name = self.map_tensor_name(name)
6155
+
6156
+ if new_name == output_name and self.hparams.get("norm_head"):
6157
+ data_torch = data_torch.float()
6158
+ data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
6159
+
6160
+ return [(new_name, data_torch)]
6161
+
6162
+ def prepare_tensors(self):
6163
+ super().prepare_tensors()
6164
+
6165
+ if self._experts is not None:
6166
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
6167
+ experts = [k for d in self._experts for k in d.keys()]
6168
+ if len(experts) > 0:
6169
+ raise ValueError(f"Unprocessed experts: {experts}")
6170
+
6171
+
6172
+ @ModelBase.register("ChameleonForConditionalGeneration")
6173
+ @ModelBase.register("ChameleonForCausalLM") # obsolete
6174
+ class ChameleonModel(TextModel):
4852
6175
  model_arch = gguf.MODEL_ARCH.CHAMELEON
4853
6176
 
4854
6177
  def set_gguf_parameters(self):
@@ -4888,8 +6211,68 @@ class ChameleonModel(Model):
4888
6211
  return data_torch
4889
6212
 
4890
6213
 
6214
+ @ModelBase.register("UltravoxModel")
6215
+ class UltravoxModel(TextModel):
6216
+ model_arch = gguf.MODEL_ARCH.LLAMA # dummy
6217
+
6218
+ def __init__(self, *args, **kwargs):
6219
+ super().__init__(*args, **kwargs)
6220
+ raise NotImplementedError("Ultravox does not have a text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use the --mmproj argument")
6221
+
6222
+
6223
+ @ModelBase.register("Qwen2AudioForConditionalGeneration")
6224
+ class WhisperEncoderModel(MmprojModel):
6225
+ has_vision_encoder = False # no vision encoder
6226
+ has_audio_encoder = True
6227
+
6228
+ def __init__(self, *args, **kwargs):
6229
+ super().__init__(*args, **kwargs)
6230
+ self.hparams["hidden_size"] = self.hparams["d_model"]
6231
+ self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
6232
+ self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
6233
+
6234
+ def set_gguf_parameters(self):
6235
+ super().set_gguf_parameters()
6236
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A)
6237
+ self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
6238
+ self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
6239
+
6240
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
6241
+ del bid, new_name, n_dims # unused
6242
+ if ".conv" in name and ".weight" in name:
6243
+ return gguf.GGMLQuantizationType.F16
6244
+ return False
6245
+
6246
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6247
+ del bid # unused
6248
+
6249
+ if name.startswith("language_model."):
6250
+ # skip language model tensors
6251
+ return []
6252
+
6253
+ # prevent naming clash with vision tensors
6254
+ if name.startswith("multi_modal_projector"):
6255
+ name = "audio." + name
6256
+
6257
+ if "conv1.bias" in name or "conv2.bias" in name:
6258
+ # transpose conv1 and conv2 bias
6259
+ data_torch = data_torch.unsqueeze(-1)
6260
+
6261
+ return [(self.map_tensor_name(name), data_torch)]
6262
+
6263
+
6264
+ @ModelBase.register("UltravoxModel")
6265
+ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
6266
+ has_vision_encoder = False # no vision encoder
6267
+ has_audio_encoder = True
6268
+
6269
+ def set_gguf_parameters(self):
6270
+ super().set_gguf_parameters()
6271
+ self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
6272
+
4891
6273
  ###### CONVERSION LOGIC ######
4892
6274
 
6275
+
4893
6276
  # tree of lazy tensors
4894
6277
  class LazyTorchTensor(gguf.LazyBase):
4895
6278
  _tensor_type = torch.Tensor
@@ -4943,6 +6326,14 @@ class LazyTorchTensor(gguf.LazyBase):
4943
6326
  lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
4944
6327
  return cast(torch.Tensor, lazy)
4945
6328
 
6329
+ @classmethod
6330
+ def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
6331
+ dtype = cls._dtype_str_map[remote_tensor.dtype]
6332
+ shape = remote_tensor.shape
6333
+ meta = cls.meta_with_dtype_and_shape(dtype, shape)
6334
+ lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape))
6335
+ return cast(torch.Tensor, lazy)
6336
+
4946
6337
  @classmethod
4947
6338
  def __torch_function__(cls, func, types, args=(), kwargs=None):
4948
6339
  del types # unused
@@ -5020,6 +6411,14 @@ def parse_args() -> argparse.Namespace:
5020
6411
  "--print-supported-models", action="store_true",
5021
6412
  help="Print the supported models"
5022
6413
  )
6414
+ parser.add_argument(
6415
+ "--remote", action="store_true",
6416
+ help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.",
6417
+ )
6418
+ parser.add_argument(
6419
+ "--mmproj", action="store_true",
6420
+ help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
6421
+ )
5023
6422
 
5024
6423
  args = parser.parse_args()
5025
6424
  if not args.print_supported_models and args.model is None:
@@ -5045,12 +6444,26 @@ def split_str_to_n_bytes(split_str: str) -> int:
5045
6444
  return n
5046
6445
 
5047
6446
 
6447
+ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
6448
+ # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
6449
+ # maybe we should fall back to the text model's arch in that case, since not many models have both
6450
+ text_config = hparams.get("text_config", {})
6451
+ vision_config = hparams.get("vision_config", {})
6452
+ arch = hparams["architectures"][0]
6453
+ # if "architectures" is found in the sub-config, use that instead
6454
+ if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
6455
+ arch = text_config["architectures"][0]
6456
+ elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
6457
+ arch = vision_config["architectures"][0]
6458
+ return arch
6459
+
6460
+
5048
6461
  def main() -> None:
5049
6462
  args = parse_args()
5050
6463
 
5051
6464
  if args.print_supported_models:
5052
6465
  logger.error("Supported models:")
5053
- Model.print_registered_models()
6466
+ ModelBase.print_registered_models()
5054
6467
  sys.exit(0)
5055
6468
 
5056
6469
  if args.verbose:
@@ -5060,6 +6473,14 @@ def main() -> None:
5060
6473
 
5061
6474
  dir_model = args.model
5062
6475
 
6476
+ if args.remote:
6477
+ from huggingface_hub import snapshot_download
6478
+ local_dir = snapshot_download(
6479
+ repo_id=str(dir_model),
6480
+ allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
6481
+ dir_model = Path(local_dir)
6482
+ logger.info(f"Downloaded config and tokenizer to {local_dir}")
6483
+
5063
6484
  if not dir_model.is_dir():
5064
6485
  logger.error(f'Error: {args.model} is not a directory')
5065
6486
  sys.exit(1)
@@ -5081,30 +6502,38 @@ def main() -> None:
5081
6502
 
5082
6503
  if args.outfile is not None:
5083
6504
  fname_out = args.outfile
6505
+ elif args.remote:
6506
+ # if remote, use the model ID as the output file name
6507
+ fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
5084
6508
  else:
5085
6509
  fname_out = dir_model
5086
6510
 
5087
6511
  logger.info(f"Loading model: {dir_model.name}")
5088
6512
 
5089
- hparams = Model.load_hparams(dir_model)
6513
+ if args.mmproj:
6514
+ if "mmproj" not in fname_out.name:
6515
+ fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
5090
6516
 
5091
6517
  with torch.inference_mode():
5092
6518
  output_type = ftype_map[args.outtype]
5093
- model_architecture = hparams["architectures"][0]
5094
-
6519
+ model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
6520
+ hparams = ModelBase.load_hparams(dir_model)
6521
+ model_architecture = get_model_architecture(hparams, model_type)
6522
+ logger.info(f"Model architecture: {model_architecture}")
5095
6523
  try:
5096
- model_class = Model.from_model_architecture(model_architecture)
6524
+ model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
5097
6525
  except NotImplementedError:
5098
6526
  logger.error(f"Model {model_architecture} is not supported")
5099
6527
  sys.exit(1)
5100
6528
 
5101
- model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
6529
+ model_instance = model_class(dir_model, output_type, fname_out,
5102
6530
  is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
5103
6531
  eager=args.no_lazy,
5104
6532
  metadata_override=args.metadata, model_name=args.model_name,
5105
6533
  split_max_tensors=args.split_max_tensors,
5106
6534
  split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
5107
- small_first_shard=args.no_tensor_first_split)
6535
+ small_first_shard=args.no_tensor_first_split,
6536
+ remote_hf_model_id=str(args.model) if args.remote else None)
5108
6537
 
5109
6538
  if args.vocab_only:
5110
6539
  logger.info("Exporting model vocab...")