bigdl-core-cpp 2.5.0rc1__py3-none-win_amd64.whl → 2.6.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (92)
  1. bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +1673 -278
  2. bigdl/cpp/convert_hf_to_gguf_update.py +381 -0
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +450 -0
  4. bigdl/cpp/convert_lora_to_gguf.py +461 -0
  5. bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/constants.py +698 -171
  7. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  8. bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
  9. bigdl/cpp/gguf-py/gguf/gguf_writer.py +108 -17
  10. bigdl/cpp/gguf-py/gguf/lazy.py +3 -1
  11. bigdl/cpp/gguf-py/gguf/metadata.py +195 -76
  12. bigdl/cpp/gguf-py/gguf/quants.py +1210 -64
  13. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +262 -43
  14. bigdl/cpp/gguf-py/gguf/utility.py +2 -2
  15. bigdl/cpp/gguf-py/gguf/vocab.py +325 -3
  16. bigdl/cpp/libs/common.lib +0 -0
  17. bigdl/cpp/libs/ggml-base.dll +0 -0
  18. bigdl/cpp/libs/ggml-cpu.dll +0 -0
  19. bigdl/cpp/libs/ggml-sycl.dll +0 -0
  20. bigdl/cpp/libs/ggml.dll +0 -0
  21. bigdl/cpp/libs/libc++.dll +0 -0
  22. bigdl/cpp/libs/llama-batched.exe +0 -0
  23. bigdl/cpp/libs/llama-bench.exe +0 -0
  24. bigdl/cpp/libs/llama-cli.exe +0 -0
  25. bigdl/cpp/libs/llama-embedding.exe +0 -0
  26. bigdl/cpp/libs/llama-gemma3-cli.exe +0 -0
  27. bigdl/cpp/libs/llama-gguf.exe +0 -0
  28. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  29. bigdl/cpp/libs/llama-lookup.exe +0 -0
  30. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  31. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  32. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  33. bigdl/cpp/libs/llama-quantize.exe +0 -0
  34. bigdl/cpp/libs/llama-server.exe +0 -0
  35. bigdl/cpp/libs/llama-simple.exe +0 -0
  36. bigdl/cpp/libs/llama-speculative.exe +0 -0
  37. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  38. bigdl/cpp/libs/llama.dll +0 -0
  39. bigdl/cpp/libs/llava_shared.dll +0 -0
  40. bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
  41. bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
  42. bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
  43. bigdl/cpp/libs/ollama-lib.exe +0 -0
  44. bigdl/cpp/libs/ollama.exe +0 -0
  45. bigdl/cpp/libs/ollama_ggml.dll +0 -0
  46. bigdl/cpp/libs/ollama_llama.dll +0 -0
  47. bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
  48. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.bat +7 -2
  49. bigdl_core_cpp-2.6.0.data/scripts/init-ollama.bat +16 -0
  50. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0.dist-info}/METADATA +9 -5
  51. bigdl_core_cpp-2.6.0.dist-info/RECORD +57 -0
  52. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0.dist-info}/WHEEL +1 -1
  53. bigdl/cpp/convert.py +0 -1714
  54. bigdl/cpp/libs/baby-llama.exe +0 -0
  55. bigdl/cpp/libs/batched-bench.exe +0 -0
  56. bigdl/cpp/libs/batched.exe +0 -0
  57. bigdl/cpp/libs/beam-search.exe +0 -0
  58. bigdl/cpp/libs/benchmark.exe +0 -0
  59. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  60. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  61. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  62. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  63. bigdl/cpp/libs/embedding.exe +0 -0
  64. bigdl/cpp/libs/export-lora.exe +0 -0
  65. bigdl/cpp/libs/finetune.exe +0 -0
  66. bigdl/cpp/libs/ggml_shared.dll +0 -0
  67. bigdl/cpp/libs/gguf.exe +0 -0
  68. bigdl/cpp/libs/gritlm.exe +0 -0
  69. bigdl/cpp/libs/imatrix.exe +0 -0
  70. bigdl/cpp/libs/infill.exe +0 -0
  71. bigdl/cpp/libs/llava-cli.exe +0 -0
  72. bigdl/cpp/libs/lookahead.exe +0 -0
  73. bigdl/cpp/libs/lookup.exe +0 -0
  74. bigdl/cpp/libs/ls-sycl-device.exe +0 -0
  75. bigdl/cpp/libs/main.exe +0 -0
  76. bigdl/cpp/libs/parallel.exe +0 -0
  77. bigdl/cpp/libs/passkey.exe +0 -0
  78. bigdl/cpp/libs/perplexity.exe +0 -0
  79. bigdl/cpp/libs/q8dot.exe +0 -0
  80. bigdl/cpp/libs/quantize-stats.exe +0 -0
  81. bigdl/cpp/libs/quantize.exe +0 -0
  82. bigdl/cpp/libs/save-load-state.exe +0 -0
  83. bigdl/cpp/libs/server.exe +0 -0
  84. bigdl/cpp/libs/simple.exe +0 -0
  85. bigdl/cpp/libs/speculative.exe +0 -0
  86. bigdl/cpp/libs/tokenize.exe +0 -0
  87. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  88. bigdl/cpp/libs/vdot.exe +0 -0
  89. bigdl_core_cpp-2.5.0rc1.data/scripts/init-ollama.bat +0 -13
  90. bigdl_core_cpp-2.5.0rc1.dist-info/RECORD +0 -63
  91. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.ps1 +0 -0
  92. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0.dist-info}/top_level.txt +0 -0
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import ast
 import logging
 import argparse
 import contextlib
@@ -14,6 +15,7 @@ from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from itertools import chain
 
 import math
 import numpy as np
@@ -70,7 +72,8 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
@@ -85,7 +88,7 @@ class Model:
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-        self.hparams = Model.load_hparams(self.dir_model)
+        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
@@ -129,12 +132,14 @@ class Model:
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()
 
-        if len(self.part_names) > 1:
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
             self.tensor_names = set()
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            index_name += ".index.json"
             logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
+            with open(index_file, "r", encoding="utf-8") as f:
                 index: dict[str, Any] = json.load(f)
                 weight_map = index.get("weight_map")
                 if weight_map is None or not isinstance(weight_map, dict):
@@ -142,6 +147,7 @@ class Model:
                 self.tensor_names.update(weight_map.keys())
         else:
             self.tensor_names = tensor_names_from_parts
+            weight_map = {}
 
         for part_name in self.part_names:
             logger.info(f"gguf: loading model part '{part_name}'")
@@ -168,9 +174,17 @@ class Model:
                         data = LazyTorchTensor.from_eager(data)
                     yield name, data
 
-        # only verify tensor name presence; it doesn't matter if they are not in the right files
-        if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
-            raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
+        # verify tensor name presence and identify potentially missing files
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                 f"Missing tensors: {missing}\n"
+                                 f"Extra tensors: {extra}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -207,17 +221,17 @@ class Model:
         self.gguf_writer.add_context_length(n_ctx)
         logger.info(f"gguf: context length = {n_ctx}")
 
-        n_embd = self.find_hparam(["hidden_size", "n_embd"])
-        self.gguf_writer.add_embedding_length(n_embd)
-        logger.info(f"gguf: embedding length = {n_embd}")
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")
 
         if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")
 
-        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-        self.gguf_writer.add_head_count(n_head)
-        logger.info(f"gguf: head count = {n_head}")
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")
 
         if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
             self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -251,20 +265,19 @@ class Model:
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims  # unused
 
         return False
 
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del name, new_name, bid, n_dims  # unused
-
-        return False
+    # some models need extra generated tensors (like rope_freqs)
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        return ()
 
     def prepare_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
-        for name, data_torch in self.get_tensors():
+        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue
@@ -282,57 +295,83 @@ class Model:
                         bid = int(part)
                         break
 
-            for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray  # type hint
-                n_dims = len(data.shape)
-                data_dtype = data.dtype
-                data_qtype: gguf.GGMLQuantizationType | None = None
+            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                # TODO: why do we squeeze here?
+                # data = data_torch.squeeze().numpy()
+                data = data_torch.numpy()
 
-                # when both are True, f32 should win
-                extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
-                extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+                # if data ends up empty, it means data_torch was a scalar tensor -> restore
+                if len(data.shape) == 0:
+                    data = data_torch.numpy()
+
+                n_dims = len(data.shape)
+                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
-                extra_f32 = any(cond for cond in (
-                    extra_f32,
-                    n_dims == 1,
-                    new_name.endswith("_norm.weight"),
-                ))
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                 # Some tensor types are always in float32
-                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
-                    gguf.MODEL_TENSOR.FFN_GATE_INP,
-                    gguf.MODEL_TENSOR.POS_EMBD,
-                    gguf.MODEL_TENSOR.TOKEN_TYPES,
-                ))
-
-                # if f16 desired, convert any float32 2-dim weight tensors to float16
-                extra_f16 = any(cond for cond in (
-                    extra_f16,
-                    (name.endswith(".weight") and n_dims >= 2),
-                ))
-
-                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        data = gguf.quantize_bf16(data)
-                        assert data.dtype == np.int16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
+                if data_qtype is False and (
+                    any(
+                        self.match_model_tensor_name(new_name, key, bid)
+                        for key in (
+                            gguf.MODEL_TENSOR.FFN_GATE_INP,
+                            gguf.MODEL_TENSOR.POS_EMBD,
+                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                            gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+                            gguf.MODEL_TENSOR.POSNET_NORM1,
+                            gguf.MODEL_TENSOR.POSNET_NORM2,
+                        )
+                    )
+                    or not new_name.endswith(".weight")
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
-                        data = gguf.quantize_q8_0(data)
-                        assert data.dtype == np.uint8
-                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
 
-                    else:  # default to float16 for quantized tensors
-                        if data_dtype != np.float16:
-                            data = data.astype(np.float16)
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+                if isinstance(data_qtype, bool):
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                         data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
 
-                if data_qtype is None:  # by default, convert to float32
-                    if data_dtype != np.float32:
-                        data = data.astype(np.float32)
-                    data_qtype = gguf.GGMLQuantizationType.F32
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
@@ -439,6 +478,11 @@
             return modelcls
         return func
 
+    @classmethod
+    def print_registered_models(cls):
+        for name in sorted(cls._model_classes.keys()):
+            logger.error(f"- {name}")
+
     @classmethod
     def from_model_architecture(cls, arch: str) -> type[Model]:
         try:
@@ -491,9 +535,19 @@ class Model:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                         token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
@@ -504,7 +558,7 @@ class Model:
 
     # NOTE: this function is generated by convert_hf_to_gguf_update.py
     # do not modify it manually!
-    # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+    # ref: https://github.com/ggml-org/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -537,9 +591,15 @@ class Model:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+            res = "bert-bge-large"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/mosaicml/mpt-7b
             res = "mpt"
@@ -567,6 +627,9 @@ class Model:
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
             # ref: https://huggingface.co/databricks/dbrx-base
             res = "dbrx"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+            res = "jina-v1-en"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -585,7 +648,7 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -603,6 +666,39 @@ class Model:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
+        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+            # ref: https://huggingface.co/facebook/chameleon-7b
+            res = "chameleon"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
+        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+            res = "roberta-bpe"
+        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+            res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"
+        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+            res = "deepseek-r1-qwen"
 
         if res is None:
             logger.warning("\n")
@@ -612,7 +708,7 @@ class Model:
             logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("** - the pre-tokenization config has changed upstream")
             logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh: {chkhsh}")
             logger.warning("**************************************************************************************")
@@ -625,6 +721,9 @@ class Model:
         return res
     # Marker: End get_vocab_base_pre
 
+    def _set_vocab_none(self) -> None:
+        self.gguf_writer.add_tokenizer_model("none")
+
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
@@ -906,7 +1005,7 @@ class GPTNeoXModel(Model):
         return tensors
 
 
-@Model.register("BloomForCausalLM")
+@Model.register("BloomForCausalLM", "BloomModel")
 class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM
 
@@ -1461,7 +1560,7 @@ class StableLMModel(Model):
             raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
@@ -1487,6 +1586,17 @@ class LlamaModel(Model):
             special_vocab._set_special_token("eot", 32010)
             special_vocab.add_to_gguf(self.gguf_writer)
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -1503,17 +1613,6 @@ class LlamaModel(Model):
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
             self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                if "add_prefix_space" in tokenizer_config_json:
-                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
-        # Apply to granite small models only
-        if self.hparams.get("vocab_size", 32000) == 49152:
-            self.gguf_writer.add_add_bos_token(False)
-
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1569,12 +1668,13 @@ class LlamaModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def prepare_tensors(self):
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
-               dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+               dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
                factor = rope_scaling.get("factor", 8.0)
                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
@@ -1595,8 +1695,9 @@ class LlamaModel(Model):
                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-               self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+               yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
+    def prepare_tensors(self):
         super().prepare_tensors()
 
         if self._experts is not None:
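Note: the llama3 rope-factor generation moved from prepare_tensors into generate_extra_tensors, yielding a ROPE_FREQS tensor instead of writing it directly. The frequency-scaling math can be checked in isolation; a standalone sketch with illustrative hyperparameter values (not taken from a specific checkpoint):

import math
import torch

base, dim = 500000.0, 128                       # rope_theta and rope dimension (illustrative)
factor, low_freq_factor, high_freq_factor = 8.0, 1.0, 4.0
old_context_len = 8192

freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

rope_factors = []
for freq in freqs:
    wavelen = 2 * math.pi / freq
    if wavelen < high_freq_wavelen:             # high-frequency bands stay unscaled
        rope_factors.append(1)
    elif wavelen > low_freq_wavelen:            # low-frequency bands are stretched by `factor`
        rope_factors.append(factor)
    else:                                       # smooth interpolation in between
        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
        rope_factors.append(1 / ((1 - smooth) / factor + smooth))

print(torch.tensor(rope_factors, dtype=torch.float32))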
@@ -1606,6 +1707,178 @@ class LlamaModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams:  # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention-layer***
+            # if n_heads_in_group is not None, then
+            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+            # _num_heads[il] is num_attention_head
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams:  # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+                self.gguf_writer.add_rope_freq_base(rope_theta)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else:  # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams:  # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
@@ -1618,15 +1891,16 @@ class BitnetModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
-        s = 1 / weight.abs().mean().clamp(min=1e-5)
-        weight = (weight * s).round().clamp(-1, 1) / s
-        scale = weight.abs().max().unsqueeze(0)
-        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
-        weight = torch.sign(weight).type(dtype)
-        return weight.type(dtype), scale.type(torch.float32)
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # TODO: multiply by the scale directly instead of inverting it twice
+        # (this is also unnecessarily doubly inverted upstream)
+        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
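Note: weight_quant now returns only the round-and-clip ternarized weights; the separate .scale tensor emitted by the old code is gone (see the next hunk). A toy sketch of the same transform (values are illustrative):

import torch

def weight_quant(weight: torch.Tensor) -> torch.Tensor:
    # scale by the mean absolute value, round, clip to [-1, 1], then rescale,
    # so every entry ends up as -scale, 0, or +scale
    dtype = weight.dtype
    weight = weight.float()
    scale = weight.abs().mean().clamp(min=1e-5)
    iscale = 1 / scale
    return ((weight * iscale).round().clamp(-1, 1) / iscale).type(dtype)

w = torch.tensor([[0.40, -0.02, 0.31], [-0.55, 0.18, 0.00]])
print(weight_quant(w))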
@@ -1641,11 +1915,9 @@ class BitnetModel(Model):
             gguf.MODEL_TENSOR.FFN_GATE,
         ]):
             # transform weight into 1/0/-1 (in fp32)
-            weight_torch, scale_torch = self.weight_quant(data_torch)
-            yield (new_name, weight_torch)
-            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")
@@ -1764,7 +2036,7 @@ class DbrxModel(Model):
 
         return [(new_name, data_torch)]
 
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid  # unused
 
         return n_dims > 1
@@ -1775,29 +2047,40 @@ class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
+        super().set_gguf_parameters()
+        embedding_scale = float(self.hparams["scale_emb"])
+        self.gguf_writer.add_embedding_scale(embedding_scale)
+        logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+        residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+        self.gguf_writer.add_residual_scale(residual_scale)
+        logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+        self.gguf_writer.add_logit_scale(logit_scale)
+        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+        if self.hparams.get("rope_scaling") is not None:
+            if self.hparams["rope_scaling"].get("type") == "longrope":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+                logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 
-    def set_vocab(self):
-        self._set_vocab_llama_hf()
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
 
-    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
-        if n_kv_head is not None and n_head != n_kv_head:
-            n_head = n_kv_head
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
 
-        return (
-            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape)
-        )
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -1807,13 +2090,66 @@ class MiniCPMModel(Model):
 
         # HF models permute some of the tensors, so we need to undo that
         if name.endswith(("q_proj.weight")):
-            data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight")):
-            data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("MiniCPM3ForCausalLM")
+class MiniCPM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.MINICPM3
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            rope_dims = self.hparams["qk_rope_head_dim"]
+
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
 @Model.register("QWenLMHeadModel")
 class QwenModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN
@@ -1866,6 +2202,75 @@ class Qwen2Model(Model):
         except FileNotFoundError:
             self._set_vocab_gpt2()
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
+@Model.register("Qwen2VLForConditionalGeneration")
+class Qwen2VLModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, data in super().get_tensors():
+            if name.startswith("visual."):
+                continue
+            yield name, data
+
+
+@Model.register("WavTokenizerDec")
+class WavTokenizerDecModel(Model):
+    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if \
+                name.endswith("codebook.cluster_size") or \
+                name.endswith("codebook.embed_avg") or \
+                name.endswith("codebook.inited"):
+            logger.debug(f"Skipping {name!r}")
+            return []
+
+        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size         (self.hparams["vocab_size"])
+        self.gguf_writer.add_features_length    (self.hparams["n_embd_features"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
+        self.gguf_writer.add_group_norm_eps     (self.hparams["group_norm_epsilon"])
+        self.gguf_writer.add_group_norm_groups  (self.hparams["group_norm_groups"])
+
+        self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+        self.gguf_writer.add_posnet_block_count     (self.hparams["posnet"]["n_layer"])
+
+        self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+        self.gguf_writer.add_convnext_block_count     (self.hparams["convnext"]["n_layer"])
+
+        self.gguf_writer.add_causal_attention(False)
+
 
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
@@ -1995,6 +2400,15 @@ class Phi3MiniModel(Model):
     model_arch = gguf.MODEL_ARCH.PHI3
 
     def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2111,7 +2525,18 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2142,15 +2567,72 @@ class Phi3MiniModel(Model):
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
             raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
 
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 
 
-@Model.register("PlamoForCausalLM")
-class PlamoModel(Model):
-    model_arch = gguf.MODEL_ARCH.PLAMO
+@Model.register("PhiMoEForCausalLM")
+class PhiMoeModel(Phi3MiniModel):
+    model_arch = gguf.MODEL_ARCH.PHIMOE
 
-    def set_vocab(self):
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("PlamoForCausalLM")
+class PlamoModel(Model):
+    model_arch = gguf.MODEL_ARCH.PLAMO
+
+    def set_vocab(self):
         self._set_vocab_sentencepiece()
 
     def set_gguf_parameters(self):
@@ -2353,7 +2835,7 @@ class InternLM2Model(Model):
         if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
-            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
+            #       https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
             special_vocab.special_token_ids["eos"] = chat_eos_token_id
             logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
                            " in chat mode so that the conversation can end normally.")
@@ -2403,7 +2885,67 @@ class InternLM2Model(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("BertModel", "CamembertModel")
+@Model.register("InternLM3ForCausalLM")
+class InternLM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
+                        if token_data.get("special"):
+                            token_id = int(token_id)
+                            token = token_data["content"]
+                            special_vocab._set_special_token(token, token_id)
+                            # update eos token
+                            if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
+                                special_vocab.special_token_ids["eos"] = token_id
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2444,7 +2986,8 @@ class BertModel(Model):
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"
+        # "Sequence A" or "Sequence B"
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 
         # convert to phantom space vocab
        def phantom(tok):
@@ -2468,13 +3011,73 @@ class BertModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
+
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return []  # we don't need these
 
+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
3057
+ self.gguf_writer.add_add_eos_token(True)
3058
+
3059
+ # we need this to validate the size of the token_type embeddings
3060
+ # though currently we are passing all zeros to the token_type embeddings
3061
+ # "Sequence A" or "Sequence B"
3062
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
3063
+
3064
+ else:
3065
+ return super().set_vocab()
3066
+
3067
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3068
+ # if name starts with "roberta.", remove the prefix
3069
+ # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
3070
+ if name.startswith("roberta."):
3071
+ name = name[8:]
3072
+
3073
+ # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
3074
+ if name == "embeddings.position_embeddings.weight":
3075
+ if self._position_offset is not None:
3076
+ data_torch = data_torch[self._position_offset:, :]
3077
+
3078
+ return super().modify_tensors(data_torch, name, bid)
3079
+
3080
+
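A minimal sketch of the position-offset handling above (the XLMRobertaModel converter further down duplicates the same logic), assuming toy sizes: HF RoBERTa reserves the first pad_token_id + 1 rows of the position-embedding matrix, so the converter trims them and shrinks max_position_embeddings by the same amount.

import torch

# hypothetical checkpoint values: 514 learned positions, pad_token_id = 1
pad_token_id = 1
position_embd = torch.randn(514, 768)

position_offset = 1 + pad_token_id            # rows reserved by the HF implementation
trimmed = position_embd[position_offset:, :]  # rows that encode the real positions 0..511
assert trimmed.shape == (514 - position_offset, 768)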
2478
3081
  @Model.register("NomicBertModel")
2479
3082
  class NomicBertModel(BertModel):
2480
3083
  model_arch = gguf.MODEL_ARCH.NOMIC_BERT
@@ -2505,6 +3108,117 @@ class NomicBertModel(BertModel):
2505
3108
  self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
2506
3109
 
2507
3110
 
3111
+ @Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
3112
+ class XLMRobertaModel(BertModel):
3113
+ model_arch = gguf.MODEL_ARCH.BERT
3114
+
3115
+ def __init__(self, *args, **kwargs):
3116
+ super().__init__(*args, **kwargs)
3117
+
3118
+ # we need the pad_token_id to know how to chop down position_embd matrix
3119
+ if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
3120
+ self._position_offset = 1 + pad_token_id
3121
+ if "max_position_embeddings" in self.hparams:
3122
+ self.hparams["max_position_embeddings"] -= self._position_offset
3123
+ else:
3124
+ self._position_offset = None
3125
+
3126
+ def set_vocab(self):
3127
+ # to avoid TypeError: Descriptors cannot be created directly
3128
+ # exception when importing sentencepiece_model_pb2
3129
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
3130
+ from sentencepiece import SentencePieceProcessor
3131
+ from sentencepiece import sentencepiece_model_pb2 as model
3132
+
3133
+ tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
3134
+ if not tokenizer_path.is_file():
3135
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
3136
+
3137
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
3138
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
3139
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
3140
+
3141
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
3142
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
3143
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
3144
+
3145
+ tokenizer = SentencePieceProcessor()
3146
+ tokenizer.LoadFromFile(str(tokenizer_path))
3147
+
3148
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
3149
+
3150
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
3151
+ scores: list[float] = [-10000.0] * vocab_size
3152
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
3153
+
3154
+ for token_id in range(tokenizer.vocab_size()):
3155
+ piece = tokenizer.IdToPiece(token_id)
3156
+ text = piece.encode("utf-8")
3157
+ score = tokenizer.GetScore(token_id)
3158
+
3159
+ toktype = SentencePieceTokenTypes.NORMAL
3160
+ if tokenizer.IsUnknown(token_id):
3161
+ toktype = SentencePieceTokenTypes.UNKNOWN
3162
+ elif tokenizer.IsControl(token_id):
3163
+ toktype = SentencePieceTokenTypes.CONTROL
3164
+ elif tokenizer.IsUnused(token_id):
3165
+ toktype = SentencePieceTokenTypes.UNUSED
3166
+ elif tokenizer.IsByte(token_id):
3167
+ toktype = SentencePieceTokenTypes.BYTE
3168
+
3169
+ tokens[token_id] = text
3170
+ scores[token_id] = score
3171
+ toktypes[token_id] = toktype
3172
+
3173
+ if vocab_size > len(tokens):
3174
+ pad_count = vocab_size - len(tokens)
3175
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
3176
+ for i in range(1, pad_count + 1):
3177
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
3178
+ scores.append(-1000.0)
3179
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
3180
+
3181
+ # realign tokens (see HF tokenizer code)
3182
+ tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
3183
+ scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
3184
+ toktypes = [
3185
+ SentencePieceTokenTypes.CONTROL,
3186
+ SentencePieceTokenTypes.CONTROL,
3187
+ SentencePieceTokenTypes.CONTROL,
3188
+ SentencePieceTokenTypes.UNKNOWN,
3189
+ ] + toktypes[3:-1]
3190
+
3191
+ self.gguf_writer.add_tokenizer_model("t5")
3192
+ self.gguf_writer.add_tokenizer_pre("default")
3193
+ self.gguf_writer.add_token_list(tokens)
3194
+ self.gguf_writer.add_token_scores(scores)
3195
+ self.gguf_writer.add_token_types(toktypes)
3196
+ self.gguf_writer.add_add_space_prefix(add_prefix)
3197
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
3198
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
3199
+ if precompiled_charsmap:
3200
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
3201
+
3202
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3203
+ special_vocab.add_to_gguf(self.gguf_writer)
3204
+
3205
+ self.gguf_writer.add_add_bos_token(True)
3206
+ self.gguf_writer.add_add_eos_token(True)
3207
+
3208
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3209
+ # if name starts with "roberta.", remove the prefix
3210
+ # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
3211
+ if name.startswith("roberta."):
3212
+ name = name[8:]
3213
+
3214
+ # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
3215
+ if name == "embeddings.position_embeddings.weight":
3216
+ if self._position_offset is not None:
3217
+ data_torch = data_torch[self._position_offset:,:]
3218
+
3219
+ return super().modify_tensors(data_torch, name, bid)
3220
+
3221
+
2508
3222
  @Model.register("GemmaForCausalLM")
2509
3223
  class GemmaModel(Model):
2510
3224
  model_arch = gguf.MODEL_ARCH.GEMMA
@@ -2608,54 +3322,216 @@ class StarCoder2Model(Model):
2608
3322
  model_arch = gguf.MODEL_ARCH.STARCODER2
2609
3323
 
2610
3324
 
2611
- @Model.register("MambaForCausalLM", "MambaLMHeadModel")
2612
- class MambaModel(Model):
2613
- model_arch = gguf.MODEL_ARCH.MAMBA
3325
+ @Model.register("Rwkv6ForCausalLM")
3326
+ class Rwkv6Model(Model):
3327
+ model_arch = gguf.MODEL_ARCH.RWKV6
2614
3328
 
2615
3329
  def set_vocab(self):
2616
- vocab_size = self.hparams["vocab_size"]
2617
- # Round vocab size to next multiple of 8
2618
- pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
2619
- # pad using ceiling division
2620
- # ref: https://stackoverflow.com/a/17511341/22827863
2621
- vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
2622
- self.hparams["vocab_size"] = vocab_size
3330
+ assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
3331
+ vocab_size = self.hparams.get("vocab_size", 65536)
3332
+
3333
+ tokens: list[bytes] = ['<s>'.encode("utf-8")]
3334
+ toktypes: list[int] = [gguf.TokenType.CONTROL]
3335
+
3336
+ with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
3337
+ lines = f.readlines()
3338
+ for line in lines:
3339
+ parts = line.split(' ')
3340
+ assert len(parts) >= 3
3341
+ token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
3342
+ token = token.encode("utf-8") if isinstance(token, str) else token
3343
+ assert isinstance(token, bytes)
3344
+ assert len(token) == token_len
3345
+ token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
3346
+ tokens.append(token_text.encode("utf-8"))
3347
+ toktypes.append(gguf.TokenType.NORMAL)
3348
+ remainder = vocab_size - len(tokens)
3349
+ assert remainder >= 0
3350
+ for i in range(len(tokens), vocab_size):
3351
+ tokens.append(f"[PAD{i}]".encode("utf-8"))
3352
+ toktypes.append(gguf.TokenType.UNUSED)
2623
3353
 
2624
- if (self.dir_model / "tokenizer.json").is_file():
2625
- self._set_vocab_gpt2()
2626
- elif (self.dir_model / "tokenizer.model").is_file():
2627
- self._set_vocab_sentencepiece()
2628
- else:
2629
- # Use the GPT-NeoX tokenizer when no tokenizer files are present
2630
- self._set_vocab_builtin("gpt-neox", vocab_size)
3354
+ self.gguf_writer.add_tokenizer_model("rwkv")
3355
+ self.gguf_writer.add_token_list(tokens)
3356
+ self.gguf_writer.add_token_types(toktypes)
3357
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
3358
+ special_vocab.chat_template = "rwkv-world"
3359
+ # hack: add '\n\n' as the EOT token so that chat sessions can end normally
3360
+ special_vocab._set_special_token("eot", 261)
3361
+ special_vocab.add_to_gguf(self.gguf_writer)
2631
3362
 
2632
3363
  def set_gguf_parameters(self):
2633
- d_model = self.find_hparam(["hidden_size", "d_model"])
2634
- d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
2635
- d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
2636
- d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
2637
- # ceiling division
2638
- # ref: https://stackoverflow.com/a/17511341/22827863
2639
- # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
2640
- dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
2641
- rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
2642
-
2643
- # Fail early for models which don't have a block expansion factor of 2
2644
- assert d_inner == 2 * d_model
2645
-
2646
- self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
2647
- self.gguf_writer.add_embedding_length(d_model)
2648
- self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
2649
- self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
2650
- self.gguf_writer.add_block_count(self.hparams["n_layer"])
2651
- self.gguf_writer.add_ssm_conv_kernel(d_conv)
2652
- self.gguf_writer.add_ssm_inner_size(d_inner)
2653
- self.gguf_writer.add_ssm_state_size(d_state)
2654
- self.gguf_writer.add_ssm_time_step_rank(dt_rank)
2655
- self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
3364
+ block_count = self.hparams["num_hidden_layers"]
3365
+ head_size = self.hparams["head_size"]
3366
+ hidden_size = self.hparams["hidden_size"]
3367
+ layer_norm_eps = self.hparams["layer_norm_epsilon"]
3368
+ rescale_every_n_layers = self.hparams["rescale_every"]
3369
+ intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
3370
+ time_mix_extra_dim = 64 if hidden_size == 4096 else 32
3371
+ time_decay_extra_dim = 128 if hidden_size == 4096 else 64
3372
+
3373
+ # RWKV isn't context limited
3374
+ self.gguf_writer.add_context_length(1048576)
3375
+ self.gguf_writer.add_embedding_length(hidden_size)
3376
+ self.gguf_writer.add_block_count(block_count)
3377
+ self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
3378
+ self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
3379
+ self.gguf_writer.add_wkv_head_size(head_size)
3380
+ self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
3381
+ self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
3382
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
2656
3383
  self.gguf_writer.add_file_type(self.ftype)
2657
3384
 
2658
- _tok_embd = None
3385
+ # required by llama.cpp, unused
3386
+ self.gguf_writer.add_head_count(0)
3387
+
3388
+ lerp_weights: dict[int, dict[str, Tensor]] = {}
3389
+
3390
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3391
+ new_name = self.map_tensor_name(name)
3392
+
3393
+ if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
3394
+ new_name += ".weight"
3395
+
3396
+ if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
3397
+ data_torch = data_torch.transpose(0, 1)
3398
+
3399
+ if new_name.endswith("time_mix_w2.weight"):
3400
+ data_torch = data_torch.permute(0, 2, 1)
3401
+
3402
+ if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
3403
+ data_torch = data_torch.squeeze()
3404
+
3405
+ try:
3406
+ rescale_every_n_layers = self.hparams["rescale_every"]
3407
+ if rescale_every_n_layers > 0:
3408
+ if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
3409
+ data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
3410
+ except KeyError:
3411
+ pass
3412
+
3413
+ # concat time_mix_lerp weights to reduce some cpu overhead
3414
+ # also reduces the number of tensors in the model
3415
+ if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
3416
+ try:
3417
+ self.lerp_weights[bid][new_name] = data_torch
3418
+ except KeyError:
3419
+ self.lerp_weights[bid] = {new_name: data_torch}
3420
+ if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
3421
+ new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
3422
+ data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
3423
+ yield (new_name, data)
3424
+ return
3425
+
3426
+ yield (new_name, data_torch)
3427
+
3428
+
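The time_mix_lerp fusion above buffers the five per-block lerp tensors (w, k, v, r, g) and emits a single fused tensor once all five have been seen. A shape-only sketch of that stacking, assuming the checkpoint stores each lerp as a (1, 1, hidden) tensor:

import torch

hidden = 8  # hypothetical hidden size
# one lerp tensor per component, as it might appear in the checkpoint
lerp = {c: torch.randn(1, 1, hidden) for c in ["w", "k", "v", "r", "g"]}

# squeeze to (hidden,), re-add a leading dim, stack, then unsqueeze as in the converter
fused = torch.stack([lerp[c].squeeze().unsqueeze(0) for c in ["w", "k", "v", "r", "g"]],
                    dim=0).unsqueeze(1)
assert fused.shape == (5, 1, 1, hidden)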
3429
+ @Model.register("RWKV6Qwen2ForCausalLM")
3430
+ class RWKV6Qwen2Model(Rwkv6Model):
3431
+ model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
3432
+
3433
+ def set_vocab(self):
3434
+ try:
3435
+ self._set_vocab_sentencepiece()
3436
+ except FileNotFoundError:
3437
+ self._set_vocab_gpt2()
3438
+
3439
+ def set_gguf_parameters(self):
3440
+ block_count = self.hparams["num_hidden_layers"]
3441
+ num_attention_heads = self.hparams["num_attention_heads"]
3442
+ num_key_value_heads = self.hparams["num_key_value_heads"]
3443
+ hidden_size = self.hparams["hidden_size"]
3444
+ head_size = hidden_size // num_attention_heads
3445
+ rms_norm_eps = self.hparams["rms_norm_eps"]
3446
+ intermediate_size = self.hparams["intermediate_size"]
3447
+ time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
3448
+ time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
3449
+
3450
+ # RWKV isn't context limited
3451
+ self.gguf_writer.add_context_length(1048576)
3452
+ self.gguf_writer.add_embedding_length(hidden_size)
3453
+ self.gguf_writer.add_block_count(block_count)
3454
+ self.gguf_writer.add_wkv_head_size(head_size)
3455
+ self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
3456
+ self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
3457
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
3458
+ self.gguf_writer.add_file_type(self.ftype)
3459
+
3460
+ # special parameters for time_mixing in RWKV6QWEN2
3461
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
3462
+ self.gguf_writer.add_token_shift_count(1)
3463
+ # RWKV6QWEN2 uses grouped key/value heads like GQA
3464
+ self.gguf_writer.add_head_count_kv(num_key_value_heads)
3465
+
3466
+ # required by llama.cpp, unused
3467
+ self.gguf_writer.add_head_count(0)
3468
+
3469
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3470
+ for new_name, data in super().modify_tensors(data_torch, name, bid):
3471
+ if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
3472
+ data = data.view(5, -1, data.shape[-1])
3473
+ # rwkv6qwen2 stores these in rkvwg order instead of the original wkvrg order
3474
+ # permute them here to avoid code changes
3475
+ data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
3476
+ if "w2" in new_name:
3477
+ data = data.view(5, -1, data.shape[-1])
3478
+ yield (new_name, data)
3479
+ continue
3480
+ yield (new_name, data)
3481
+
3482
+
3483
+ @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
3484
+ class MambaModel(Model):
3485
+ model_arch = gguf.MODEL_ARCH.MAMBA
3486
+
3487
+ def set_vocab(self):
3488
+ vocab_size = self.hparams["vocab_size"]
3489
+ # Round vocab size to next multiple of 8
3490
+ pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
3491
+ # pad using ceiling division
3492
+ # ref: https://stackoverflow.com/a/17511341/22827863
3493
+ vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
3494
+ self.hparams["vocab_size"] = vocab_size
3495
+
3496
+ if (self.dir_model / "tokenizer.json").is_file():
3497
+ self._set_vocab_gpt2()
3498
+ elif (self.dir_model / "tokenizer.model").is_file():
3499
+ self._set_vocab_sentencepiece()
3500
+ else:
3501
+ # Use the GPT-NeoX tokenizer when no tokenizer files are present
3502
+ self._set_vocab_builtin("gpt-neox", vocab_size)
3503
+
3504
+ def set_gguf_parameters(self):
3505
+ d_model = self.find_hparam(["hidden_size", "d_model"])
3506
+ d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
3507
+ d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
3508
+ d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
3509
+ # ceiling division
3510
+ # ref: https://stackoverflow.com/a/17511341/22827863
3511
+ # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
3512
+ dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
3513
+ rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
3514
+ use_dt_b_c_norm = False
3515
+ # For falconmamba we do apply RMS norm on B / DT and C layers
3516
+ if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
3517
+ use_dt_b_c_norm = True
3518
+ # Fail early for models which don't have a block expansion factor of 2
3519
+ assert d_inner == 2 * d_model
3520
+
3521
+ self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
3522
+ self.gguf_writer.add_embedding_length(d_model)
3523
+ self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
3524
+ self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
3525
+ self.gguf_writer.add_block_count(self.block_count)
3526
+ self.gguf_writer.add_ssm_conv_kernel(d_conv)
3527
+ self.gguf_writer.add_ssm_inner_size(d_inner)
3528
+ self.gguf_writer.add_ssm_state_size(d_state)
3529
+ self.gguf_writer.add_ssm_time_step_rank(dt_rank)
3530
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
3531
+ self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
3532
+ self.gguf_writer.add_file_type(self.ftype)
3533
+
3534
+ _tok_embd = None
2659
3535
 
2660
3536
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2661
3537
  del bid # unused
@@ -2679,19 +3555,6 @@ class MambaModel(Model):
2679
3555
 
2680
3556
  return [(new_name, data_torch)]
2681
3557
 
2682
- def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
2683
- del n_dims # unused
2684
-
2685
- return bid is not None and new_name in (
2686
- self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
2687
- gguf.MODEL_TENSOR.SSM_CONV1D,
2688
- gguf.MODEL_TENSOR.SSM_X,
2689
- gguf.MODEL_TENSOR.SSM_DT,
2690
- gguf.MODEL_TENSOR.SSM_A,
2691
- gguf.MODEL_TENSOR.SSM_D,
2692
- ]
2693
- )
2694
-
2695
3558
 
2696
3559
  @Model.register("CohereForCausalLM")
2697
3560
  class CommandR2Model(Model):
@@ -2711,6 +3574,24 @@ class CommandR2Model(Model):
2711
3574
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
2712
3575
 
2713
3576
 
3577
+ @Model.register("Cohere2ForCausalLM")
3578
+ class Cohere2Model(Model):
3579
+ model_arch = gguf.MODEL_ARCH.COHERE2
3580
+
3581
+ def set_gguf_parameters(self):
3582
+ super().set_gguf_parameters()
3583
+
3584
+ self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
3585
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
3586
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
3587
+
3588
+ rotary_pct = self.hparams["rotary_pct"]
3589
+ hidden_size = self.hparams["hidden_size"]
3590
+ num_attention_heads = self.hparams["num_attention_heads"]
3591
+ self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
3592
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
3593
+
3594
+
2714
3595
  @Model.register("OlmoForCausalLM")
2715
3596
  @Model.register("OLMoForCausalLM")
2716
3597
  class OlmoModel(Model):
@@ -2739,6 +3620,71 @@ class OlmoModel(Model):
2739
3620
  return [(self.map_tensor_name(name), data_torch)]
2740
3621
 
2741
3622
 
3623
+ @Model.register("Olmo2ForCausalLM")
3624
+ class Olmo2Model(Model):
3625
+ model_arch = gguf.MODEL_ARCH.OLMO2
3626
+
3627
+
3628
+ @Model.register("OlmoeForCausalLM")
3629
+ class OlmoeModel(Model):
3630
+ model_arch = gguf.MODEL_ARCH.OLMOE
3631
+
3632
+ def set_gguf_parameters(self):
3633
+ super().set_gguf_parameters()
3634
+ self.gguf_writer.add_layer_norm_rms_eps(1e-5)
3635
+ if (n_experts := self.hparams.get("num_experts")) is not None:
3636
+ self.gguf_writer.add_expert_count(n_experts)
3637
+
3638
+ _experts: list[dict[str, Tensor]] | None = None
3639
+
3640
+ # Copied from: Qwen2MoeModel
3641
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3642
+ # process the experts separately
3643
+ if name.find("experts") != -1:
3644
+ n_experts = self.hparams["num_experts"]
3645
+ assert bid is not None
3646
+
3647
+ if self._experts is None:
3648
+ self._experts = [{} for _ in range(self.block_count)]
3649
+
3650
+ self._experts[bid][name] = data_torch
3651
+
3652
+ if len(self._experts[bid]) >= n_experts * 3:
3653
+ tensors: list[tuple[str, Tensor]] = []
3654
+
3655
+ # merge the experts into a single 3d tensor
3656
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
3657
+ datas: list[Tensor] = []
3658
+
3659
+ for xid in range(n_experts):
3660
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
3661
+ datas.append(self._experts[bid][ename])
3662
+ del self._experts[bid][ename]
3663
+
3664
+ data_torch = torch.stack(datas, dim=0)
3665
+
3666
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
3667
+
3668
+ new_name = self.map_tensor_name(merged_name)
3669
+
3670
+ tensors.append((new_name, data_torch))
3671
+ return tensors
3672
+ else:
3673
+ return []
3674
+
3675
+ return [(self.map_tensor_name(name), data_torch)]
3676
+
3677
+ # Copied from: Qwen2MoeModel
3678
+ def prepare_tensors(self):
3679
+ super().prepare_tensors()
3680
+
3681
+ if self._experts is not None:
3682
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
3683
+ experts = [k for d in self._experts for k in d.keys()]
3684
+ if len(experts) > 0:
3685
+ raise ValueError(f"Unprocessed experts: {experts}")
3686
+
3687
+
2742
3688
  @Model.register("JinaBertModel", "JinaBertForMaskedLM")
2743
3689
  class JinaBertV2Model(BertModel):
2744
3690
  model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
@@ -2777,6 +3723,14 @@ class JinaBertV2Model(BertModel):
2777
3723
  self.gguf_writer.add_add_bos_token(True)
2778
3724
  self.gguf_writer.add_add_eos_token(True)
2779
3725
 
3726
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3727
+ # if name starts with "bert.", remove the prefix
3728
+ # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
3729
+ if name.startswith("bert."):
3730
+ name = name[5:]
3731
+
3732
+ return super().modify_tensors(data_torch, name, bid)
3733
+
2780
3734
 
2781
3735
  @Model.register("OpenELMForCausalLM")
2782
3736
  class OpenELMModel(Model):
@@ -3004,7 +3958,99 @@ class ArcticModel(Model):
3004
3958
  raise ValueError(f"Unprocessed experts: {experts}")
3005
3959
 
3006
3960
 
3961
+ @Model.register("DeepseekForCausalLM")
3962
+ class DeepseekModel(Model):
3963
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK
3964
+
3965
+ def set_vocab(self):
3966
+ try:
3967
+ self._set_vocab_sentencepiece()
3968
+ except FileNotFoundError:
3969
+ self._set_vocab_gpt2()
3970
+
3971
+ def set_gguf_parameters(self):
3972
+ super().set_gguf_parameters()
3973
+ hparams = self.hparams
3974
+ if "head_dim" in hparams:
3975
+ rope_dim = hparams["head_dim"]
3976
+ else:
3977
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
3978
+
3979
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
3980
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
3981
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
3982
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3983
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
3984
+ self.gguf_writer.add_expert_weights_scale(1.0)
3985
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
3986
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
3987
+
3988
+ _experts: list[dict[str, Tensor]] | None = None
3989
+
3990
+ @staticmethod
3991
+ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
3992
+ if n_head_kv is not None and n_head != n_head_kv:
3993
+ n_head = n_head_kv
3994
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
3995
+ .swapaxes(1, 2)
3996
+ .reshape(weights.shape))
3997
+
3998
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3999
+ n_head = self.hparams["num_attention_heads"]
4000
+ n_kv_head = self.hparams.get("num_key_value_heads")
4001
+
4002
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
4003
+ data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
4004
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
4005
+ data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
4006
+
4007
+ # process the experts separately
4008
+ if name.find("mlp.experts") != -1:
4009
+ n_experts = self.hparams["n_routed_experts"]
4010
+ assert bid is not None
4011
+
4012
+ if self._experts is None:
4013
+ self._experts = [{} for _ in range(self.block_count)]
4014
+
4015
+ self._experts[bid][name] = data_torch
4016
+
4017
+ if len(self._experts[bid]) >= n_experts * 3:
4018
+ tensors: list[tuple[str, Tensor]] = []
4019
+
4020
+ # merge the experts into a single 3d tensor
4021
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
4022
+ datas: list[Tensor] = []
4023
+
4024
+ for xid in range(n_experts):
4025
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
4026
+ datas.append(self._experts[bid][ename])
4027
+ del self._experts[bid][ename]
4028
+
4029
+ data_torch = torch.stack(datas, dim=0)
4030
+
4031
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
4032
+
4033
+ new_name = self.map_tensor_name(merged_name)
4034
+
4035
+ tensors.append((new_name, data_torch))
4036
+ return tensors
4037
+ else:
4038
+ return []
4039
+
4040
+ return [(self.map_tensor_name(name), data_torch)]
4041
+
4042
+ def prepare_tensors(self):
4043
+ super().prepare_tensors()
4044
+
4045
+ if self._experts is not None:
4046
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
4047
+ experts = [k for d in self._experts for k in d.keys()]
4048
+ if len(experts) > 0:
4049
+ raise ValueError(f"Unprocessed experts: {experts}")
4050
+
4051
+
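DeepseekModel.permute (matching the LlamaModel.permute used by the InternLM3 converter earlier) reorders the rows of q_proj/k_proj so the rotary dimensions land where llama.cpp expects them. A tiny sketch of what the shuffle does to one head, assuming a toy head_dim of 4 and hidden size of 1:

import torch

def permute(weights, n_head, n_head_kv):
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

# one head, head_dim = 4: row i holds the value i so the shuffle is easy to read off
w = torch.arange(4, dtype=torch.float32).reshape(4, 1)
out = permute(w, n_head=1, n_head_kv=None)
# rows (0, 1, 2, 3) become (0, 2, 1, 3): each row of the first rotary half is
# paired with the matching row of the second half
assert out.flatten().tolist() == [0.0, 2.0, 1.0, 3.0]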
3007
4052
  @Model.register("DeepseekV2ForCausalLM")
4053
+ @Model.register("DeepseekV3ForCausalLM")
3008
4054
  class DeepseekV2Model(Model):
3009
4055
  model_arch = gguf.MODEL_ARCH.DEEPSEEK2
3010
4056
 
@@ -3026,69 +4072,228 @@ class DeepseekV2Model(Model):
3026
4072
  self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
3027
4073
  self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
3028
4074
  self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
4075
+ self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
4076
+
4077
+ if hparams["scoring_func"] == "sigmoid":
4078
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
4079
+ elif hparams["scoring_func"] == "softmax":
4080
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
4081
+ else:
4082
+ raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
4083
+
3029
4084
  self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
3030
4085
 
3031
- if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
3032
- if self.hparams["rope_scaling"].get("type") == "yarn":
3033
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
3034
- self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
3035
- self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
3036
- self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
4086
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
4087
+ if self.hparams["rope_scaling"].get("type") == "yarn":
4088
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
4089
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
4090
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
4091
+ self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
4092
+
4093
+ _experts: list[dict[str, Tensor]] | None = None
4094
+
4095
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4096
+ # rename e_score_correction_bias tensors
4097
+ if name.endswith("e_score_correction_bias"):
4098
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
4099
+
4100
+ # skip Multi-Token Prediction (MTP) layers
4101
+ block_count = self.hparams["num_hidden_layers"]
4102
+ match = re.match(r"model.layers.(\d+)", name)
4103
+ if match and int(match.group(1)) >= block_count:
4104
+ return []
4105
+
4106
+ # process the experts separately
4107
+ if name.find("mlp.experts") != -1:
4108
+ n_experts = self.hparams["n_routed_experts"]
4109
+ assert bid is not None
4110
+
4111
+ if self._experts is None:
4112
+ self._experts = [{} for _ in range(self.block_count)]
4113
+
4114
+ self._experts[bid][name] = data_torch
4115
+
4116
+ if len(self._experts[bid]) >= n_experts * 3:
4117
+ tensors: list[tuple[str, Tensor]] = []
4118
+
4119
+ # merge the experts into a single 3d tensor
4120
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
4121
+ datas: list[Tensor] = []
4122
+
4123
+ for xid in range(n_experts):
4124
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
4125
+ datas.append(self._experts[bid][ename])
4126
+ del self._experts[bid][ename]
4127
+
4128
+ data_torch = torch.stack(datas, dim=0)
4129
+
4130
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
4131
+
4132
+ new_name = self.map_tensor_name(merged_name)
4133
+
4134
+ tensors.append((new_name, data_torch))
4135
+ return tensors
4136
+ else:
4137
+ return []
4138
+
4139
+ return [(self.map_tensor_name(name), data_torch)]
4140
+
4141
+ def prepare_tensors(self):
4142
+ super().prepare_tensors()
4143
+
4144
+ if self._experts is not None:
4145
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
4146
+ experts = [k for d in self._experts for k in d.keys()]
4147
+ if len(experts) > 0:
4148
+ raise ValueError(f"Unprocessed experts: {experts}")
4149
+
4150
+
4151
+ @Model.register("T5WithLMHeadModel")
4152
+ @Model.register("T5ForConditionalGeneration")
4153
+ @Model.register("MT5ForConditionalGeneration")
4154
+ @Model.register("UMT5ForConditionalGeneration")
4155
+ class T5Model(Model):
4156
+ model_arch = gguf.MODEL_ARCH.T5
4157
+
4158
+ def __init__(self, *args, **kwargs):
4159
+ super().__init__(*args, **kwargs)
4160
+ self.shared_token_embeddings_found = False
4161
+
4162
+ def set_vocab(self):
4163
+ # to avoid TypeError: Descriptors cannot be created directly
4164
+ # exception when importing sentencepiece_model_pb2
4165
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
4166
+ from sentencepiece import SentencePieceProcessor
4167
+ from sentencepiece import sentencepiece_model_pb2 as model
4168
+
4169
+ tokenizer_path = self.dir_model / 'tokenizer.model'
4170
+
4171
+ # many older models use spiece.model tokenizer model filename
4172
+ if not tokenizer_path.is_file():
4173
+ tokenizer_path = self.dir_model / 'spiece.model'
4174
+
4175
+ if not tokenizer_path.is_file():
4176
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
4177
+
4178
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
4179
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
4180
+
4181
+ # some models like Pile-T5 family use BPE tokenizer instead of Unigram
4182
+ if sentencepiece_model.trainer_spec.model_type == 2: # BPE
4183
+ # assure the tokenizer model file name is correct
4184
+ assert tokenizer_path.name == 'tokenizer.model'
4185
+ return self._set_vocab_sentencepiece()
4186
+ else:
4187
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
4188
+
4189
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
4190
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
4191
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
4192
+
4193
+ tokenizer = SentencePieceProcessor()
4194
+ tokenizer.LoadFromFile(str(tokenizer_path))
4195
+
4196
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
4197
+
4198
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
4199
+ scores: list[float] = [-10000.0] * vocab_size
4200
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
3037
4201
 
3038
- _experts: list[dict[str, Tensor]] | None = None
4202
+ for token_id in range(tokenizer.vocab_size()):
4203
+ piece = tokenizer.IdToPiece(token_id)
4204
+ text = piece.encode("utf-8")
4205
+ score = tokenizer.GetScore(token_id)
3039
4206
 
3040
- def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3041
- # process the experts separately
3042
- if name.find("mlp.experts") != -1:
3043
- n_experts = self.hparams["n_routed_experts"]
3044
- assert bid is not None
4207
+ toktype = SentencePieceTokenTypes.NORMAL
4208
+ if tokenizer.IsUnknown(token_id):
4209
+ toktype = SentencePieceTokenTypes.UNKNOWN
4210
+ elif tokenizer.IsControl(token_id):
4211
+ toktype = SentencePieceTokenTypes.CONTROL
4212
+ elif tokenizer.IsUnused(token_id):
4213
+ toktype = SentencePieceTokenTypes.UNUSED
4214
+ elif tokenizer.IsByte(token_id):
4215
+ toktype = SentencePieceTokenTypes.BYTE
3045
4216
 
3046
- if self._experts is None:
3047
- self._experts = [{} for _ in range(self.block_count)]
4217
+ tokens[token_id] = text
4218
+ scores[token_id] = score
4219
+ toktypes[token_id] = toktype
3048
4220
 
3049
- self._experts[bid][name] = data_torch
4221
+ added_tokens_file = self.dir_model / 'added_tokens.json'
4222
+ if added_tokens_file.is_file():
4223
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
4224
+ added_tokens_json = json.load(f)
4225
+ for key in added_tokens_json:
4226
+ token_id = added_tokens_json[key]
4227
+ if token_id >= vocab_size:
4228
+ logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
4229
+ continue
3050
4230
 
3051
- if len(self._experts[bid]) >= n_experts * 3:
3052
- tensors: list[tuple[str, Tensor]] = []
4231
+ tokens[token_id] = key.encode("utf-8")
4232
+ scores[token_id] = -1000.0
4233
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
3053
4234
 
3054
- # merge the experts into a single 3d tensor
3055
- for w_name in ["down_proj", "gate_proj", "up_proj"]:
3056
- datas: list[Tensor] = []
4235
+ if vocab_size > len(tokens):
4236
+ pad_count = vocab_size - len(tokens)
4237
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
4238
+ for i in range(1, pad_count + 1):
4239
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
4240
+ scores.append(-1000.0)
4241
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
3057
4242
 
3058
- for xid in range(n_experts):
3059
- ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
3060
- datas.append(self._experts[bid][ename])
3061
- del self._experts[bid][ename]
4243
+ self.gguf_writer.add_tokenizer_model("t5")
4244
+ self.gguf_writer.add_tokenizer_pre("default")
4245
+ self.gguf_writer.add_token_list(tokens)
4246
+ self.gguf_writer.add_token_scores(scores)
4247
+ self.gguf_writer.add_token_types(toktypes)
4248
+ self.gguf_writer.add_add_space_prefix(add_prefix)
4249
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
4250
+ if precompiled_charsmap:
4251
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
3062
4252
 
3063
- data_torch = torch.stack(datas, dim=0)
4253
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
4254
+ special_vocab.add_to_gguf(self.gguf_writer)
3064
4255
 
3065
- merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
4256
+ self.gguf_writer.add_add_bos_token(False)
4257
+ self.gguf_writer.add_add_eos_token(True)
3066
4258
 
3067
- new_name = self.map_tensor_name(merged_name)
4259
+ def set_gguf_parameters(self):
4260
+ if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
4261
+ logger.warning("Couldn't find context length in config.json, assuming default value of 512")
4262
+ n_ctx = 512
4263
+ self.gguf_writer.add_context_length(n_ctx)
4264
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
4265
+ self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
4266
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
4267
+ self.gguf_writer.add_head_count(self.hparams["num_heads"])
4268
+ self.gguf_writer.add_key_length(self.hparams["d_kv"])
4269
+ self.gguf_writer.add_value_length(self.hparams["d_kv"])
4270
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
4271
+ self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
4272
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
4273
+ self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
4274
+ self.gguf_writer.add_file_type(self.ftype)
3068
4275
 
3069
- tensors.append((new_name, data_torch))
3070
- return tensors
4276
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4277
+ del bid # unused
4278
+
4279
+ # T5 based models contain shared token embeddings tensors saved variously as either "encoder.embed_tokens.weight",
4280
+ # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
4281
+ # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
4282
+ # and decoder and ignore the remaining ones.
4283
+ if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
4284
+ if not self.shared_token_embeddings_found:
4285
+ name = "shared.weight"
4286
+ self.shared_token_embeddings_found = True
3071
4287
  else:
4288
+ logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
3072
4289
  return []
3073
4290
 
3074
4291
  return [(self.map_tensor_name(name), data_torch)]
3075
4292
 
3076
- def prepare_tensors(self):
3077
- super().prepare_tensors()
3078
-
3079
- if self._experts is not None:
3080
- # flatten `list[dict[str, Tensor]]` into `list[str]`
3081
- experts = [k for d in self._experts for k in d.keys()]
3082
- if len(experts) > 0:
3083
- raise ValueError(f"Unprocessed experts: {experts}")
3084
-
3085
4293
 
3086
- @Model.register("T5WithLMHeadModel")
3087
- @Model.register("T5ForConditionalGeneration")
3088
- @Model.register("MT5ForConditionalGeneration")
3089
- @Model.register("UMT5ForConditionalGeneration")
3090
- class T5Model(Model):
3091
- model_arch = gguf.MODEL_ARCH.T5
4294
+ @Model.register("T5EncoderModel")
4295
+ class T5EncoderModel(Model):
4296
+ model_arch = gguf.MODEL_ARCH.T5ENCODER
3092
4297
 
3093
4298
  def __init__(self, *args, **kwargs):
3094
4299
  super().__init__(*args, **kwargs)
@@ -3205,7 +4410,6 @@ class T5Model(Model):
3205
4410
  self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
3206
4411
  self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
3207
4412
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
3208
- self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
3209
4413
  self.gguf_writer.add_file_type(self.ftype)
3210
4414
 
3211
4415
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -3240,10 +4444,7 @@ class JaisModel(Model):
3240
4444
 
3241
4445
  # Embeddings scale
3242
4446
  self.embeddings_scale = 1.0
3243
- # note: For some JAIS flavors, output is tied to (same as) wte in original model
3244
- self.output_is_wte = False
3245
4447
  if 'mup_embeddings_scale' in self.hparams:
3246
- self.output_is_wte = True # Hack (?)
3247
4448
  self.embeddings_scale = self.hparams['mup_embeddings_scale']
3248
4449
  elif 'embeddings_scale' in self.hparams:
3249
4450
  self.embeddings_scale = self.hparams['embeddings_scale']
@@ -3300,10 +4501,7 @@ class JaisModel(Model):
3300
4501
 
3301
4502
  if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
3302
4503
  tensors.append((new_name, data_torch * self.embeddings_scale))
3303
- if self.output_is_wte:
3304
- tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
3305
4504
  elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
3306
- assert not self.output_is_wte
3307
4505
  tensors.append((new_name, data_torch * self.width_scale))
3308
4506
  else:
3309
4507
  tensors.append((new_name, data_torch))
@@ -3315,7 +4513,7 @@ class JaisModel(Model):
3315
4513
  self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
3316
4514
 
3317
4515
 
3318
- @Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
4516
+ @Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
3319
4517
  class ChatGLMModel(Model):
3320
4518
  model_arch = gguf.MODEL_ARCH.CHATGLM
3321
4519
 
@@ -3421,47 +4619,15 @@ class ChatGLMModel(Model):
3421
4619
 
3422
4620
  from transformers import AutoTokenizer
3423
4621
  tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
3424
- vocab_size = hparams["padded_vocab_size"]
4622
+ vocab_size = hparams.get("padded_vocab_size", hparams["vocab_size"])
3425
4623
  assert max(tokenizer.get_vocab().values()) < vocab_size
3426
4624
 
3427
- tokpre = self.get_vocab_base_pre(tokenizer)
3428
-
3429
- merges = []
3430
- vocab = {}
3431
- mergeable_ranks = tokenizer.mergeable_ranks
3432
- for token, rank in mergeable_ranks.items():
3433
- vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
3434
- if len(token) == 1:
3435
- continue
3436
- merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
3437
- assert len(merged) >= 2 and len(merged) <= 7
3438
- merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
3439
-
3440
- # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
3441
- added_vocab = tokenizer.get_added_vocab()
3442
- reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
3443
-
3444
- for i in range(vocab_size):
3445
- if i not in reverse_vocab:
3446
- tokens.append(f"[PAD{i}]")
3447
- toktypes.append(gguf.TokenType.UNUSED)
3448
- elif reverse_vocab[i] in added_vocab:
3449
- tokens.append(reverse_vocab[i])
3450
- if tokenizer.added_tokens_decoder[i].special:
3451
- toktypes.append(gguf.TokenType.CONTROL)
3452
- else:
3453
- toktypes.append(gguf.TokenType.USER_DEFINED)
3454
- else:
3455
- tokens.append(reverse_vocab[i])
3456
- toktypes.append(gguf.TokenType.NORMAL)
3457
-
4625
+ tokens, toktypes, tokpre = self.get_vocab_base()
3458
4626
  self.gguf_writer.add_tokenizer_model("gpt2")
3459
4627
  self.gguf_writer.add_tokenizer_pre(tokpre)
3460
4628
  self.gguf_writer.add_token_list(tokens)
3461
4629
  self.gguf_writer.add_token_types(toktypes)
3462
-
3463
- special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
3464
- special_vocab.merges = merges
4630
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
3465
4631
  # only add special tokens when they were not already loaded from config.json
3466
4632
  special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
3467
4633
  special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
@@ -3472,16 +4638,20 @@ class ChatGLMModel(Model):
3472
4638
  def set_gguf_parameters(self):
3473
4639
  n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
3474
4640
  n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
3475
- n_head_kv = self.hparams.get("multi_query_group_num", n_head)
4641
+ n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
3476
4642
  self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
3477
4643
  self.gguf_writer.add_embedding_length(n_embed)
3478
- self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
3479
- self.gguf_writer.add_block_count(self.hparams["num_layers"])
4644
+ self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
4645
+ self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
3480
4646
  self.gguf_writer.add_head_count(n_head)
3481
4647
  self.gguf_writer.add_head_count_kv(n_head_kv)
3482
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
4648
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
3483
4649
  self.gguf_writer.add_file_type(self.ftype)
3484
- self.gguf_writer.add_rope_dimension_count(64)
4650
+ if "attention_dim" in self.hparams:
4651
+ rope_dim = self.hparams["attention_dim"]
4652
+ else:
4653
+ rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
4654
+ self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
3485
4655
  self.gguf_writer.add_add_bos_token(False)
3486
4656
  rope_freq = 10000
3487
4657
  if "rope_ratio" in self.hparams:
@@ -3491,14 +4661,224 @@ class ChatGLMModel(Model):
3491
4661
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3492
4662
  del bid # unused
3493
4663
 
3494
- if name.endswith(".rotary_pos_emb.inv_freq"):
4664
+ if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
3495
4665
  return []
3496
4666
 
3497
4667
  name = name.removeprefix("transformer.")
3498
4668
  return [(self.map_tensor_name(name), data_torch)]
3499
4669
 
3500
- ###### CONVERSION LOGIC ######
3501
4670
 
4671
+ @Model.register("NemotronForCausalLM")
4672
+ class NemotronModel(Model):
4673
+ model_arch = gguf.MODEL_ARCH.NEMOTRON
4674
+
4675
+ def set_vocab(self):
4676
+ self._set_vocab_sentencepiece()
4677
+ self.gguf_writer.add_pad_token_id(0)
4678
+ self.gguf_writer.add_unk_token_id(1)
4679
+
4680
+ def set_gguf_parameters(self):
4681
+ super().set_gguf_parameters()
4682
+ hparams = self.hparams
4683
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
4684
+
4685
+ f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
4686
+ self.gguf_writer.add_layer_norm_eps(f_norm_eps)
4687
+
4688
+ # * Partial RoPE
4689
+ rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
4690
+ n_embd = self.find_hparam(["hidden_size", "n_embd"])
4691
+ n_head = self.find_hparam(["num_attention_heads", "n_head"])
4692
+ self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
4693
+
4694
+ # * RopeScaling for Nemotron
4695
+ if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
4696
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
4697
+ else:
4698
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
4699
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
4700
+
4701
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4702
+ # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
4703
+ # model.layers.{l}.input_layernorm.weight
4704
+ # model.layers.{l}.post_attention_layernorm.weight
4705
+ # model.norm.weight
4706
+ if name.endswith("norm.weight"):
4707
+ data_torch = data_torch + 1
4708
+
4709
+ return [(self.map_tensor_name(name), data_torch)]
4710
+
4711
+
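A small numeric check of the layernorm1p trick above, under the assumption that llama.cpp applies a stock LayerNorm to the exported weight: baking the +1 into the weight at conversion time reproduces layernorm1p with no engine change.

import torch
import torch.nn.functional as F

x = torch.randn(3, 8)
w = torch.randn(8)
b = torch.randn(8)

# layernorm1p as computed in the original model: normalize, then scale by (w + 1)
y_ref = F.layer_norm(x, (8,)) * (w + 1) + b
# what a stock LayerNorm computes once the converter has stored (w + 1) as the weight
y_exported = F.layer_norm(x, (8,), weight=w + 1, bias=b)
assert torch.allclose(y_ref, y_exported, atol=1e-6)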
4712
+ @Model.register("ExaoneForCausalLM")
4713
+ class ExaoneModel(Model):
4714
+ model_arch = gguf.MODEL_ARCH.EXAONE
4715
+
4716
+ def set_gguf_parameters(self):
4717
+ hparams = self.hparams
4718
+
4719
+ assert (hparams["activation_function"] == "silu")
4720
+
4721
+ max_position_embeddings = hparams["max_position_embeddings"]
4722
+ embed_dim = hparams["hidden_size"]
4723
+ num_heads = hparams["num_attention_heads"]
4724
+ num_kv_heads = hparams.get("num_key_value_heads", num_heads)
4725
+ layer_norm_eps = hparams["layer_norm_epsilon"]
4726
+ intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
4727
+ num_layers = hparams["num_layers"]
4728
+ # ignore for now as EXAONE-3.0-7.8B-Instruct attention_dropout is 0.0
4729
+ # attention_dropout_rate = hparams["attention_dropout"]
4730
+ # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
4731
+ # embed_dropout_rate = hparams["embed_dropout"]
4732
+ self.gguf_writer.add_embedding_length(embed_dim)
4733
+ self.gguf_writer.add_head_count(num_heads)
4734
+ self.gguf_writer.add_head_count_kv(num_kv_heads)
4735
+ self.gguf_writer.add_context_length(max_position_embeddings)
4736
+ self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
4737
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
4738
+ self.gguf_writer.add_block_count(num_layers)
4739
+ self.gguf_writer.add_file_type(self.ftype)
4740
+
4741
+ if (rope_theta := self.hparams.get("rope_theta")) is not None:
4742
+ self.gguf_writer.add_rope_freq_base(rope_theta)
4743
+ rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
4744
+ rotary_factor = rotary_factor if rotary_factor is not None else 1.0
4745
+ self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
4746
+ if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
4747
+ if hparams["rope_scaling"].get("type") == "linear":
4748
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
4749
+ self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
4750
+
4751
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
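The llama3-style branch above leaves fast-rotating dimensions untouched, stretches slow-rotating ones by the full factor, and interpolates smoothly in between; the resulting factors are what the converter stores as the rope_freqs tensor. A minimal standalone sketch of the same computation (plain Python, with illustrative default values rather than any particular EXAONE config):

import math

def llama3_rope_factors(dim: int = 128, base: float = 10000.0, factor: float = 8.0,
                        low_freq_factor: float = 1.0, high_freq_factor: float = 4.0,
                        old_context_len: int = 8192) -> list[float]:
    # one inverse frequency per pair of rotary dimensions
    freqs = [1.0 / (base ** (i / dim)) for i in range(0, dim, 2)]
    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    factors = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:      # fast-rotating dims: leave unscaled
            factors.append(1.0)
        elif wavelen > low_freq_wavelen:     # slow-rotating dims: stretch by the full factor
            factors.append(factor)
        else:                                # interpolate smoothly in between
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            factors.append(1.0 / ((1 - smooth) / factor + smooth))
    return factors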
+
+
+ @Model.register("GraniteForCausalLM")
4782
+ class GraniteModel(LlamaModel):
4783
+ """Conversion for IBM's GraniteForCausalLM"""
4784
+ model_arch = gguf.MODEL_ARCH.GRANITE
4785
+
4786
+ def set_gguf_parameters(self):
4787
+ """Granite uses standard llama parameters with the following differences:
4788
+
4789
+ - No head_dim support
4790
+ - New multiplier params:
4791
+ - attention_scale
4792
+ - embedding_scale
4793
+ - residual_scale
4794
+ - logits_scaling
4795
+ """
4796
+ if head_dim := self.hparams.pop("head_dim", None):
4797
+ logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
4798
+ super().set_gguf_parameters()
4799
+ # NOTE: Convert _multiplier params to _scale params for naming
4800
+ # consistency
4801
+ if attention_scale := self.hparams.get("attention_multiplier"):
4802
+ self.gguf_writer.add_attention_scale(attention_scale)
4803
+ logger.info("gguf: (granite) attention_scale = %s", attention_scale)
4804
+ if embedding_scale := self.hparams.get("embedding_multiplier"):
4805
+ self.gguf_writer.add_embedding_scale(embedding_scale)
4806
+ logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
4807
+ if residual_scale := self.hparams.get("residual_multiplier"):
4808
+ self.gguf_writer.add_residual_scale(residual_scale)
4809
+ logger.info("gguf: (granite) residual_scale = %s", residual_scale)
4810
+ if logits_scale := self.hparams.get("logits_scaling"):
4811
+ self.gguf_writer.add_logit_scale(logits_scale)
4812
+ logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+ @Model.register("GraniteMoeForCausalLM")
4816
+ class GraniteMoeModel(GraniteModel):
4817
+ """Conversion for IBM's GraniteMoeForCausalLM"""
4818
+ model_arch = gguf.MODEL_ARCH.GRANITE_MOE
4819
+
4820
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4821
+ """In modeling_granitemoe, the JetMoe implementation of parallel experts
4822
+ is used. This essentially merges w1 and w3 into a single tensor with 2x
4823
+ the hidden size that is then split during forward. To keep compatibility
4824
+ with existing mixtral support, we pull them apart here.
4825
+ """
4826
+
4827
+ if name.endswith("block_sparse_moe.input_linear.weight"):
4828
+ ffn_dim = self.hparams["intermediate_size"]
4829
+ assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
4830
+ gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
4831
+ return [
4832
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
4833
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
4834
+ ]
4835
+
4836
+ return super().modify_tensors(data_torch, name, bid)
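A quick shape check of the split performed above, using a tiny dummy tensor (dimensions picked arbitrarily for illustration):

import torch

n_experts, ffn_dim, hidden = 4, 6, 8
merged = torch.randn(n_experts, 2 * ffn_dim, hidden)        # layout asserted above: 2 * intermediate_size on dim -2
gate, up = merged[..., :ffn_dim, :], merged[..., ffn_dim:, :]
assert gate.shape == up.shape == (n_experts, ffn_dim, hidden)
assert torch.equal(torch.cat([gate, up], dim=-2), merged)   # the two halves recombine to the original tensor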
+
+
+ @Model.register("ChameleonForConditionalGeneration")
4840
+ @Model.register("ChameleonForCausalLM") # obsolete
4841
+ class ChameleonModel(Model):
4842
+ model_arch = gguf.MODEL_ARCH.CHAMELEON
4843
+
4844
+ def set_gguf_parameters(self):
4845
+ super().set_gguf_parameters()
4846
+ self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
4847
+
4848
+ def set_vocab(self):
4849
+ self._set_vocab_gpt2()
4850
+
4851
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4852
+ # ignore image tokenizer for now
4853
+ # TODO: remove this once image support is implemented for Chameleon
4854
+ if name.startswith("model.vqmodel"):
4855
+ return []
4856
+
4857
+ n_head = self.hparams["num_attention_heads"]
4858
+ n_kv_head = self.hparams.get("num_key_value_heads")
4859
+ hidden_dim = self.hparams.get("hidden_size")
4860
+
4861
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
4862
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
4863
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
4864
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
4865
+ if name.endswith(("q_norm.weight", "q_norm.bias")):
4866
+ data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
4867
+ if name.endswith(("k_norm.weight", "k_norm.bias")):
4868
+ data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
4869
+
4870
+ return [(self.map_tensor_name(name), data_torch)]
4871
+
4872
+ # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
4873
+ @staticmethod
4874
+ def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
4875
+ head_dim = hidden_dim // n_heads
4876
+ data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
4877
+ data_torch = data_torch.repeat_interleave(n_heads, 0)
4878
+ return data_torch
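A small worked example of what _reverse_hf_permute does to a per-head norm weight (sizes are illustrative):

import torch

n_heads, hidden_dim = 2, 8
head_dim = hidden_dim // n_heads                 # 4
data = torch.arange(n_heads * head_dim, dtype=torch.float32).view(n_heads, head_dim)
# take head 0 ([0, 1, 2, 3]), re-interleave its two halves, then broadcast to every head
out = data[0].view(2, head_dim // 2).t().reshape(1, -1).repeat_interleave(n_heads, 0)
print(out)   # tensor([[0., 2., 1., 3.],
             #         [0., 2., 1., 3.]])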
+
+
+###### CONVERSION LOGIC ######
 
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
@@ -3578,8 +4958,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -3588,6 +4968,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "model", type=Path,
         help="directory containing model file",
+        nargs="?",
     )
     parser.add_argument(
         "--use-temp-file", action="store_true",
@@ -3625,8 +5006,15 @@ def parse_args() -> argparse.Namespace:
         "--metadata", type=Path,
         help="Specify the path for an authorship metadata override file"
     )
+    parser.add_argument(
+        "--print-supported-models", action="store_true",
+        help="Print the supported models"
+    )
 
-    return parser.parse_args()
+    args = parser.parse_args()
+    if not args.print_supported_models and args.model is None:
+        parser.error("the following arguments are required: model")
+    return args
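The change above makes the model path an optional positional argument and re-imposes the requirement by hand, so that --print-supported-models can run without a model directory. The same pattern in isolation, as a minimal sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("model", nargs="?", help="directory containing model file")
parser.add_argument("--print-supported-models", action="store_true")
args = parser.parse_args()
# the positional is optional only when the listing flag is given
if not args.print_supported_models and args.model is None:
    parser.error("the following arguments are required: model")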
 
 
 def split_str_to_n_bytes(split_str: str) -> int:
@@ -3650,6 +5038,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
 def main() -> None:
     args = parse_args()
 
+    if args.print_supported_models:
+        logger.error("Supported models:")
+        Model.print_registered_models()
+        sys.exit(0)
+
     if args.verbose:
         logging.basicConfig(level=logging.DEBUG)
     else:
@@ -3666,6 +5059,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
+        "auto": gguf.LlamaFileType.GUESSED,
     }
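With the two new entries in place, the converter can emit ternary GGUF files directly, e.g. (the path is a placeholder; tq1_0/tq2_0 only make sense for models trained with ternary weights, such as BitNet-style checkpoints):

python convert_hf_to_gguf.py /path/to/hf-model --outtype tq2_0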