bigdl-core-cpp 2.5.0b20240827__py3-none-win_amd64.whl → 2.6.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. bigdl/cpp/convert_hf_to_gguf.py +1196 -147
  2. bigdl/cpp/convert_hf_to_gguf_update.py +69 -42
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +0 -4
  4. bigdl/cpp/convert_lora_to_gguf.py +82 -14
  5. bigdl/cpp/gguf-py/gguf/constants.py +645 -187
  6. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  7. bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
  8. bigdl/cpp/gguf-py/gguf/gguf_writer.py +92 -16
  9. bigdl/cpp/gguf-py/gguf/lazy.py +0 -1
  10. bigdl/cpp/gguf-py/gguf/metadata.py +131 -19
  11. bigdl/cpp/gguf-py/gguf/quants.py +81 -0
  12. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +249 -38
  13. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  14. bigdl/cpp/gguf-py/gguf/vocab.py +24 -2
  15. bigdl/cpp/libs/common.lib +0 -0
  16. bigdl/cpp/libs/ggml-base.dll +0 -0
  17. bigdl/cpp/libs/ggml-cpu.dll +0 -0
  18. bigdl/cpp/libs/ggml-sycl.dll +0 -0
  19. bigdl/cpp/libs/ggml.dll +0 -0
  20. bigdl/cpp/libs/libc++.dll +0 -0
  21. bigdl/cpp/libs/llama-batched.exe +0 -0
  22. bigdl/cpp/libs/llama-bench.exe +0 -0
  23. bigdl/cpp/libs/llama-cli.exe +0 -0
  24. bigdl/cpp/libs/llama-embedding.exe +0 -0
  25. bigdl/cpp/libs/llama-gemma3-cli.exe +0 -0
  26. bigdl/cpp/libs/llama-gguf.exe +0 -0
  27. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  28. bigdl/cpp/libs/llama-lookup.exe +0 -0
  29. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  30. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  31. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  32. bigdl/cpp/libs/llama-quantize.exe +0 -0
  33. bigdl/cpp/libs/llama-server.exe +0 -0
  34. bigdl/cpp/libs/llama-simple.exe +0 -0
  35. bigdl/cpp/libs/llama-speculative.exe +0 -0
  36. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  37. bigdl/cpp/libs/llama.dll +0 -0
  38. bigdl/cpp/libs/llava_shared.dll +0 -0
  39. bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
  40. bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
  41. bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
  42. bigdl/cpp/libs/ollama-lib.exe +0 -0
  43. bigdl/cpp/libs/ollama.exe +0 -0
  44. bigdl/cpp/libs/ollama_ggml.dll +0 -0
  45. bigdl/cpp/libs/ollama_llama.dll +0 -0
  46. bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
  47. bigdl_core_cpp-2.6.0.data/scripts/init-ollama.bat +16 -0
  48. {bigdl_core_cpp-2.5.0b20240827.dist-info → bigdl_core_cpp-2.6.0.dist-info}/METADATA +9 -5
  49. bigdl_core_cpp-2.6.0.dist-info/RECORD +57 -0
  50. {bigdl_core_cpp-2.5.0b20240827.dist-info → bigdl_core_cpp-2.6.0.dist-info}/WHEEL +1 -1
  51. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll +0 -0
  52. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
  53. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
  54. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
  55. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
  56. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
  57. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
  58. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
  59. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
  60. bigdl_core_cpp-2.5.0b20240827.data/scripts/init-ollama.bat +0 -19
  61. bigdl_core_cpp-2.5.0b20240827.dist-info/RECORD +0 -54
  62. {bigdl_core_cpp-2.5.0b20240827.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.bat +0 -0
  63. {bigdl_core_cpp-2.5.0b20240827.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.ps1 +0 -0
  64. {bigdl_core_cpp-2.5.0b20240827.dist-info → bigdl_core_cpp-2.6.0.dist-info}/top_level.txt +0 -0
@@ -3,6 +3,7 @@

  from __future__ import annotations

+ import ast
  import logging
  import argparse
  import contextlib
@@ -14,6 +15,7 @@ from enum import IntEnum
  from pathlib import Path
  from hashlib import sha256
  from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+ from itertools import chain

  import math
  import numpy as np
@@ -70,7 +72,8 @@ class Model:
  def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
  use_temp_file: bool = False, eager: bool = False,
  metadata_override: Path | None = None, model_name: str | None = None,
- split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+ split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+ small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
  if type(self) is Model:
  raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -85,7 +88,7 @@ class Model:
  self.is_safetensors = len(self.part_names) > 0
  if not self.is_safetensors:
  self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
- self.hparams = Model.load_hparams(self.dir_model)
+ self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
  self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
  self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
  self.tensor_names = None
@@ -129,12 +132,14 @@ class Model:
  def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
  tensor_names_from_parts: set[str] = set()

- if len(self.part_names) > 1:
+ index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+ index_name += ".index.json"
+ index_file = self.dir_model / index_name
+
+ if index_file.is_file():
  self.tensor_names = set()
- index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
- index_name += ".index.json"
  logger.info(f"gguf: loading model weight map from '{index_name}'")
- with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
+ with open(index_file, "r", encoding="utf-8") as f:
  index: dict[str, Any] = json.load(f)
  weight_map = index.get("weight_map")
  if weight_map is None or not isinstance(weight_map, dict):
@@ -142,6 +147,7 @@ class Model:
  self.tensor_names.update(weight_map.keys())
  else:
  self.tensor_names = tensor_names_from_parts
+ weight_map = {}

  for part_name in self.part_names:
  logger.info(f"gguf: loading model part '{part_name}'")
@@ -168,9 +174,17 @@ class Model:
  data = LazyTorchTensor.from_eager(data)
  yield name, data

- # only verify tensor name presence; it doesn't matter if they are not in the right files
- if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
- raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
+ # verify tensor name presence and identify potentially missing files
+ if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+ missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+ extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+ missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+ if len(extra) == 0 and len(missing_files) > 0:
+ raise ValueError(f"Missing or incomplete model files: {missing_files}")
+ else:
+ raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+ f"Missing tensors: {missing}\n"
+ f"Extra tensors: {extra}")

  def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
  if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -207,17 +221,17 @@ class Model:
  self.gguf_writer.add_context_length(n_ctx)
  logger.info(f"gguf: context length = {n_ctx}")

- n_embd = self.find_hparam(["hidden_size", "n_embd"])
- self.gguf_writer.add_embedding_length(n_embd)
- logger.info(f"gguf: embedding length = {n_embd}")
+ if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+ self.gguf_writer.add_embedding_length(n_embd)
+ logger.info(f"gguf: embedding length = {n_embd}")

  if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
  self.gguf_writer.add_feed_forward_length(n_ff)
  logger.info(f"gguf: feed forward length = {n_ff}")

- n_head = self.find_hparam(["num_attention_heads", "n_head"])
- self.gguf_writer.add_head_count(n_head)
- logger.info(f"gguf: head count = {n_head}")
+ if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+ self.gguf_writer.add_head_count(n_head)
+ logger.info(f"gguf: head count = {n_head}")

  if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
  self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -256,10 +270,14 @@ class Model:

  return False

+ # some models need extra generated tensors (like rope_freqs)
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ return ()
+
  def prepare_tensors(self):
  max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

- for name, data_torch in self.get_tensors():
+ for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
  # we don't need these
  if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
  continue
@@ -277,8 +295,15 @@ class Model:
  bid = int(part)
  break

- for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
- data: np.ndarray # type hint
+ for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+ # TODO: why do we squeeze here?
+ # data = data_torch.squeeze().numpy()
+ data = data_torch.numpy()
+
+ # if data ends up empty, it means data_torch was a scalar tensor -> restore
+ if len(data.shape) == 0:
+ data = data_torch.numpy()
+
  n_dims = len(data.shape)
  data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
@@ -296,12 +321,34 @@ class Model:
  gguf.MODEL_TENSOR.POS_EMBD,
  gguf.MODEL_TENSOR.TOKEN_TYPES,
  gguf.MODEL_TENSOR.SSM_CONV1D,
+ gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+ gguf.MODEL_TENSOR.TIME_MIX_W1,
+ gguf.MODEL_TENSOR.TIME_MIX_W2,
+ gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+ gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+ gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+ gguf.MODEL_TENSOR.POSNET_NORM1,
+ gguf.MODEL_TENSOR.POSNET_NORM2,
  )
  )
- or not name.endswith(".weight")
+ or not new_name.endswith(".weight")
  ):
  data_qtype = gguf.GGMLQuantizationType.F32

+ if data_qtype is False and any(
+ self.match_model_tensor_name(new_name, key, bid)
+ for key in (
+ gguf.MODEL_TENSOR.TOKEN_EMBD,
+ gguf.MODEL_TENSOR.OUTPUT,
+ )
+ ):
+ if self.ftype in (
+ gguf.LlamaFileType.MOSTLY_TQ1_0,
+ gguf.LlamaFileType.MOSTLY_TQ2_0,
+ ):
+ # TODO: use Q4_K and Q6_K
+ data_qtype = gguf.GGMLQuantizationType.F16
+
  # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
  if isinstance(data_qtype, bool):
  if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -312,6 +359,10 @@ class Model:
  data_qtype = gguf.GGMLQuantizationType.BF16
  elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
  data_qtype = gguf.GGMLQuantizationType.Q8_0
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+ data_qtype = gguf.GGMLQuantizationType.TQ1_0
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+ data_qtype = gguf.GGMLQuantizationType.TQ2_0
  else:
  raise ValueError(f"Unknown file type: {self.ftype.name}")

@@ -427,6 +478,11 @@ class Model:
  return modelcls
  return func

+ @classmethod
+ def print_registered_models(cls):
+ for name in sorted(cls._model_classes.keys()):
+ logger.error(f"- {name}")
+
  @classmethod
  def from_model_architecture(cls, arch: str) -> type[Model]:
  try:
@@ -479,9 +535,19 @@ class Model:
  else:
  token: str = reverse_vocab[i]
  if token in added_vocab:
+ # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+ # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+ if not tokenizer.added_tokens_decoder[i].normalized:
+ previous_token = token
+ token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+ if previous_token != token:
+ logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
  if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
  toktypes.append(gguf.TokenType.CONTROL)
  else:
+ # NOTE: this was added for Gemma.
+ # Encoding and decoding the tokens above isn't sufficient for this case.
  token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
  toktypes.append(gguf.TokenType.USER_DEFINED)
  else:
@@ -492,7 +558,7 @@ class Model:

  # NOTE: this function is generated by convert_hf_to_gguf_update.py
  # do not modify it manually!
- # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+ # ref: https://github.com/ggml-org/llama.cpp/pull/6920
  # Marker: Start get_vocab_base_pre
  def get_vocab_base_pre(self, tokenizer) -> str:
  # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -525,9 +591,15 @@ class Model:
  if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
  # ref: https://huggingface.co/tiiuae/falcon-7b
  res = "falcon"
+ if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+ # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+ res = "falcon3"
  if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
  # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
  res = "bert-bge"
+ if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+ # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+ res = "bert-bge-large"
  if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
  # ref: https://huggingface.co/mosaicml/mpt-7b
  res = "mpt"
@@ -555,6 +627,9 @@ class Model:
  if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
  # ref: https://huggingface.co/databricks/dbrx-base
  res = "dbrx"
+ if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+ # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+ res = "jina-v1-en"
  if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
  # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
  res = "jina-v2-en"
@@ -573,7 +648,7 @@ class Model:
  if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
  # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
  res = "jina-v2-code"
- if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+ if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
  # ref: https://huggingface.co/THUDM/glm-4-9b-chat
  res = "chatglm-bpe"
  if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -600,6 +675,30 @@ class Model:
  if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
  # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
  res = "exaone"
+ if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+ # ref: https://huggingface.co/microsoft/phi-2
+ res = "phi-2"
+ if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+ # ref: https://huggingface.co/facebook/chameleon-7b
+ res = "chameleon"
+ if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+ # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+ res = "minerva-7b"
+ if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+ # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+ res = "roberta-bpe"
+ if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+ # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+ res = "gigachat"
+ if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+ # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+ res = "megrez"
+ if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+ # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+ res = "deepseek-v3"
+ if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+ # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+ res = "deepseek-r1-qwen"

  if res is None:
  logger.warning("\n")
@@ -609,7 +708,7 @@ class Model:
  logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
  logger.warning("** - the pre-tokenization config has changed upstream")
  logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
- logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+ logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
  logger.warning("**")
  logger.warning(f"** chkhsh: {chkhsh}")
  logger.warning("**************************************************************************************")
@@ -622,6 +721,9 @@ class Model:
  return res
  # Marker: End get_vocab_base_pre

+ def _set_vocab_none(self) -> None:
+ self.gguf_writer.add_tokenizer_model("none")
+
  def _set_vocab_gpt2(self) -> None:
  tokens, toktypes, tokpre = self.get_vocab_base()
  self.gguf_writer.add_tokenizer_model("gpt2")
@@ -1458,7 +1560,7 @@ class StableLMModel(Model):
  raise ValueError(f"Unprocessed norms: {norms}")


- @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+ @Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
  class LlamaModel(Model):
  model_arch = gguf.MODEL_ARCH.LLAMA

@@ -1484,6 +1586,17 @@ class LlamaModel(Model):
  special_vocab._set_special_token("eot", 32010)
  special_vocab.add_to_gguf(self.gguf_writer)

+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ if "add_prefix_space" in tokenizer_config_json:
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+ # Apply to granite small models only
+ if self.hparams.get("vocab_size", 32000) == 49152:
+ self.gguf_writer.add_add_bos_token(False)
+
  def set_gguf_parameters(self):
  super().set_gguf_parameters()
  hparams = self.hparams
@@ -1500,17 +1613,6 @@ class LlamaModel(Model):
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

- tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
- if tokenizer_config_file.is_file():
- with open(tokenizer_config_file, "r", encoding="utf-8") as f:
- tokenizer_config_json = json.load(f)
- if "add_prefix_space" in tokenizer_config_json:
- self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
- # Apply to granite small models only
- if self.hparams.get("vocab_size", 32000) == 49152:
- self.gguf_writer.add_add_bos_token(False)
-
  @staticmethod
  def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
  if n_head_kv is not None and n_head != n_head_kv:
@@ -1566,11 +1668,11 @@ class LlamaModel(Model):

  return [(self.map_tensor_name(name), data_torch)]

- def prepare_tensors(self):
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
  if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
  if rope_scaling.get("rope_type", '').lower() == "llama3":
  base = self.hparams.get("rope_theta", 10000.0)
- dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+ dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
  freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

  factor = rope_scaling.get("factor", 8.0)
@@ -1593,8 +1695,9 @@ class LlamaModel(Model):
  smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
  rope_factors.append(1 / ((1 - smooth) / factor + smooth))

- self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))

+ def prepare_tensors(self):
  super().prepare_tensors()

  if self._experts is not None:
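The hunk above moves the llama3 rope_freqs tensor out of prepare_tensors() and into the new generate_extra_tensors() hook. As a minimal standalone sketch (not part of the package; the function name and default values below are illustrative), the same per-frequency correction factors can be computed as:

    import math

    def llama3_rope_factors(dim: int, base: float = 10000.0, factor: float = 8.0,
                            low_freq_factor: float = 1.0, high_freq_factor: float = 4.0,
                            old_context_len: int = 8192) -> list[float]:
        # inverse frequencies of the rotary embedding, one per pair of dimensions
        freqs = [1.0 / (base ** (i / dim)) for i in range(0, dim, 2)]
        low_freq_wavelen = old_context_len / low_freq_factor
        high_freq_wavelen = old_context_len / high_freq_factor
        factors = []
        for freq in freqs:
            wavelen = 2 * math.pi / freq
            if wavelen < high_freq_wavelen:
                factors.append(1.0)        # high-frequency bands are kept as-is
            elif wavelen > low_freq_wavelen:
                factors.append(factor)     # low-frequency bands are fully rescaled
            else:
                # smooth interpolation between the two regimes
                smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
        return factors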
@@ -1604,6 +1707,178 @@ class LlamaModel(Model):
  raise ValueError(f"Unprocessed experts: {experts}")


+ @Model.register("DeciLMForCausalLM")
+ class DeciModel(Model):
+ model_arch = gguf.MODEL_ARCH.DECI
+
+ @staticmethod
+ def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+ # DeciLM-specific code
+ intermediate_size = int(2 * ffn_mult * n_embd / 3)
+ return DeciModel._find_multiple(intermediate_size, 256)
+
+ @staticmethod
+ def _find_multiple(n: int, k: int) -> int:
+ # DeciLM-specific code
+ if n % k == 0:
+ return n
+ return n + k - (n % k)
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+ _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+ assert self.block_count == len(_block_configs)
+ self._num_kv_heads = list()
+ self._num_heads = list()
+ _ffn_multipliers = list()
+ # ***linear attention layer***
+ # if n_heads_in_group is None and replace_with_linear is True
+ # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+ # ***attention-free layer***
+ # if n_heads_in_group is None and replace_with_linear is False
+ # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+ # ***normal attention-layer***
+ # if n_heads_in_group is not None, then
+ # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+ # _num_heads[il] is num_attention_head
+ for il in range(len(_block_configs)):
+ if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+ if _block_configs[il]["attention"]["replace_with_linear"] is True:
+ self._num_kv_heads.append(0)
+ self._num_heads.append(self.hparams["num_attention_heads"])
+ else:
+ self._num_kv_heads.append(0)
+ self._num_heads.append(0)
+ else:
+ self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+ self._num_heads.append(self.hparams["num_attention_heads"])
+ _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+ assert self.block_count == len(self._num_kv_heads)
+ assert self.block_count == len(self._num_heads)
+ assert self.block_count == len(_ffn_multipliers)
+ assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+ assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+ assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+ self._ffn_dims: list[int] = [
+ DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+ for multiplier in _ffn_multipliers
+ ]
+
+ def set_vocab(self):
+ # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+ # eos_token from '|eot_id|' to '|end_of_text|'
+ if self.hparams.get("vocab_size", 128256) == 128256:
+ tokens, toktypes, tokpre = self.get_vocab_base()
+ self.gguf_writer.add_tokenizer_model("gpt2")
+ self.gguf_writer.add_tokenizer_pre(tokpre)
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_types(toktypes)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+ special_vocab.add_to_gguf(self.gguf_writer)
+ else:
+ # DeciLM-7B
+ self._set_vocab_llama_hf()
+
+ def set_gguf_parameters(self):
+ if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+ assert self.block_count == len(self._num_kv_heads)
+ assert self.block_count == len(self._num_heads)
+ assert self.block_count == len(self._ffn_dims)
+ if (rope_theta := self.hparams.get("rope_theta")) is not None:
+ self.gguf_writer.add_rope_freq_base(rope_theta)
+ self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+ self.gguf_writer.add_head_count(self._num_heads)
+ self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+ self.gguf_writer.add_block_count(self.block_count)
+ self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+ self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+ self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+ self.gguf_writer.add_file_type(self.ftype)
+ else: # DeciLM-7B
+ super().set_gguf_parameters()
+ if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
+ self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+ assert self.block_count == len(self._num_kv_heads)
+ self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+ hparams = self.hparams
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+ if "head_dim" in hparams:
+ rope_dim = hparams["head_dim"]
+ else:
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "linear":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+ @staticmethod
+ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+ if n_head_kv is not None and n_head != n_head_kv:
+ n_head = n_head_kv
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+ .swapaxes(1, 2)
+ .reshape(weights.shape))
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ n_head = self.hparams["num_attention_heads"]
+ if bid is not None:
+ if "num_key_value_heads_per_layer" in self.hparams:
+ n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+ elif "block_configs" in self.hparams:
+ n_kv_head = self._num_kv_heads[bid]
+ n_head = self._num_heads[bid]
+ else:
+ n_kv_head = self.hparams.get("num_key_value_heads")
+ else:
+ n_kv_head = self.hparams.get("num_key_value_heads")
+
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
+ data_torch = DeciModel.permute(data_torch, n_head, n_head)
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
+ data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+ if rope_scaling.get("rope_type", '').lower() == "llama3":
+ base = self.hparams.get("rope_theta", 10000.0)
+ dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+ freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+ factor = rope_scaling.get("factor", 8.0)
+ low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+ high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+ old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+ low_freq_wavelen = old_context_len / low_freq_factor
+ high_freq_wavelen = old_context_len / high_freq_factor
+ assert low_freq_wavelen != high_freq_wavelen
+
+ rope_factors = []
+ for freq in freqs:
+ wavelen = 2 * math.pi / freq
+ if wavelen < high_freq_wavelen:
+ rope_factors.append(1)
+ elif wavelen > low_freq_wavelen:
+ rope_factors.append(factor)
+ else:
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+ rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+ def prepare_tensors(self):
+ super().prepare_tensors()
+
+
  @Model.register("BitnetForCausalLM")
  class BitnetModel(Model):
  model_arch = gguf.MODEL_ARCH.BITNET
@@ -1616,15 +1891,16 @@ class BitnetModel(Model):
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
  self.gguf_writer.add_rope_scaling_factor(1.0)

- def weight_quant(self, weight):
+ def weight_quant(self, weight: Tensor) -> Tensor:
  dtype = weight.dtype
  weight = weight.float()
- s = 1 / weight.abs().mean().clamp(min=1e-5)
- weight = (weight * s).round().clamp(-1, 1) / s
- scale = weight.abs().max().unsqueeze(0)
- weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
- weight = torch.sign(weight).type(dtype)
- return weight.type(dtype), scale.type(torch.float32)
+ scale = weight.abs().mean().clamp(min=1e-5)
+ iscale = 1 / scale
+ # TODO: multiply by the scale directly instead of inverting it twice
+ # (this is also unnecessarily doubly inverted upstream)
+ # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+ result = (weight * iscale).round().clamp(-1, 1) / iscale
+ return result.type(dtype)

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  new_name = self.map_tensor_name(name)
@@ -1639,11 +1915,9 @@ class BitnetModel(Model):
  gguf.MODEL_TENSOR.FFN_GATE,
  ]):
  # transform weight into 1/0/-1 (in fp32)
- weight_torch, scale_torch = self.weight_quant(data_torch)
- yield (new_name, weight_torch)
- yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
- else:
- yield (new_name, data_torch)
+ data_torch = self.weight_quant(data_torch)
+
+ yield (new_name, data_torch)


  @Model.register("GrokForCausalLM")
@@ -1773,29 +2047,40 @@ class MiniCPMModel(Model):
  model_arch = gguf.MODEL_ARCH.MINICPM

  def set_gguf_parameters(self):
- block_count = self.hparams["num_hidden_layers"]
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
- self.gguf_writer.add_file_type(self.ftype)
+ super().set_gguf_parameters()
+ embedding_scale = float(self.hparams["scale_emb"])
+ self.gguf_writer.add_embedding_scale(embedding_scale)
+ logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+ residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+ self.gguf_writer.add_residual_scale(residual_scale)
+ logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+ logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+ self.gguf_writer.add_logit_scale(logit_scale)
+ logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+ if self.hparams.get("rope_scaling") is not None:
+ if self.hparams["rope_scaling"].get("type") == "longrope":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+ logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

- def set_vocab(self):
- self._set_vocab_llama_hf()
+ rope_scaling = self.find_hparam(['rope_scaling'], True)
+ if rope_scaling is not None:
+ long_factors = rope_scaling.get('long_factor', None)
+ short_factors = rope_scaling.get('short_factor', None)

- def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
- if n_kv_head is not None and n_head != n_kv_head:
- n_head = n_kv_head
+ if long_factors is None or short_factors is None:
+ raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

- return (
- weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
- .swapaxes(1, 2)
- .reshape(weights.shape)
- )
+ if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  del bid # unused
@@ -1805,13 +2090,66 @@ class MiniCPMModel(Model):

  # HF models permute some of the tensors, so we need to undo that
  if name.endswith(("q_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
  if name.endswith(("k_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

  return [(self.map_tensor_name(name), data_torch)]


+ @Model.register("MiniCPM3ForCausalLM")
+ class MiniCPM3Model(Model):
+ model_arch = gguf.MODEL_ARCH.MINICPM3
+
+ def set_gguf_parameters(self):
+ hparams = self.hparams
+
+ self.gguf_writer.add_file_type(self.ftype)
+ self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+ self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+ self.gguf_writer.add_block_count(self.block_count)
+ self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+ self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+ self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+ self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+ if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+ self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+ self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+ self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ rope_scaling = self.find_hparam(['rope_scaling'], True)
+ if rope_scaling is not None:
+ rope_dims = self.hparams["qk_rope_head_dim"]
+
+ long_factors = rope_scaling.get('long_factor', None)
+ short_factors = rope_scaling.get('short_factor', None)
+
+ if long_factors is None or short_factors is None:
+ raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+ if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()
+
+ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+ if n_kv_head is not None and n_head != n_kv_head:
+ n_head //= n_kv_head
+
+ return (
+ weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+ .swapaxes(1, 2)
+ .reshape(weights.shape)
+ )
+
+
  @Model.register("QWenLMHeadModel")
  class QwenModel(Model):
  model_arch = gguf.MODEL_ARCH.QWEN
@@ -1864,6 +2202,75 @@ class Qwen2Model(Model):
  except FileNotFoundError:
  self._set_vocab_gpt2()

+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "yarn":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
+ @Model.register("Qwen2VLForConditionalGeneration")
+ class Qwen2VLModel(Model):
+ model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+ mrope_section += [0] * max(0, 4 - len(mrope_section))
+ self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+ def set_vocab(self):
+ try:
+ self._set_vocab_sentencepiece()
+ except FileNotFoundError:
+ self._set_vocab_gpt2()
+
+ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+ for name, data in super().get_tensors():
+ if name.startswith("visual."):
+ continue
+ yield name, data
+
+
+ @Model.register("WavTokenizerDec")
+ class WavTokenizerDecModel(Model):
+ model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ if \
+ name.endswith("codebook.cluster_size") or \
+ name.endswith("codebook.embed_avg") or \
+ name.endswith("codebook.inited"):
+ logger.debug(f"Skipping {name!r}")
+ return []
+
+ logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def set_vocab(self):
+ self._set_vocab_none()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_vocab_size (self.hparams["vocab_size"])
+ self.gguf_writer.add_features_length (self.hparams["n_embd_features"])
+ self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
+ self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"])
+ self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"])
+
+ self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+ self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"])
+
+ self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+ self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"])
+
+ self.gguf_writer.add_causal_attention(False)
+

  @Model.register("Qwen2MoeForCausalLM")
  class Qwen2MoeModel(Model):
@@ -1993,6 +2400,15 @@ class Phi3MiniModel(Model):
  model_arch = gguf.MODEL_ARCH.PHI3

  def set_vocab(self):
+ # Phi-4 model uses GPT2Tokenizer
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ tokenizer_class = tokenizer_config_json['tokenizer_class']
+ if tokenizer_class == 'GPT2Tokenizer':
+ return self._set_vocab_gpt2()
+
  from sentencepiece import SentencePieceProcessor

  tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2109,7 +2525,18 @@ class Phi3MiniModel(Model):
  self.gguf_writer.add_rope_dimension_count(rope_dims)
  self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
  self.gguf_writer.add_file_type(self.ftype)
- self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
+ sliding_window = self.hparams.get("sliding_window")
+ # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+ if sliding_window is None:
+ sliding_window = 0
+ self.gguf_writer.add_sliding_window(sliding_window)
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ n_embd = self.find_hparam(["hidden_size", "n_embd"])
+ n_head = self.find_hparam(["num_attention_heads", "n_head"])
+ max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+ orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+ rope_dims = n_embd // n_head

  # write rope scaling for long context (128k) model
  rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2140,27 +2567,84 @@ class Phi3MiniModel(Model):
  if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
  raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')

- self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
- self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))


- @Model.register("PlamoForCausalLM")
- class PlamoModel(Model):
- model_arch = gguf.MODEL_ARCH.PLAMO
+ @Model.register("PhiMoEForCausalLM")
+ class PhiMoeModel(Phi3MiniModel):
+ model_arch = gguf.MODEL_ARCH.PHIMOE

- def set_vocab(self):
- self._set_vocab_sentencepiece()
+ _experts: list[dict[str, Tensor]] | None = None

  def set_gguf_parameters(self):
- hparams = self.hparams
- block_count = hparams["num_hidden_layers"]
+ super().set_gguf_parameters()
+ self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+ self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])

- self.gguf_writer.add_context_length(4096) # not in config.json
- self.gguf_writer.add_embedding_length(hparams["hidden_size"])
- self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(hparams["num_attention_heads"])
- self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # process the experts separately
+ if name.find("block_sparse_moe.experts") != -1:
+ n_experts = self.hparams["num_local_experts"]
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for w_name in ["w1", "w2", "w3"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def prepare_tensors(self):
+ super().prepare_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
+ @Model.register("PlamoForCausalLM")
+ class PlamoModel(Model):
+ model_arch = gguf.MODEL_ARCH.PLAMO
+
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()
+
+ def set_gguf_parameters(self):
+ hparams = self.hparams
+ block_count = hparams["num_hidden_layers"]
+
+ self.gguf_writer.add_context_length(4096) # not in config.json
+ self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+ self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+ self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+ self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
  self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
  self.gguf_writer.add_file_type(self.ftype)

@@ -2351,7 +2835,7 @@ class InternLM2Model(Model):
2351
2835
  if chat_eos_token_id is not None:
2352
2836
  # For the chat model, we replace the eos with '<|im_end|>'.
2353
2837
  # TODO: this is a hack, should be fixed
2354
- # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
2838
+ # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
2355
2839
  special_vocab.special_token_ids["eos"] = chat_eos_token_id
2356
2840
  logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
2357
2841
  " in chat mode so that the conversation can end normally.")
@@ -2401,7 +2885,67 @@ class InternLM2Model(Model):
2401
2885
  return [(self.map_tensor_name(name), data_torch)]
2402
2886
 
2403
2887
 
2404
- @Model.register("BertModel", "CamembertModel")
2888
+ @Model.register("InternLM3ForCausalLM")
2889
+ class InternLM3Model(Model):
2890
+ model_arch = gguf.MODEL_ARCH.LLAMA
2891
+
2892
+ def set_vocab(self):
2893
+ tokens, scores, toktypes = self._create_vocab_sentencepiece()
2894
+
2895
+ self.gguf_writer.add_tokenizer_model("llama")
2896
+ self.gguf_writer.add_tokenizer_pre("default")
2897
+ self.gguf_writer.add_token_list(tokens)
2898
+ self.gguf_writer.add_token_scores(scores)
2899
+ self.gguf_writer.add_token_types(toktypes)
2900
+
2901
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2902
+
2903
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2904
+ if tokenizer_config_file.is_file():
2905
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2906
+ tokenizer_config_json = json.load(f)
2907
+ if "add_prefix_space" in tokenizer_config_json:
2908
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
2909
+
2910
+ if "added_tokens_decoder" in tokenizer_config_json:
2911
+ for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
2912
+ if token_data.get("special"):
2913
+ token_id = int(token_id)
2914
+ token = token_data["content"]
2915
+ special_vocab._set_special_token(token, token_id)
2916
+ # update eos token
2917
+ if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
2918
+ special_vocab.special_token_ids["eos"] = token_id
2919
+
2920
+ special_vocab.add_to_gguf(self.gguf_writer)
2921
+
2922
+ def set_gguf_parameters(self):
2923
+ super().set_gguf_parameters()
2924
+ hparams = self.hparams
2925
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
2926
+
2927
+ if "head_dim" in hparams:
2928
+ rope_dim = hparams["head_dim"]
2929
+ else:
2930
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
2931
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
2932
+
2933
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
2934
+ if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
2935
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2936
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
2937
+
2938
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2939
+ n_head = self.hparams["num_attention_heads"]
2940
+ n_kv_head = self.hparams.get("num_key_value_heads")
2941
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
2942
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
2943
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
2944
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
2945
+ return [(self.map_tensor_name(name), data_torch)]
2946
+
2947
+
2948
+ @Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
2405
2949
  class BertModel(Model):
2406
2950
  model_arch = gguf.MODEL_ARCH.BERT
2407
2951
 
@@ -2442,7 +2986,8 @@ class BertModel(Model):
2442
2986
 
2443
2987
  # we need this to validate the size of the token_type embeddings
2444
2988
  # though currently we are passing all zeros to the token_type embeddings
2445
- self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
2989
+ # "Sequence A" or "Sequence B"
2990
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
2446
2991
 
2447
2992
  # convert to phantom space vocab
2448
2993
  def phantom(tok):
@@ -2466,13 +3011,73 @@ class BertModel(Model):
2466
3011
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2467
3012
  del bid # unused
2468
3013
 
3014
+ if name.startswith("bert."):
3015
+ name = name[5:]
3016
+
3017
+ if name.endswith(".gamma"):
3018
+ name = name[:-6] + ".weight"
3019
+
3020
+ if name.endswith(".beta"):
3021
+ name = name[:-5] + ".bias"
3022
+
2469
3023
  # we are only using BERT for embeddings so we don't need the pooling layer
2470
3024
  if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
2471
3025
  return [] # we don't need these
2472
3026
 
3027
+ if name.startswith("cls.predictions"):
3028
+ return []
3029
+
3030
+ if name.startswith("cls.seq_relationship"):
3031
+ return []
3032
+
2473
3033
  return [(self.map_tensor_name(name), data_torch)]
2474
3034
 
2475
3035
 
3036
+ @Model.register("RobertaModel")
3037
+ class RobertaModel(BertModel):
3038
+ model_arch = gguf.MODEL_ARCH.BERT
3039
+
3040
+ def __init__(self, *args, **kwargs):
3041
+ super().__init__(*args, **kwargs)
3042
+
3043
+ # we need the pad_token_id to know how to chop down position_embd matrix
3044
+ if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
3045
+ self._position_offset = 1 + pad_token_id
3046
+ if "max_position_embeddings" in self.hparams:
3047
+ self.hparams["max_position_embeddings"] -= self._position_offset
3048
+ else:
3049
+ self._position_offset = None
3050
+
3051
+ def set_vocab(self):
3052
+ """Support BPE tokenizers for roberta models"""
3053
+ bpe_tok_path = self.dir_model / "tokenizer.json"
3054
+ if bpe_tok_path.exists():
3055
+ self._set_vocab_gpt2()
3056
+ self.gguf_writer.add_add_bos_token(True)
3057
+ self.gguf_writer.add_add_eos_token(True)
3058
+
3059
+ # we need this to validate the size of the token_type embeddings
3060
+ # though currently we are passing all zeros to the token_type embeddings
3061
+ # "Sequence A" or "Sequence B"
3062
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
3063
+
3064
+ else:
3065
+ return super().set_vocab()
3066
+
3067
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3068
+ # if name starts with "roberta.", remove the prefix
3069
+ # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
3070
+ if name.startswith("roberta."):
3071
+ name = name[8:]
3072
+
3073
+ # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
3074
+ if name == "embeddings.position_embeddings.weight":
3075
+ if self._position_offset is not None:
3076
+ data_torch = data_torch[self._position_offset:,:]
3077
+
3078
+ return super().modify_tensors(data_torch, name, bid)
3079
+
3080
+
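A quick numeric sketch of the position-embedding offset handled above: RoBERTa-style checkpoints reserve the first pad_token_id + 1 rows of the position-embedding matrix, so the converter slices them off and reduces max_position_embeddings by the same amount. The concrete numbers (pad_token_id 1, 514 positions, width 768) are typical of RoBERTa configs but used here only as an illustration.

    import torch

    pad_token_id = 1
    position_offset = 1 + pad_token_id                     # rows 0..1 never hold real positions
    max_position_embeddings = 514                          # value as stored in the HF config
    usable_positions = max_position_embeddings - position_offset

    pos_embd = torch.zeros(max_position_embeddings, 768)   # stand-in for embeddings.position_embeddings.weight
    pos_embd = pos_embd[position_offset:, :]               # same slicing as modify_tensors above
    print(usable_positions, pos_embd.shape)                # 512 torch.Size([512, 768])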
2476
3081
  @Model.register("NomicBertModel")
2477
3082
  class NomicBertModel(BertModel):
2478
3083
  model_arch = gguf.MODEL_ARCH.NOMIC_BERT
@@ -2503,7 +3108,7 @@ class NomicBertModel(BertModel):
2503
3108
  self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
2504
3109
 
2505
3110
 
2506
- @Model.register("XLMRobertaModel")
3111
+ @Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
2507
3112
  class XLMRobertaModel(BertModel):
2508
3113
  model_arch = gguf.MODEL_ARCH.BERT
2509
3114
 
@@ -2589,7 +3194,7 @@ class XLMRobertaModel(BertModel):
2589
3194
  self.gguf_writer.add_token_scores(scores)
2590
3195
  self.gguf_writer.add_token_types(toktypes)
2591
3196
  self.gguf_writer.add_add_space_prefix(add_prefix)
2592
- self.gguf_writer.add_token_type_count(1)
3197
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
2593
3198
  self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
2594
3199
  if precompiled_charsmap:
2595
3200
  self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
@@ -2601,6 +3206,11 @@ class XLMRobertaModel(BertModel):
2601
3206
  self.gguf_writer.add_add_eos_token(True)
2602
3207
 
2603
3208
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3209
+ # if name starts with "roberta.", remove the prefix
3210
+ # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
3211
+ if name.startswith("roberta."):
3212
+ name = name[8:]
3213
+
2604
3214
  # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
2605
3215
  if name == "embeddings.position_embeddings.weight":
2606
3216
  if self._position_offset is not None:
@@ -2712,6 +3322,164 @@ class StarCoder2Model(Model):
2712
3322
  model_arch = gguf.MODEL_ARCH.STARCODER2
2713
3323
 
2714
3324
 
3325
+ @Model.register("Rwkv6ForCausalLM")
3326
+ class Rwkv6Model(Model):
3327
+ model_arch = gguf.MODEL_ARCH.RWKV6
3328
+
3329
+ def set_vocab(self):
3330
+ assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
3331
+ vocab_size = self.hparams.get("vocab_size", 65536)
3332
+
3333
+ tokens: list[bytes] = ['<s>'.encode("utf-8")]
3334
+ toktypes: list[int] = [gguf.TokenType.CONTROL]
3335
+
3336
+ with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
3337
+ lines = f.readlines()
3338
+ for line in lines:
3339
+ parts = line.split(' ')
3340
+ assert len(parts) >= 3
3341
+ token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
3342
+ token = token.encode("utf-8") if isinstance(token, str) else token
3343
+ assert isinstance(token, bytes)
3344
+ assert len(token) == token_len
3345
+ token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
3346
+ tokens.append(token_text.encode("utf-8"))
3347
+ toktypes.append(gguf.TokenType.NORMAL)
3348
+ remainder = vocab_size - len(tokens)
3349
+ assert remainder >= 0
3350
+ for i in range(len(tokens), vocab_size):
3351
+ tokens.append(f"[PAD{i}]".encode("utf-8"))
3352
+ toktypes.append(gguf.TokenType.UNUSED)
3353
+
3354
+ self.gguf_writer.add_tokenizer_model("rwkv")
3355
+ self.gguf_writer.add_token_list(tokens)
3356
+ self.gguf_writer.add_token_types(toktypes)
3357
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
3358
+ special_vocab.chat_template = "rwkv-world"
3359
+ # hack: Add '\n\n' as the EOT token to make it chat normally
3360
+ special_vocab._set_special_token("eot", 261)
3361
+ special_vocab.add_to_gguf(self.gguf_writer)
3362
+
3363
+ def set_gguf_parameters(self):
3364
+ block_count = self.hparams["num_hidden_layers"]
3365
+ head_size = self.hparams["head_size"]
3366
+ hidden_size = self.hparams["hidden_size"]
3367
+ layer_norm_eps = self.hparams["layer_norm_epsilon"]
3368
+ rescale_every_n_layers = self.hparams["rescale_every"]
3369
+ intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
3370
+ time_mix_extra_dim = 64 if hidden_size == 4096 else 32
3371
+ time_decay_extra_dim = 128 if hidden_size == 4096 else 64
3372
+
3373
+ # RWKV isn't context limited
3374
+ self.gguf_writer.add_context_length(1048576)
3375
+ self.gguf_writer.add_embedding_length(hidden_size)
3376
+ self.gguf_writer.add_block_count(block_count)
3377
+ self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
3378
+ self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
3379
+ self.gguf_writer.add_wkv_head_size(head_size)
3380
+ self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
3381
+ self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
3382
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
3383
+ self.gguf_writer.add_file_type(self.ftype)
3384
+
3385
+ # required by llama.cpp, unused
3386
+ self.gguf_writer.add_head_count(0)
3387
+
3388
+ lerp_weights: dict[int, dict[str, Tensor]] = {}
3389
+
3390
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3391
+ new_name = self.map_tensor_name(name)
3392
+
3393
+ if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
3394
+ new_name += ".weight"
3395
+
3396
+ if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
3397
+ data_torch = data_torch.transpose(0, 1)
3398
+
3399
+ if new_name.endswith("time_mix_w2.weight"):
3400
+ data_torch = data_torch.permute(0, 2, 1)
3401
+
3402
+ if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
3403
+ data_torch = data_torch.squeeze()
3404
+
3405
+ try:
3406
+ rescale_every_n_layers = self.hparams["rescale_every"]
3407
+ if rescale_every_n_layers > 0:
3408
+ if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
3409
+ data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
3410
+ except KeyError:
3411
+ pass
3412
+
3413
+ # concat time_mix_lerp weights to reduce some cpu overhead
3414
+ # also reduces the number of tensors in the model
3415
+ if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
3416
+ try:
3417
+ self.lerp_weights[bid][new_name] = data_torch
3418
+ except KeyError:
3419
+ self.lerp_weights[bid] = {new_name: data_torch}
3420
+ if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
3421
+ new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
3422
+ data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
3423
+ yield (new_name, data)
3424
+ return
3425
+
3426
+ yield (new_name, data_torch)
3427
+
3428
+
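To make the fused tensor shape above concrete, here is a standalone sketch of the same stacking with a made-up hidden size; the per-channel lerp weights are assumed to already be 1-D, as they are after the squeeze() earlier in modify_tensors.

    import torch

    hidden_size = 8                                          # illustrative only
    lerp = {c: torch.randn(hidden_size) for c in ["w", "k", "v", "r", "g"]}

    # five (1, hidden) tensors stacked on a new leading dim, then one more
    # singleton dim inserted, matching the fused tensor built above
    fused = torch.stack([lerp[c].unsqueeze(0) for c in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
    print(fused.shape)                                       # torch.Size([5, 1, 1, 8])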
3429
+ @Model.register("RWKV6Qwen2ForCausalLM")
3430
+ class RWKV6Qwen2Model(Rwkv6Model):
3431
+ model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
3432
+
3433
+ def set_vocab(self):
3434
+ try:
3435
+ self._set_vocab_sentencepiece()
3436
+ except FileNotFoundError:
3437
+ self._set_vocab_gpt2()
3438
+
3439
+ def set_gguf_parameters(self):
3440
+ block_count = self.hparams["num_hidden_layers"]
3441
+ num_attention_heads = self.hparams["num_attention_heads"]
3442
+ num_key_value_heads = self.hparams["num_key_value_heads"]
3443
+ hidden_size = self.hparams["hidden_size"]
3444
+ head_size = hidden_size // num_attention_heads
3445
+ rms_norm_eps = self.hparams["rms_norm_eps"]
3446
+ intermediate_size = self.hparams["intermediate_size"]
3447
+ time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
3448
+ time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
3449
+
3450
+ # RWKV isn't context limited
3451
+ self.gguf_writer.add_context_length(1048576)
3452
+ self.gguf_writer.add_embedding_length(hidden_size)
3453
+ self.gguf_writer.add_block_count(block_count)
3454
+ self.gguf_writer.add_wkv_head_size(head_size)
3455
+ self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
3456
+ self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
3457
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
3458
+ self.gguf_writer.add_file_type(self.ftype)
3459
+
3460
+ # special parameters for time_mixing in RWKV6QWEN2
3461
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
3462
+ self.gguf_writer.add_token_shift_count(1)
3463
+ # RWKV6QWEN2 uses grouped key/value heads like GQA
3464
+ self.gguf_writer.add_head_count_kv(num_key_value_heads)
3465
+
3466
+ # required by llama.cpp, unused
3467
+ self.gguf_writer.add_head_count(0)
3468
+
3469
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3470
+ for new_name, data in super().modify_tensors(data_torch, name, bid):
3471
+ if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
3472
+ data = data.view(5, -1, data.shape[-1])
3473
+ # rwkv6qwen2 stores the gates in rkvwg order instead of the original wkvrg
3474
+ # permute them here to avoid code changes
3475
+ data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
3476
+ if "w2" in new_name:
3477
+ data = data.view(5, -1, data.shape[-1])
3478
+ yield (new_name, data)
3479
+ continue
3480
+ yield (new_name, data)
3481
+
3482
+
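The gate reordering above is easiest to see with labelled rows instead of real weights; the five rows below stand in for the per-gate blocks of time_mix_w1/w2 and carry their source index as a value.

    import torch

    # rows in the rkvwg order used by RWKV6Qwen2 checkpoints (values are just the row indices)
    data = torch.tensor([[0.0], [1.0], [2.0], [3.0], [4.0]])
    data = data.view(5, -1, data.shape[-1])

    # pick rows 3, 1, 2, 0, 4 to obtain the wkvrg order expected downstream
    reordered = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
    print(reordered.flatten().tolist())                      # [3.0, 1.0, 2.0, 0.0, 4.0]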
2715
3483
  @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
2716
3484
  class MambaModel(Model):
2717
3485
  model_arch = gguf.MODEL_ARCH.MAMBA
@@ -2806,6 +3574,24 @@ class CommandR2Model(Model):
2806
3574
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
2807
3575
 
2808
3576
 
3577
+ @Model.register("Cohere2ForCausalLM")
3578
+ class Cohere2Model(Model):
3579
+ model_arch = gguf.MODEL_ARCH.COHERE2
3580
+
3581
+ def set_gguf_parameters(self):
3582
+ super().set_gguf_parameters()
3583
+
3584
+ self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
3585
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
3586
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
3587
+
3588
+ rotary_pct = self.hparams["rotary_pct"]
3589
+ hidden_size = self.hparams["hidden_size"]
3590
+ num_attention_heads = self.hparams["num_attention_heads"]
3591
+ self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
3592
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
3593
+
3594
+
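The rope dimension count written above is simply rotary_pct scaled by the per-head dimension; a quick numeric check with illustrative values (not taken from any particular Cohere2 config):

    rotary_pct = 0.5                                   # illustrative
    hidden_size = 4096                                 # illustrative
    num_attention_heads = 32                           # illustrative

    head_dim = hidden_size // num_attention_heads      # 128
    rope_dims = int(rotary_pct * head_dim)             # 64 rotary dimensions per head
    print(head_dim, rope_dims)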
2809
3595
  @Model.register("OlmoForCausalLM")
2810
3596
  @Model.register("OLMoForCausalLM")
2811
3597
  class OlmoModel(Model):
@@ -2834,6 +3620,71 @@ class OlmoModel(Model):
2834
3620
  return [(self.map_tensor_name(name), data_torch)]
2835
3621
 
2836
3622
 
3623
+ @Model.register("Olmo2ForCausalLM")
3624
+ class Olmo2Model(Model):
3625
+ model_arch = gguf.MODEL_ARCH.OLMO2
3626
+
3627
+
3628
+ @Model.register("OlmoeForCausalLM")
3629
+ class OlmoeModel(Model):
3630
+ model_arch = gguf.MODEL_ARCH.OLMOE
3631
+
3632
+ def set_gguf_parameters(self):
3633
+ super().set_gguf_parameters()
3634
+ self.gguf_writer.add_layer_norm_rms_eps(1e-5)
3635
+ if (n_experts := self.hparams.get("num_experts")) is not None:
3636
+ self.gguf_writer.add_expert_count(n_experts)
3637
+
3638
+ _experts: list[dict[str, Tensor]] | None = None
3639
+
3640
+ # Copied from: Qwen2MoeModel
3641
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3642
+ # process the experts separately
3643
+ if name.find("experts") != -1:
3644
+ n_experts = self.hparams["num_experts"]
3645
+ assert bid is not None
3646
+
3647
+ if self._experts is None:
3648
+ self._experts = [{} for _ in range(self.block_count)]
3649
+
3650
+ self._experts[bid][name] = data_torch
3651
+
3652
+ if len(self._experts[bid]) >= n_experts * 3:
3653
+ tensors: list[tuple[str, Tensor]] = []
3654
+
3655
+ # merge the experts into a single 3d tensor
3656
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
3657
+ datas: list[Tensor] = []
3658
+
3659
+ for xid in range(n_experts):
3660
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
3661
+ datas.append(self._experts[bid][ename])
3662
+ del self._experts[bid][ename]
3663
+
3664
+ data_torch = torch.stack(datas, dim=0)
3665
+
3666
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
3667
+
3668
+ new_name = self.map_tensor_name(merged_name)
3669
+
3670
+ tensors.append((new_name, data_torch))
3671
+ return tensors
3672
+ else:
3673
+ return []
3674
+
3675
+ return [(self.map_tensor_name(name), data_torch)]
3676
+
3677
+ # Copied from: Qwen2MoeModel
3678
+ def prepare_tensors(self):
3679
+ super().prepare_tensors()
3680
+
3681
+ if self._experts is not None:
3682
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
3683
+ experts = [k for d in self._experts for k in d.keys()]
3684
+ if len(experts) > 0:
3685
+ raise ValueError(f"Unprocessed experts: {experts}")
3686
+
3687
+
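The expert-merging pattern above (copied from the Qwen2Moe converter and reused by the Deepseek classes below) buffers each expert's 2-D projection until all of them have arrived for a block, then stacks them into a single 3-D tensor. A minimal sketch with made-up sizes:

    import torch

    n_experts, ffn_dim, hidden = 4, 16, 8               # illustrative sizes only
    bid = 0

    # per-expert down_proj weights as they would arrive one tensor at a time
    experts = {f"model.layers.{bid}.mlp.experts.{xid}.down_proj.weight": torch.randn(hidden, ffn_dim)
               for xid in range(n_experts)}

    # merge into one (n_experts, hidden, ffn_dim) tensor, as modify_tensors does above
    merged = torch.stack([experts[f"model.layers.{bid}.mlp.experts.{xid}.down_proj.weight"]
                          for xid in range(n_experts)], dim=0)
    print(merged.shape)                                  # torch.Size([4, 8, 16])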
2837
3688
  @Model.register("JinaBertModel", "JinaBertForMaskedLM")
2838
3689
  class JinaBertV2Model(BertModel):
2839
3690
  model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
@@ -2872,6 +3723,14 @@ class JinaBertV2Model(BertModel):
2872
3723
  self.gguf_writer.add_add_bos_token(True)
2873
3724
  self.gguf_writer.add_add_eos_token(True)
2874
3725
 
3726
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3727
+ # if name starts with "bert.", remove the prefix
3728
+ # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
3729
+ if name.startswith("bert."):
3730
+ name = name[5:]
3731
+
3732
+ return super().modify_tensors(data_torch, name, bid)
3733
+
2875
3734
 
2876
3735
  @Model.register("OpenELMForCausalLM")
2877
3736
  class OpenELMModel(Model):
@@ -3099,7 +3958,99 @@ class ArcticModel(Model):
3099
3958
  raise ValueError(f"Unprocessed experts: {experts}")
3100
3959
 
3101
3960
 
3961
+ @Model.register("DeepseekForCausalLM")
3962
+ class DeepseekModel(Model):
3963
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK
3964
+
3965
+ def set_vocab(self):
3966
+ try:
3967
+ self._set_vocab_sentencepiece()
3968
+ except FileNotFoundError:
3969
+ self._set_vocab_gpt2()
3970
+
3971
+ def set_gguf_parameters(self):
3972
+ super().set_gguf_parameters()
3973
+ hparams = self.hparams
3974
+ if "head_dim" in hparams:
3975
+ rope_dim = hparams["head_dim"]
3976
+ else:
3977
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
3978
+
3979
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
3980
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
3981
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
3982
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3983
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
3984
+ self.gguf_writer.add_expert_weights_scale(1.0)
3985
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
3986
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
3987
+
3988
+ _experts: list[dict[str, Tensor]] | None = None
3989
+
3990
+ @staticmethod
3991
+ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
3992
+ if n_head_kv is not None and n_head != n_head_kv:
3993
+ n_head = n_head_kv
3994
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
3995
+ .swapaxes(1, 2)
3996
+ .reshape(weights.shape))
3997
+
3998
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3999
+ n_head = self.hparams["num_attention_heads"]
4000
+ n_kv_head = self.hparams.get("num_key_value_heads")
4001
+
4002
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
4003
+ data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
4004
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
4005
+ data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
4006
+
4007
+ # process the experts separately
4008
+ if name.find("mlp.experts") != -1:
4009
+ n_experts = self.hparams["n_routed_experts"]
4010
+ assert bid is not None
4011
+
4012
+ if self._experts is None:
4013
+ self._experts = [{} for _ in range(self.block_count)]
4014
+
4015
+ self._experts[bid][name] = data_torch
4016
+
4017
+ if len(self._experts[bid]) >= n_experts * 3:
4018
+ tensors: list[tuple[str, Tensor]] = []
4019
+
4020
+ # merge the experts into a single 3d tensor
4021
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
4022
+ datas: list[Tensor] = []
4023
+
4024
+ for xid in range(n_experts):
4025
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
4026
+ datas.append(self._experts[bid][ename])
4027
+ del self._experts[bid][ename]
4028
+
4029
+ data_torch = torch.stack(datas, dim=0)
4030
+
4031
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
4032
+
4033
+ new_name = self.map_tensor_name(merged_name)
4034
+
4035
+ tensors.append((new_name, data_torch))
4036
+ return tensors
4037
+ else:
4038
+ return []
4039
+
4040
+ return [(self.map_tensor_name(name), data_torch)]
4041
+
4042
+ def prepare_tensors(self):
4043
+ super().prepare_tensors()
4044
+
4045
+ if self._experts is not None:
4046
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
4047
+ experts = [k for d in self._experts for k in d.keys()]
4048
+ if len(experts) > 0:
4049
+ raise ValueError(f"Unprocessed experts: {experts}")
4050
+
4051
+
3102
4052
  @Model.register("DeepseekV2ForCausalLM")
4053
+ @Model.register("DeepseekV3ForCausalLM")
3103
4054
  class DeepseekV2Model(Model):
3104
4055
  model_arch = gguf.MODEL_ARCH.DEEPSEEK2
3105
4056
 
@@ -3121,6 +4072,15 @@ class DeepseekV2Model(Model):
3121
4072
  self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
3122
4073
  self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
3123
4074
  self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
4075
+ self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
4076
+
4077
+ if hparams["scoring_func"] == "sigmoid":
4078
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
4079
+ elif hparams["scoring_func"] == "softmax":
4080
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
4081
+ else:
4082
+ raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
4083
+
3124
4084
  self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
3125
4085
 
3126
4086
  if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -3133,6 +4093,16 @@ class DeepseekV2Model(Model):
3133
4093
  _experts: list[dict[str, Tensor]] | None = None
3134
4094
 
3135
4095
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4096
+ # rename e_score_correction_bias tensors
4097
+ if name.endswith("e_score_correction_bias"):
4098
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
4099
+
4100
+ # skip Multi-Token Prediction (MTP) layers
4101
+ block_count = self.hparams["num_hidden_layers"]
4102
+ match = re.match(r"model.layers.(\d+)", name)
4103
+ if match and int(match.group(1)) >= block_count:
4104
+ return []
4105
+
3136
4106
  # process the experts separately
3137
4107
  if name.find("mlp.experts") != -1:
3138
4108
  n_experts = self.hparams["n_routed_experts"]
@@ -3474,10 +4444,7 @@ class JaisModel(Model):
3474
4444
 
3475
4445
  # Embeddings scale
3476
4446
  self.embeddings_scale = 1.0
3477
- # note: For some JAIS flavors, output is tied to (same as) wte in original model
3478
- self.output_is_wte = False
3479
4447
  if 'mup_embeddings_scale' in self.hparams:
3480
- self.output_is_wte = True # Hack (?)
3481
4448
  self.embeddings_scale = self.hparams['mup_embeddings_scale']
3482
4449
  elif 'embeddings_scale' in self.hparams:
3483
4450
  self.embeddings_scale = self.hparams['embeddings_scale']
@@ -3534,10 +4501,7 @@ class JaisModel(Model):
3534
4501
 
3535
4502
  if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
3536
4503
  tensors.append((new_name, data_torch * self.embeddings_scale))
3537
- if self.output_is_wte:
3538
- tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
3539
4504
  elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
3540
- assert not self.output_is_wte
3541
4505
  tensors.append((new_name, data_torch * self.width_scale))
3542
4506
  else:
3543
4507
  tensors.append((new_name, data_torch))
@@ -3549,7 +4513,7 @@ class JaisModel(Model):
3549
4513
  self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
3550
4514
 
3551
4515
 
3552
- @Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
4516
+ @Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
3553
4517
  class ChatGLMModel(Model):
3554
4518
  model_arch = gguf.MODEL_ARCH.CHATGLM
3555
4519
 
@@ -3655,47 +4619,15 @@ class ChatGLMModel(Model):
3655
4619
 
3656
4620
  from transformers import AutoTokenizer
3657
4621
  tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
3658
- vocab_size = hparams["padded_vocab_size"]
4622
+ vocab_size = hparams.get("padded_vocab_size", hparams["vocab_size"])
3659
4623
  assert max(tokenizer.get_vocab().values()) < vocab_size
3660
4624
 
3661
- tokpre = self.get_vocab_base_pre(tokenizer)
3662
-
3663
- merges = []
3664
- vocab = {}
3665
- mergeable_ranks = tokenizer.mergeable_ranks
3666
- for token, rank in mergeable_ranks.items():
3667
- vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
3668
- if len(token) == 1:
3669
- continue
3670
- merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
3671
- assert len(merged) >= 2 and len(merged) <= 7
3672
- merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
3673
-
3674
- # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
3675
- added_vocab = tokenizer.get_added_vocab()
3676
- reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
3677
-
3678
- for i in range(vocab_size):
3679
- if i not in reverse_vocab:
3680
- tokens.append(f"[PAD{i}]")
3681
- toktypes.append(gguf.TokenType.UNUSED)
3682
- elif reverse_vocab[i] in added_vocab:
3683
- tokens.append(reverse_vocab[i])
3684
- if tokenizer.added_tokens_decoder[i].special:
3685
- toktypes.append(gguf.TokenType.CONTROL)
3686
- else:
3687
- toktypes.append(gguf.TokenType.USER_DEFINED)
3688
- else:
3689
- tokens.append(reverse_vocab[i])
3690
- toktypes.append(gguf.TokenType.NORMAL)
3691
-
4625
+ tokens, toktypes, tokpre = self.get_vocab_base()
3692
4626
  self.gguf_writer.add_tokenizer_model("gpt2")
3693
4627
  self.gguf_writer.add_tokenizer_pre(tokpre)
3694
4628
  self.gguf_writer.add_token_list(tokens)
3695
4629
  self.gguf_writer.add_token_types(toktypes)
3696
-
3697
- special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
3698
- special_vocab.merges = merges
4630
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
3699
4631
  # only add special tokens when they were not already loaded from config.json
3700
4632
  special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
3701
4633
  special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
@@ -3706,16 +4638,20 @@ class ChatGLMModel(Model):
3706
4638
  def set_gguf_parameters(self):
3707
4639
  n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
3708
4640
  n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
3709
- n_head_kv = self.hparams.get("multi_query_group_num", n_head)
4641
+ n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
3710
4642
  self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
3711
4643
  self.gguf_writer.add_embedding_length(n_embed)
3712
- self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
3713
- self.gguf_writer.add_block_count(self.hparams["num_layers"])
4644
+ self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
4645
+ self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
3714
4646
  self.gguf_writer.add_head_count(n_head)
3715
4647
  self.gguf_writer.add_head_count_kv(n_head_kv)
3716
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
4648
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
3717
4649
  self.gguf_writer.add_file_type(self.ftype)
3718
- self.gguf_writer.add_rope_dimension_count(64)
4650
+ if "attention_dim" in self.hparams:
4651
+ rope_dim = self.hparams["attention_dim"]
4652
+ else:
4653
+ rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
4654
+ self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
3719
4655
  self.gguf_writer.add_add_bos_token(False)
3720
4656
  rope_freq = 10000
3721
4657
  if "rope_ratio" in self.hparams:
@@ -3725,7 +4661,7 @@ class ChatGLMModel(Model):
3725
4661
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3726
4662
  del bid # unused
3727
4663
 
3728
- if name.endswith(".rotary_pos_emb.inv_freq"):
4664
+ if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
3729
4665
  return []
3730
4666
 
3731
4667
  name = name.removeprefix("transformer.")
@@ -3812,11 +4748,11 @@ class ExaoneModel(Model):
3812
4748
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
3813
4749
  self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
3814
4750
 
3815
- def prepare_tensors(self):
4751
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
3816
4752
  if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
3817
4753
  if rope_scaling.get("rope_type", '').lower() == "llama3":
3818
4754
  base = self.hparams.get("rope_theta", 10000.0)
3819
- dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
4755
+ dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
3820
4756
  freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
3821
4757
 
3822
4758
  factor = rope_scaling.get("factor", 8.0)
@@ -3839,9 +4775,107 @@ class ExaoneModel(Model):
3839
4775
  smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
3840
4776
  rope_factors.append(1 / ((1 - smooth) / factor + smooth))
3841
4777
 
3842
- self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
4778
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
4779
+
4780
+
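The rope_factors tensor yielded above holds one scaling value per rotary frequency: short-wavelength components keep a factor of 1, long-wavelength components get the full context-extension factor, and the band in between is interpolated through smooth. A self-contained sketch of that computation follows; the constants (factor 8, low/high frequency factors 1 and 4, original context length 8192) are common llama3-style defaults chosen purely for illustration.

    import math

    base = 10000.0                                   # rope_theta
    dim = 128                                        # per-head dimension, illustrative
    factor = 8.0                                     # rope_scaling["factor"]
    low_freq_factor, high_freq_factor = 1.0, 4.0     # assumed defaults
    old_context_len = 8192                           # assumed original training context

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    rope_factors = []
    for i in range(0, dim, 2):
        freq = 1.0 / (base ** (i / dim))
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            rope_factors.append(1.0)                 # short wavelengths: unscaled
        elif wavelen > low_freq_wavelen:
            rope_factors.append(factor)              # long wavelengths: fully stretched
        else:
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            rope_factors.append(1 / ((1 - smooth) / factor + smooth))

    print(len(rope_factors), rope_factors[0], rope_factors[-1])   # 64 1.0 8.0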
4781
+ @Model.register("GraniteForCausalLM")
4782
+ class GraniteModel(LlamaModel):
4783
+ """Conversion for IBM's GraniteForCausalLM"""
4784
+ model_arch = gguf.MODEL_ARCH.GRANITE
4785
+
4786
+ def set_gguf_parameters(self):
4787
+ """Granite uses standard llama parameters with the following differences:
4788
+
4789
+ - No head_dim support
4790
+ - New multiplier params:
4791
+ - attention_scale
4792
+ - embedding_scale
4793
+ - residual_scale
4794
+ - logits_scaling
4795
+ """
4796
+ if head_dim := self.hparams.pop("head_dim", None):
4797
+ logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
4798
+ super().set_gguf_parameters()
4799
+ # NOTE: Convert _multiplier params to _scale params for naming
4800
+ # consistency
4801
+ if attention_scale := self.hparams.get("attention_multiplier"):
4802
+ self.gguf_writer.add_attention_scale(attention_scale)
4803
+ logger.info("gguf: (granite) attention_scale = %s", attention_scale)
4804
+ if embedding_scale := self.hparams.get("embedding_multiplier"):
4805
+ self.gguf_writer.add_embedding_scale(embedding_scale)
4806
+ logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
4807
+ if residual_scale := self.hparams.get("residual_multiplier"):
4808
+ self.gguf_writer.add_residual_scale(residual_scale)
4809
+ logger.info("gguf: (granite) residual_scale = %s", residual_scale)
4810
+ if logits_scale := self.hparams.get("logits_scaling"):
4811
+ self.gguf_writer.add_logit_scale(logits_scale)
4812
+ logger.info("gguf: (granite) logits_scale = %s", logits_scale)
4813
+
4814
+
4815
+ @Model.register("GraniteMoeForCausalLM")
4816
+ class GraniteMoeModel(GraniteModel):
4817
+ """Conversion for IBM's GraniteMoeForCausalLM"""
4818
+ model_arch = gguf.MODEL_ARCH.GRANITE_MOE
4819
+
4820
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4821
+ """In modeling_granitemoe, the JetMoe implementation of parallel experts
4822
+ is used. This essentially merges w1 and w3 into a single tensor with 2x
4823
+ the hidden size that is then split during forward. To keep compatibility
4824
+ with existing mixtral support, we pull them apart here.
4825
+ """
4826
+
4827
+ if name.endswith("block_sparse_moe.input_linear.weight"):
4828
+ ffn_dim = self.hparams["intermediate_size"]
4829
+ assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
4830
+ gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
4831
+ return [
4832
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
4833
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
4834
+ ]
4835
+
4836
+ return super().modify_tensors(data_torch, name, bid)
3843
4837
 
3844
- super().prepare_tensors()
4838
+
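A small standalone sketch of the gate/up split described in the GraniteMoe docstring above, with made-up sizes: the merged input_linear tensor carries 2 * intermediate_size rows per expert, and slicing along that axis recovers the separate gate and up projections expected by the existing MoE path.

    import torch

    n_experts, ffn_dim, hidden = 4, 16, 8                          # illustrative sizes only
    merged = torch.randn(n_experts, 2 * ffn_dim, hidden)           # stand-in for block_sparse_moe.input_linear.weight

    assert merged.shape[-2] == 2 * ffn_dim
    gate, up = merged[..., :ffn_dim, :], merged[..., ffn_dim:, :]  # same slicing as modify_tensors above
    print(gate.shape, up.shape)                                    # torch.Size([4, 16, 8]) torch.Size([4, 16, 8])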
4839
+ @Model.register("ChameleonForConditionalGeneration")
4840
+ @Model.register("ChameleonForCausalLM") # obsolete
4841
+ class ChameleonModel(Model):
4842
+ model_arch = gguf.MODEL_ARCH.CHAMELEON
4843
+
4844
+ def set_gguf_parameters(self):
4845
+ super().set_gguf_parameters()
4846
+ self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
4847
+
4848
+ def set_vocab(self):
4849
+ self._set_vocab_gpt2()
4850
+
4851
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4852
+ # ignore image tokenizer for now
4853
+ # TODO: remove this once image support is implemented for Chameleon
4854
+ if name.startswith("model.vqmodel"):
4855
+ return []
4856
+
4857
+ n_head = self.hparams["num_attention_heads"]
4858
+ n_kv_head = self.hparams.get("num_key_value_heads")
4859
+ hidden_dim = self.hparams.get("hidden_size")
4860
+
4861
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
4862
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
4863
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
4864
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
4865
+ if name.endswith(("q_norm.weight", "q_norm.bias")):
4866
+ data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
4867
+ if name.endswith(("k_norm.weight", "k_norm.bias")):
4868
+ data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
4869
+
4870
+ return [(self.map_tensor_name(name), data_torch)]
4871
+
4872
+ # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
4873
+ @staticmethod
4874
+ def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
4875
+ head_dim = hidden_dim // n_heads
4876
+ data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
4877
+ data_torch = data_torch.repeat_interleave(n_heads, 0)
4878
+ return data_torch
3845
4879
 
3846
4880
 
3847
4881
  ###### CONVERSION LOGIC ######
@@ -3924,8 +4958,8 @@ def parse_args() -> argparse.Namespace:
3924
4958
  help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
3925
4959
  )
3926
4960
  parser.add_argument(
3927
- "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
3928
- help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
4961
+ "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
4962
+ help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
3929
4963
  )
3930
4964
  parser.add_argument(
3931
4965
  "--bigendian", action="store_true",
@@ -3934,6 +4968,7 @@ def parse_args() -> argparse.Namespace:
3934
4968
  parser.add_argument(
3935
4969
  "model", type=Path,
3936
4970
  help="directory containing model file",
4971
+ nargs="?",
3937
4972
  )
3938
4973
  parser.add_argument(
3939
4974
  "--use-temp-file", action="store_true",
@@ -3971,8 +5006,15 @@ def parse_args() -> argparse.Namespace:
3971
5006
  "--metadata", type=Path,
3972
5007
  help="Specify the path for an authorship metadata override file"
3973
5008
  )
5009
+ parser.add_argument(
5010
+ "--print-supported-models", action="store_true",
5011
+ help="Print the supported models"
5012
+ )
3974
5013
 
3975
- return parser.parse_args()
5014
+ args = parser.parse_args()
5015
+ if not args.print_supported_models and args.model is None:
5016
+ parser.error("the following arguments are required: model")
5017
+ return args
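With the model argument now optional (nargs="?"), the missing-argument check has to be done by hand, since --print-supported-models is valid without a model path. A minimal argparse sketch of the same validation pattern, outside the converter itself:

    import argparse
    from pathlib import Path

    parser = argparse.ArgumentParser()
    parser.add_argument("model", type=Path, nargs="?", help="directory containing model file")
    parser.add_argument("--print-supported-models", action="store_true")

    args = parser.parse_args(["--print-supported-models"])        # no model path given
    if not args.print_supported_models and args.model is None:
        parser.error("the following arguments are required: model")
    print(args)                                                   # accepted, because the listing flag was passed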
3976
5018
 
3977
5019
 
3978
5020
  def split_str_to_n_bytes(split_str: str) -> int:
@@ -3996,6 +5038,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
3996
5038
  def main() -> None:
3997
5039
  args = parse_args()
3998
5040
 
5041
+ if args.print_supported_models:
5042
+ logger.error("Supported models:")
5043
+ Model.print_registered_models()
5044
+ sys.exit(0)
5045
+
3999
5046
  if args.verbose:
4000
5047
  logging.basicConfig(level=logging.DEBUG)
4001
5048
  else:
@@ -4012,6 +5059,8 @@ def main() -> None:
4012
5059
  "f16": gguf.LlamaFileType.MOSTLY_F16,
4013
5060
  "bf16": gguf.LlamaFileType.MOSTLY_BF16,
4014
5061
  "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
5062
+ "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
5063
+ "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
4015
5064
  "auto": gguf.LlamaFileType.GUESSED,
4016
5065
  }
4017
5066