bigdl-core-cpp 2.1.0b20240820.post1__py3-none-win_amd64.whl → 2.2.0b20250217.post0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +908 -140
  2. bigdl/cpp/convert_hf_to_gguf_update.py +376 -0
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +450 -0
  4. bigdl/cpp/convert_lora_to_gguf.py +433 -0
  5. bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/constants.py +414 -89
  7. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  8. bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
  9. bigdl/cpp/gguf-py/gguf/gguf_writer.py +77 -14
  10. bigdl/cpp/gguf-py/gguf/lazy.py +3 -1
  11. bigdl/cpp/gguf-py/gguf/metadata.py +195 -76
  12. bigdl/cpp/gguf-py/gguf/quants.py +1210 -64
  13. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +156 -34
  14. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  15. bigdl/cpp/gguf-py/gguf/vocab.py +325 -3
  16. bigdl/cpp/libs/common.lib +0 -0
  17. bigdl/cpp/libs/ggml-base.dll +0 -0
  18. bigdl/cpp/libs/ggml-cpu.dll +0 -0
  19. bigdl/cpp/libs/ggml-sycl.dll +0 -0
  20. bigdl/cpp/libs/ggml.dll +0 -0
  21. bigdl/cpp/libs/libc++.dll +0 -0
  22. bigdl/cpp/libs/llama-batched.exe +0 -0
  23. bigdl/cpp/libs/llama-bench.exe +0 -0
  24. bigdl/cpp/libs/llama-cli.exe +0 -0
  25. bigdl/cpp/libs/llama-embedding.exe +0 -0
  26. bigdl/cpp/libs/llama-gguf.exe +0 -0
  27. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  28. bigdl/cpp/libs/llama-lookup.exe +0 -0
  29. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  30. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  31. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  32. bigdl/cpp/libs/llama-quantize.exe +0 -0
  33. bigdl/cpp/libs/llama-server.exe +0 -0
  34. bigdl/cpp/libs/llama-simple.exe +0 -0
  35. bigdl/cpp/libs/llama-speculative.exe +0 -0
  36. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  37. bigdl/cpp/libs/llama.dll +0 -0
  38. bigdl/cpp/libs/llava_shared.dll +0 -0
  39. bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
  40. bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
  41. bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
  42. bigdl/cpp/libs/ollama-lib.exe +0 -0
  43. bigdl/cpp/libs/ollama.exe +0 -0
  44. bigdl/cpp/libs/ollama_ggml.dll +0 -0
  45. bigdl/cpp/libs/ollama_llama.dll +0 -0
  46. bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
  47. {bigdl_core_cpp-2.1.0b20240820.post1.data → bigdl_core_cpp-2.2.0b20250217.post0.data}/scripts/init-llama-cpp.bat +7 -2
  48. bigdl_core_cpp-2.2.0b20250217.post0.data/scripts/init-ollama.bat +16 -0
  49. {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/METADATA +9 -5
  50. bigdl_core_cpp-2.2.0b20250217.post0.dist-info/RECORD +56 -0
  51. {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/WHEEL +1 -1
  52. bigdl/cpp/convert.py +0 -1714
  53. bigdl/cpp/libs/baby-llama.exe +0 -0
  54. bigdl/cpp/libs/batched-bench.exe +0 -0
  55. bigdl/cpp/libs/batched.exe +0 -0
  56. bigdl/cpp/libs/beam-search.exe +0 -0
  57. bigdl/cpp/libs/benchmark.exe +0 -0
  58. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  59. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  60. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  61. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  62. bigdl/cpp/libs/embedding.exe +0 -0
  63. bigdl/cpp/libs/export-lora.exe +0 -0
  64. bigdl/cpp/libs/finetune.exe +0 -0
  65. bigdl/cpp/libs/ggml_shared.dll +0 -0
  66. bigdl/cpp/libs/gguf.exe +0 -0
  67. bigdl/cpp/libs/gritlm.exe +0 -0
  68. bigdl/cpp/libs/imatrix.exe +0 -0
  69. bigdl/cpp/libs/infill.exe +0 -0
  70. bigdl/cpp/libs/llava-cli.exe +0 -0
  71. bigdl/cpp/libs/lookahead.exe +0 -0
  72. bigdl/cpp/libs/lookup.exe +0 -0
  73. bigdl/cpp/libs/ls-sycl-device.exe +0 -0
  74. bigdl/cpp/libs/main.exe +0 -0
  75. bigdl/cpp/libs/parallel.exe +0 -0
  76. bigdl/cpp/libs/passkey.exe +0 -0
  77. bigdl/cpp/libs/perplexity.exe +0 -0
  78. bigdl/cpp/libs/q8dot.exe +0 -0
  79. bigdl/cpp/libs/quantize-stats.exe +0 -0
  80. bigdl/cpp/libs/quantize.exe +0 -0
  81. bigdl/cpp/libs/save-load-state.exe +0 -0
  82. bigdl/cpp/libs/server.exe +0 -0
  83. bigdl/cpp/libs/simple.exe +0 -0
  84. bigdl/cpp/libs/speculative.exe +0 -0
  85. bigdl/cpp/libs/tokenize.exe +0 -0
  86. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  87. bigdl/cpp/libs/vdot.exe +0 -0
  88. bigdl_core_cpp-2.1.0b20240820.post1.data/scripts/init-ollama.bat +0 -13
  89. bigdl_core_cpp-2.1.0b20240820.post1.dist-info/RECORD +0 -63
  90. {bigdl_core_cpp-2.1.0b20240820.post1.data → bigdl_core_cpp-2.2.0b20250217.post0.data}/scripts/init-llama-cpp.ps1 +0 -0
  91. {bigdl_core_cpp-2.1.0b20240820.post1.dist-info → bigdl_core_cpp-2.2.0b20250217.post0.dist-info}/top_level.txt +0 -0
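Before the diff itself, two things in the list above are worth noting: the converter scripts switch to importable module names (convert-hf-to-gguf.py becomes convert_hf_to_gguf.py, joined by convert_hf_to_gguf_update.py, convert_llama_ggml_to_gguf.py and convert_lora_to_gguf.py), and the old monolithic convert.py is removed. Inside convert_hf_to_gguf.py, the most consequential change is that prepare_tensors() now routes every tensor through gguf.quants.quantize() with an F16 fallback instead of hand-rolled numpy casts (see the prepare_tensors hunk below). The snippet that follows is a minimal sketch of that pattern only; it assumes the gguf-py package bundled under bigdl/cpp/gguf-py is importable as `gguf` and behaves like upstream llama.cpp's gguf module, which has not been verified against this wheel.

import numpy as np
import gguf  # assumption: the bundled bigdl/cpp/gguf-py is importable as `gguf`

# stand-in for a converted weight; the real converter gets this from modify_tensors()
data = np.ones((64, 64), dtype=np.float32)
data_qtype = gguf.GGMLQuantizationType.Q8_0  # e.g. what --outtype q8_0 maps to

try:
    data = gguf.quants.quantize(data, data_qtype)
except gguf.QuantError:
    # same fallback the new prepare_tensors() applies when a tensor cannot take the requested type
    data_qtype = gguf.GGMLQuantizationType.F16
    data = gguf.quants.quantize(data, data_qtype)

print(data_qtype.name, data.dtype, data.shape)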
@@ -3,6 +3,7 @@

  from __future__ import annotations

+ import ast
  import logging
  import argparse
  import contextlib
@@ -14,6 +15,7 @@ from enum import IntEnum
  from pathlib import Path
  from hashlib import sha256
  from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+ from itertools import chain

  import math
  import numpy as np
@@ -70,7 +72,8 @@ class Model:
  def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
  use_temp_file: bool = False, eager: bool = False,
  metadata_override: Path | None = None, model_name: str | None = None,
- split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+ split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+ small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
  if type(self) is Model:
  raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -85,7 +88,7 @@ class Model:
  self.is_safetensors = len(self.part_names) > 0
  if not self.is_safetensors:
  self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
- self.hparams = Model.load_hparams(self.dir_model)
+ self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
  self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
  self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
  self.tensor_names = None
@@ -129,12 +132,14 @@ class Model:
  def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
  tensor_names_from_parts: set[str] = set()

- if len(self.part_names) > 1:
+ index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+ index_name += ".index.json"
+ index_file = self.dir_model / index_name
+
+ if index_file.is_file():
  self.tensor_names = set()
- index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
- index_name += ".index.json"
  logger.info(f"gguf: loading model weight map from '{index_name}'")
- with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
+ with open(index_file, "r", encoding="utf-8") as f:
  index: dict[str, Any] = json.load(f)
  weight_map = index.get("weight_map")
  if weight_map is None or not isinstance(weight_map, dict):
@@ -142,6 +147,7 @@ class Model:
  self.tensor_names.update(weight_map.keys())
  else:
  self.tensor_names = tensor_names_from_parts
+ weight_map = {}

  for part_name in self.part_names:
  logger.info(f"gguf: loading model part '{part_name}'")
@@ -168,9 +174,17 @@ class Model:
  data = LazyTorchTensor.from_eager(data)
  yield name, data

- # only verify tensor name presence; it doesn't matter if they are not in the right files
- if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
- raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
+ # verify tensor name presence and identify potentially missing files
+ if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+ missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+ extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+ missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+ if len(extra) == 0 and len(missing_files) > 0:
+ raise ValueError(f"Missing or incomplete model files: {missing_files}")
+ else:
+ raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+ f"Missing tensors: {missing}\n"
+ f"Extra tensors: {extra}")

  def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
  if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -251,20 +265,19 @@ class Model:

  return [(self.map_tensor_name(name), data_torch)]

- def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
  del name, new_name, bid, n_dims # unused

  return False

- def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
- del name, new_name, bid, n_dims # unused
-
- return False
+ # some models need extra generated tensors (like rope_freqs)
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ return ()

  def prepare_tensors(self):
  max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

- for name, data_torch in self.get_tensors():
+ for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
  # we don't need these
  if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
  continue
@@ -282,57 +295,78 @@ class Model:
  bid = int(part)
  break

- for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
- data: np.ndarray # type hint
- n_dims = len(data.shape)
- data_dtype = data.dtype
- data_qtype: gguf.GGMLQuantizationType | None = None
+ for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+ data = data_torch.squeeze().numpy()

- # when both are True, f32 should win
- extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
- extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+ # if data ends up empty, it means data_torch was a scalar tensor -> restore
+ if len(data.shape) == 0:
+ data = data_torch.numpy()
+
+ n_dims = len(data.shape)
+ data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)

  # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
- # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
- extra_f32 = any(cond for cond in (
- extra_f32,
- n_dims == 1,
- new_name.endswith("_norm.weight"),
- ))
+ if n_dims <= 1 or new_name.endswith("_norm.weight"):
+ data_qtype = gguf.GGMLQuantizationType.F32

+ # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
  # Some tensor types are always in float32
- extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
- gguf.MODEL_TENSOR.FFN_GATE_INP,
- gguf.MODEL_TENSOR.POS_EMBD,
- gguf.MODEL_TENSOR.TOKEN_TYPES,
- ))
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- extra_f16 = any(cond for cond in (
- extra_f16,
- (name.endswith(".weight") and n_dims >= 2),
- ))
-
- if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
- if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
- data = gguf.quantize_bf16(data)
- assert data.dtype == np.int16
- data_qtype = gguf.GGMLQuantizationType.BF16
+ if data_qtype is False and (
+ any(
+ self.match_model_tensor_name(new_name, key, bid)
+ for key in (
+ gguf.MODEL_TENSOR.FFN_GATE_INP,
+ gguf.MODEL_TENSOR.POS_EMBD,
+ gguf.MODEL_TENSOR.TOKEN_TYPES,
+ gguf.MODEL_TENSOR.SSM_CONV1D,
+ gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+ gguf.MODEL_TENSOR.TIME_MIX_W1,
+ gguf.MODEL_TENSOR.TIME_MIX_W2,
+ gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+ gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+ )
+ )
+ or not new_name.endswith(".weight")
+ ):
+ data_qtype = gguf.GGMLQuantizationType.F32

- elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
- data = gguf.quantize_q8_0(data)
- assert data.dtype == np.uint8
- data_qtype = gguf.GGMLQuantizationType.Q8_0
+ if data_qtype is False and any(
+ self.match_model_tensor_name(new_name, key, bid)
+ for key in (
+ gguf.MODEL_TENSOR.TOKEN_EMBD,
+ gguf.MODEL_TENSOR.OUTPUT,
+ )
+ ):
+ if self.ftype in (
+ gguf.LlamaFileType.MOSTLY_TQ1_0,
+ gguf.LlamaFileType.MOSTLY_TQ2_0,
+ ):
+ # TODO: use Q4_K and Q6_K
+ data_qtype = gguf.GGMLQuantizationType.F16

- else: # default to float16 for quantized tensors
- if data_dtype != np.float16:
- data = data.astype(np.float16)
+ # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+ if isinstance(data_qtype, bool):
+ if self.ftype == gguf.LlamaFileType.ALL_F32:
+ data_qtype = gguf.GGMLQuantizationType.F32
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
  data_qtype = gguf.GGMLQuantizationType.F16
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+ data_qtype = gguf.GGMLQuantizationType.BF16
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+ data_qtype = gguf.GGMLQuantizationType.Q8_0
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+ data_qtype = gguf.GGMLQuantizationType.TQ1_0
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+ data_qtype = gguf.GGMLQuantizationType.TQ2_0
+ else:
+ raise ValueError(f"Unknown file type: {self.ftype.name}")

- if data_qtype is None: # by default, convert to float32
- if data_dtype != np.float32:
- data = data.astype(np.float32)
- data_qtype = gguf.GGMLQuantizationType.F32
+ try:
+ data = gguf.quants.quantize(data, data_qtype)
+ except gguf.QuantError as e:
+ logger.warning("%s, %s", e, "falling back to F16")
+ data_qtype = gguf.GGMLQuantizationType.F16
+ data = gguf.quants.quantize(data, data_qtype)

  shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape

@@ -540,6 +574,9 @@ class Model:
  if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
  # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
  res = "bert-bge"
+ if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+ # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+ res = "bert-bge-large"
  if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
  # ref: https://huggingface.co/mosaicml/mpt-7b
  res = "mpt"
@@ -567,6 +604,9 @@ class Model:
  if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
  # ref: https://huggingface.co/databricks/dbrx-base
  res = "dbrx"
+ if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+ # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+ res = "jina-v1-en"
  if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
  # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
  res = "jina-v2-en"
@@ -603,6 +643,27 @@ class Model:
  if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
  # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
  res = "smollm"
+ if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+ # ref: https://huggingface.co/bigscience/bloom
+ res = "bloom"
+ if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+ # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+ res = "gpt3-finnish"
+ if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+ # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+ res = "exaone"
+ if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+ # ref: https://huggingface.co/microsoft/phi-2
+ res = "phi-2"
+ if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+ # ref: https://huggingface.co/facebook/chameleon-7b
+ res = "chameleon"
+ if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+ # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+ res = "minerva-7b"
+ if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+ # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+ res = "roberta-bpe"

  if res is None:
  logger.warning("\n")
@@ -906,7 +967,7 @@ class GPTNeoXModel(Model):
  return tensors


- @Model.register("BloomForCausalLM")
+ @Model.register("BloomForCausalLM", "BloomModel")
  class BloomModel(Model):
  model_arch = gguf.MODEL_ARCH.BLOOM

@@ -1461,7 +1522,7 @@ class StableLMModel(Model):
  raise ValueError(f"Unprocessed norms: {norms}")


- @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+ @Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
  class LlamaModel(Model):
  model_arch = gguf.MODEL_ARCH.LLAMA

@@ -1487,6 +1548,17 @@ class LlamaModel(Model):
  special_vocab._set_special_token("eot", 32010)
  special_vocab.add_to_gguf(self.gguf_writer)

+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ if "add_prefix_space" in tokenizer_config_json:
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+ # Apply to granite small models only
+ if self.hparams.get("vocab_size", 32000) == 49152:
+ self.gguf_writer.add_add_bos_token(False)
+
  def set_gguf_parameters(self):
  super().set_gguf_parameters()
  hparams = self.hparams
@@ -1503,17 +1575,6 @@ class LlamaModel(Model):
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

- tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
- if tokenizer_config_file.is_file():
- with open(tokenizer_config_file, "r", encoding="utf-8") as f:
- tokenizer_config_json = json.load(f)
- if "add_prefix_space" in tokenizer_config_json:
- self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
- # Apply to granite small models only
- if self.hparams.get("vocab_size", 32000) == 49152:
- self.gguf_writer.add_add_bos_token(False)
-
  @staticmethod
  def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
  if n_head_kv is not None and n_head != n_head_kv:
@@ -1569,12 +1630,13 @@ class LlamaModel(Model):

  return [(self.map_tensor_name(name), data_torch)]

- def prepare_tensors(self):
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
  if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
  if rope_scaling.get("rope_type", '').lower() == "llama3":
  base = self.hparams.get("rope_theta", 10000.0)
- dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+ dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
  freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
  factor = rope_scaling.get("factor", 8.0)
  low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
  high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
@@ -1595,8 +1657,9 @@ class LlamaModel(Model):
  smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
  rope_factors.append(1 / ((1 - smooth) / factor + smooth))

- self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))

+ def prepare_tensors(self):
  super().prepare_tensors()

  if self._experts is not None:
@@ -1618,15 +1681,16 @@ class BitnetModel(Model):
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
  self.gguf_writer.add_rope_scaling_factor(1.0)

- def weight_quant(self, weight):
+ def weight_quant(self, weight: Tensor) -> Tensor:
  dtype = weight.dtype
  weight = weight.float()
- s = 1 / weight.abs().mean().clamp(min=1e-5)
- weight = (weight * s).round().clamp(-1, 1) / s
- scale = weight.abs().max().unsqueeze(0)
- weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
- weight = torch.sign(weight).type(dtype)
- return weight.type(dtype), scale.type(torch.float32)
+ scale = weight.abs().mean().clamp(min=1e-5)
+ iscale = 1 / scale
+ # TODO: multiply by the scale directly instead of inverting it twice
+ # (this is also unnecessarily doubly inverted upstream)
+ # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+ result = (weight * iscale).round().clamp(-1, 1) / iscale
+ return result.type(dtype)

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  new_name = self.map_tensor_name(name)
@@ -1641,11 +1705,9 @@ class BitnetModel(Model):
  gguf.MODEL_TENSOR.FFN_GATE,
  ]):
  # transform weight into 1/0/-1 (in fp32)
- weight_torch, scale_torch = self.weight_quant(data_torch)
- yield (new_name, weight_torch)
- yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
- else:
- yield (new_name, data_torch)
+ data_torch = self.weight_quant(data_torch)
+
+ yield (new_name, data_torch)


  @Model.register("GrokForCausalLM")
@@ -1764,7 +1826,7 @@ class DbrxModel(Model):

  return [(new_name, data_torch)]

- def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
  del name, new_name, bid # unused

  return n_dims > 1
@@ -1775,29 +1837,40 @@ class MiniCPMModel(Model):
  model_arch = gguf.MODEL_ARCH.MINICPM

  def set_gguf_parameters(self):
- block_count = self.hparams["num_hidden_layers"]
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
- self.gguf_writer.add_file_type(self.ftype)
+ super().set_gguf_parameters()
+ embedding_scale = float(self.hparams["scale_emb"])
+ self.gguf_writer.add_embedding_scale(embedding_scale)
+ logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+ residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+ self.gguf_writer.add_residual_scale(residual_scale)
+ logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+ logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+ self.gguf_writer.add_logit_scale(logit_scale)
+ logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+ if self.hparams.get("rope_scaling") is not None:
+ if self.hparams["rope_scaling"].get("type") == "longrope":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+ logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

- def set_vocab(self):
- self._set_vocab_llama_hf()
+ rope_scaling = self.find_hparam(['rope_scaling'], True)
+ if rope_scaling is not None:
+ long_factors = rope_scaling.get('long_factor', None)
+ short_factors = rope_scaling.get('short_factor', None)

- def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
- if n_kv_head is not None and n_head != n_kv_head:
- n_head = n_kv_head
+ if long_factors is None or short_factors is None:
+ raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

- return (
- weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
- .swapaxes(1, 2)
- .reshape(weights.shape)
- )
+ if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  del bid # unused
@@ -1807,13 +1880,66 @@ class MiniCPMModel(Model):

  # HF models permute some of the tensors, so we need to undo that
  if name.endswith(("q_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
  if name.endswith(("k_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

  return [(self.map_tensor_name(name), data_torch)]


+ @Model.register("MiniCPM3ForCausalLM")
+ class MiniCPM3Model(Model):
+ model_arch = gguf.MODEL_ARCH.MINICPM3
+
+ def set_gguf_parameters(self):
+ hparams = self.hparams
+
+ self.gguf_writer.add_file_type(self.ftype)
+ self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+ self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+ self.gguf_writer.add_block_count(self.block_count)
+ self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+ self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+ self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+ self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+ if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+ self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+ self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+ self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ rope_scaling = self.find_hparam(['rope_scaling'], True)
+ if rope_scaling is not None:
+ rope_dims = self.hparams["qk_rope_head_dim"]
+
+ long_factors = rope_scaling.get('long_factor', None)
+ short_factors = rope_scaling.get('short_factor', None)
+
+ if long_factors is None or short_factors is None:
+ raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+ if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()
+
+ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+ if n_kv_head is not None and n_head != n_kv_head:
+ n_head //= n_kv_head
+
+ return (
+ weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+ .swapaxes(1, 2)
+ .reshape(weights.shape)
+ )
+
+
  @Model.register("QWenLMHeadModel")
  class QwenModel(Model):
  model_arch = gguf.MODEL_ARCH.QWEN
@@ -1866,6 +1992,37 @@ class Qwen2Model(Model):
  except FileNotFoundError:
  self._set_vocab_gpt2()

+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "yarn":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
+ @Model.register("Qwen2VLForConditionalGeneration")
+ class Qwen2VLModel(Model):
+ model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+ mrope_section += [0] * max(0, 4 - len(mrope_section))
+ self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+ def set_vocab(self):
+ try:
+ self._set_vocab_sentencepiece()
+ except FileNotFoundError:
+ self._set_vocab_gpt2()
+
+ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+ for name, data in super().get_tensors():
+ if name.startswith("visual."):
+ continue
+ yield name, data
+

  @Model.register("Qwen2MoeForCausalLM")
  class Qwen2MoeModel(Model):
@@ -2113,6 +2270,13 @@ class Phi3MiniModel(Model):
  self.gguf_writer.add_file_type(self.ftype)
  self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))

+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ n_embd = self.find_hparam(["hidden_size", "n_embd"])
+ n_head = self.find_hparam(["num_attention_heads", "n_head"])
+ max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+ orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+ rope_dims = n_embd // n_head
+
  # write rope scaling for long context (128k) model
  rope_scaling = self.find_hparam(['rope_scaling'], True)
  if rope_scaling is None:
@@ -2142,8 +2306,8 @@ class Phi3MiniModel(Model):
  if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
  raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')

- self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
- self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))


  @Model.register("PlamoForCausalLM")
@@ -2403,7 +2567,7 @@ class InternLM2Model(Model):
  return [(self.map_tensor_name(name), data_torch)]


- @Model.register("BertModel", "CamembertModel")
+ @Model.register("BertModel", "CamembertModel", "RobertaModel")
  class BertModel(Model):
  model_arch = gguf.MODEL_ARCH.BERT

@@ -2444,7 +2608,8 @@ class BertModel(Model):

  # we need this to validate the size of the token_type embeddings
  # though currently we are passing all zeros to the token_type embeddings
- self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
+ # "Sequence A" or "Sequence B"
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))

  # convert to phantom space vocab
  def phantom(tok):
@@ -2505,6 +2670,117 @@ class NomicBertModel(BertModel):
  self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])


+ @Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
+ class XLMRobertaModel(BertModel):
+ model_arch = gguf.MODEL_ARCH.BERT
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # we need the pad_token_id to know how to chop down position_embd matrix
+ if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+ self._position_offset = 1 + pad_token_id
+ if "max_position_embeddings" in self.hparams:
+ self.hparams["max_position_embeddings"] -= self._position_offset
+ else:
+ self._position_offset = None
+
+ def set_vocab(self):
+ # to avoid TypeError: Descriptors cannot be created directly
+ # exception when importing sentencepiece_model_pb2
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+ from sentencepiece import SentencePieceProcessor
+ from sentencepiece import sentencepiece_model_pb2 as model
+
+ tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+ if not tokenizer_path.is_file():
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+ tokenizer = SentencePieceProcessor()
+ tokenizer.LoadFromFile(str(tokenizer_path))
+
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+ scores: list[float] = [-10000.0] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+ for token_id in range(tokenizer.vocab_size()):
+ piece = tokenizer.IdToPiece(token_id)
+ text = piece.encode("utf-8")
+ score = tokenizer.GetScore(token_id)
+
+ toktype = SentencePieceTokenTypes.NORMAL
+ if tokenizer.IsUnknown(token_id):
+ toktype = SentencePieceTokenTypes.UNKNOWN
+ elif tokenizer.IsControl(token_id):
+ toktype = SentencePieceTokenTypes.CONTROL
+ elif tokenizer.IsUnused(token_id):
+ toktype = SentencePieceTokenTypes.UNUSED
+ elif tokenizer.IsByte(token_id):
+ toktype = SentencePieceTokenTypes.BYTE
+
+ tokens[token_id] = text
+ scores[token_id] = score
+ toktypes[token_id] = toktype
+
+ if vocab_size > len(tokens):
+ pad_count = vocab_size - len(tokens)
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+ for i in range(1, pad_count + 1):
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+ scores.append(-1000.0)
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+ # realign tokens (see HF tokenizer code)
+ tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+ scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+ toktypes = [
+ SentencePieceTokenTypes.CONTROL,
+ SentencePieceTokenTypes.CONTROL,
+ SentencePieceTokenTypes.CONTROL,
+ SentencePieceTokenTypes.UNKNOWN,
+ ] + toktypes[3:-1]
+
+ self.gguf_writer.add_tokenizer_model("t5")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+ self.gguf_writer.add_add_space_prefix(add_prefix)
+ self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+ if precompiled_charsmap:
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ self.gguf_writer.add_add_bos_token(True)
+ self.gguf_writer.add_add_eos_token(True)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # if name starts with "roberta.", remove the prefix
+ # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+ if name.startswith("roberta."):
+ name = name[8:]
+
+ # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+ if name == "embeddings.position_embeddings.weight":
+ if self._position_offset is not None:
+ data_torch = data_torch[self._position_offset:,:]
+
+ return super().modify_tensors(data_torch, name, bid)
+
+
  @Model.register("GemmaForCausalLM")
  class GemmaModel(Model):
  model_arch = gguf.MODEL_ARCH.GEMMA
@@ -2608,7 +2884,90 @@ class StarCoder2Model(Model):
  model_arch = gguf.MODEL_ARCH.STARCODER2


- @Model.register("MambaForCausalLM", "MambaLMHeadModel")
+ @Model.register("Rwkv6ForCausalLM")
+ class Rwkv6Model(Model):
+ model_arch = gguf.MODEL_ARCH.RWKV6
+
+ def set_vocab(self):
+ assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+ vocab_size = self.hparams.get("vocab_size", 65536)
+
+ tokens: list[bytes] = ['<s>'.encode("utf-8")]
+ toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+ with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+ lines = f.readlines()
+ for line in lines:
+ parts = line.split(' ')
+ assert len(parts) >= 3
+ token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+ token = token.encode("utf-8") if isinstance(token, str) else token
+ assert isinstance(token, bytes)
+ assert len(token) == token_len
+ token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
+ tokens.append(token_text.encode("utf-8"))
+ toktypes.append(gguf.TokenType.NORMAL)
+ remainder = vocab_size - len(tokens)
+ assert remainder >= 0
+ for i in range(len(tokens), vocab_size):
+ tokens.append(f"[PAD{i}]".encode("utf-8"))
+ toktypes.append(gguf.TokenType.UNUSED)
+
+ self.gguf_writer.add_tokenizer_model("rwkv")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_types(toktypes)
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+ special_vocab.chat_template = "rwkv-world"
+ # hack: Add '\n\n' as the EOT token to make it chat normally
+ special_vocab._set_special_token("eot", 261)
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ def set_gguf_parameters(self):
+ block_count = self.hparams["num_hidden_layers"]
+ head_size = self.hparams["head_size"]
+ hidden_size = self.hparams["hidden_size"]
+ layer_norm_eps = self.hparams["layer_norm_epsilon"]
+ rescale_every_n_layers = self.hparams["rescale_every"]
+ intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+ time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+ time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+ # RWKV isn't context limited
+ self.gguf_writer.add_context_length(1048576)
+ self.gguf_writer.add_embedding_length(hidden_size)
+ self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+ self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+ self.gguf_writer.add_wkv_head_size(head_size)
+ self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+ self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+ self.gguf_writer.add_feed_forward_length(intermediate_size)
+ self.gguf_writer.add_file_type(self.ftype)
+
+ # required by llama.cpp, unused
+ self.gguf_writer.add_head_count(0)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ new_name = self.map_tensor_name(name)
+
+ if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+ new_name += ".weight"
+
+ if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+ data_torch = data_torch.transpose(0, 1)
+
+ if new_name.endswith("time_mix_w2.weight"):
+ data_torch = data_torch.permute(0, 2, 1)
+
+ rescale_every_n_layers = self.hparams["rescale_every"]
+ if rescale_every_n_layers > 0:
+ if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+ data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+
+ yield (new_name, data_torch)
+
+
+ @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
  class MambaModel(Model):
  model_arch = gguf.MODEL_ARCH.MAMBA

@@ -2639,7 +2998,10 @@ class MambaModel(Model):
  # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
  dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
  rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
-
+ use_dt_b_c_norm = False
+ # For falconmamba we do apply RMS norm on B / DT and C layers
+ if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
+ use_dt_b_c_norm = True
  # Fail early for models which don't have a block expansion factor of 2
  assert d_inner == 2 * d_model

@@ -2647,12 +3009,13 @@ class MambaModel(Model):
  self.gguf_writer.add_embedding_length(d_model)
  self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
  self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
- self.gguf_writer.add_block_count(self.hparams["n_layer"])
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_ssm_conv_kernel(d_conv)
  self.gguf_writer.add_ssm_inner_size(d_inner)
  self.gguf_writer.add_ssm_state_size(d_state)
  self.gguf_writer.add_ssm_time_step_rank(dt_rank)
  self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+ self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
  self.gguf_writer.add_file_type(self.ftype)

  _tok_embd = None
@@ -2679,19 +3042,6 @@ class MambaModel(Model):

  return [(new_name, data_torch)]

- def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
- del n_dims # unused
-
- return bid is not None and new_name in (
- self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
- gguf.MODEL_TENSOR.SSM_CONV1D,
- gguf.MODEL_TENSOR.SSM_X,
- gguf.MODEL_TENSOR.SSM_DT,
- gguf.MODEL_TENSOR.SSM_A,
- gguf.MODEL_TENSOR.SSM_D,
- ]
- )
-

  @Model.register("CohereForCausalLM")
  class CommandR2Model(Model):
@@ -2739,9 +3089,74 @@ class OlmoModel(Model):
  return [(self.map_tensor_name(name), data_torch)]


- @Model.register("JinaBertModel", "JinaBertForMaskedLM")
- class JinaBertV2Model(BertModel):
- model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
+ @Model.register("Olmo2ForCausalLM")
+ class Olmo2Model(Model):
+ model_arch = gguf.MODEL_ARCH.OLMO2
+
+
+ @Model.register("OlmoeForCausalLM")
+ class OlmoeModel(Model):
+ model_arch = gguf.MODEL_ARCH.OLMOE
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+ if (n_experts := self.hparams.get("num_experts")) is not None:
+ self.gguf_writer.add_expert_count(n_experts)
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ # Copied from: Qwen2MoeModel
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # process the experts separately
+ if name.find("experts") != -1:
+ n_experts = self.hparams["num_experts"]
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ # Copied from: Qwen2MoeModel
+ def prepare_tensors(self):
+ super().prepare_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
+ @Model.register("JinaBertModel", "JinaBertForMaskedLM")
+ class JinaBertV2Model(BertModel):
+ model_arch = gguf.MODEL_ARCH.JINA_BERT_V2

  def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)
@@ -2777,6 +3192,14 @@ class JinaBertV2Model(BertModel):
  self.gguf_writer.add_add_bos_token(True)
  self.gguf_writer.add_add_eos_token(True)

+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # if name starts with "bert.", remove the prefix
+ # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+ if name.startswith("bert."):
+ name = name[5:]
+
+ return super().modify_tensors(data_torch, name, bid)
+

  @Model.register("OpenELMForCausalLM")
  class OpenELMModel(Model):
@@ -3226,6 +3649,145 @@ class T5Model(Model):
3226
3649
  return [(self.map_tensor_name(name), data_torch)]
3227
3650
 
3228
3651
 
3652
+ @Model.register("T5EncoderModel")
3653
+ class T5EncoderModel(Model):
3654
+ model_arch = gguf.MODEL_ARCH.T5ENCODER
3655
+
3656
+ def __init__(self, *args, **kwargs):
3657
+ super().__init__(*args, **kwargs)
3658
+ self.shared_token_embeddings_found = False
3659
+
3660
+ def set_vocab(self):
3661
+ # to avoid TypeError: Descriptors cannot be created directly
3662
+ # exception when importing sentencepiece_model_pb2
3663
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
3664
+ from sentencepiece import SentencePieceProcessor
3665
+ from sentencepiece import sentencepiece_model_pb2 as model
3666
+
3667
+ tokenizer_path = self.dir_model / 'tokenizer.model'
3668
+
3669
+ # many older models use spiece.model tokenizer model filename
3670
+ if not tokenizer_path.is_file():
3671
+ tokenizer_path = self.dir_model / 'spiece.model'
3672
+
3673
+ if not tokenizer_path.is_file():
3674
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
3675
+
3676
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
3677
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
3678
+
3679
+ # some models like Pile-T5 family use BPE tokenizer instead of Unigram
3680
+ if sentencepiece_model.trainer_spec.model_type == 2: # BPE
3681
+ # assure the tokenizer model file name is correct
3682
+ assert tokenizer_path.name == 'tokenizer.model'
3683
+ return self._set_vocab_sentencepiece()
3684
+ else:
3685
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
3686
+
3687
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
3688
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
3689
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
3690
+
3691
+ tokenizer = SentencePieceProcessor()
3692
+ tokenizer.LoadFromFile(str(tokenizer_path))
3693
+
3694
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
3695
+
3696
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
3697
+ scores: list[float] = [-10000.0] * vocab_size
3698
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
3699
+
3700
+ for token_id in range(tokenizer.vocab_size()):
3701
+ piece = tokenizer.IdToPiece(token_id)
3702
+             text = piece.encode("utf-8")
+             score = tokenizer.GetScore(token_id)
+
+             toktype = SentencePieceTokenTypes.NORMAL
+             if tokenizer.IsUnknown(token_id):
+                 toktype = SentencePieceTokenTypes.UNKNOWN
+             elif tokenizer.IsControl(token_id):
+                 toktype = SentencePieceTokenTypes.CONTROL
+             elif tokenizer.IsUnused(token_id):
+                 toktype = SentencePieceTokenTypes.UNUSED
+             elif tokenizer.IsByte(token_id):
+                 toktype = SentencePieceTokenTypes.BYTE
+
+             tokens[token_id] = text
+             scores[token_id] = score
+             toktypes[token_id] = toktype
+
+         added_tokens_file = self.dir_model / 'added_tokens.json'
+         if added_tokens_file.is_file():
+             with open(added_tokens_file, "r", encoding="utf-8") as f:
+                 added_tokens_json = json.load(f)
+                 for key in added_tokens_json:
+                     token_id = added_tokens_json[key]
+                     if token_id >= vocab_size:
+                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                         continue
+
+                     tokens[token_id] = key.encode("utf-8")
+                     scores[token_id] = -1000.0
+                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+         if vocab_size > len(tokens):
+             pad_count = vocab_size - len(tokens)
+             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+             for i in range(1, pad_count + 1):
+                 tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                 scores.append(-1000.0)
+                 toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+         self.gguf_writer.add_tokenizer_model("t5")
+         self.gguf_writer.add_tokenizer_pre("default")
+         self.gguf_writer.add_token_list(tokens)
+         self.gguf_writer.add_token_scores(scores)
+         self.gguf_writer.add_token_types(toktypes)
+         self.gguf_writer.add_add_space_prefix(add_prefix)
+         self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+         if precompiled_charsmap:
+             self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+         special_vocab.add_to_gguf(self.gguf_writer)
+
+         self.gguf_writer.add_add_bos_token(False)
+         self.gguf_writer.add_add_eos_token(True)
+
+     def set_gguf_parameters(self):
+         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+             n_ctx = 512
+         self.gguf_writer.add_context_length(n_ctx)
+         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+         self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+         self.gguf_writer.add_block_count(self.hparams["num_layers"])
+         self.gguf_writer.add_head_count(self.hparams["num_heads"])
+         self.gguf_writer.add_key_length(self.hparams["d_kv"])
+         self.gguf_writer.add_value_length(self.hparams["d_kv"])
+         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+         self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+         self.gguf_writer.add_file_type(self.ftype)
+
+     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+         del bid # unused
+
+         # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+         # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+         # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+         # and decoder and ignore the remaining ones.
+         if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+             if not self.shared_token_embeddings_found:
+                 name = "shared.weight"
+                 self.shared_token_embeddings_found = True
+             else:
+                 logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                 return []
+
+         return [(self.map_tensor_name(name), data_torch)]
+
+
  @Model.register("JAISLMHeadModel")
  class JaisModel(Model):
      model_arch = gguf.MODEL_ARCH.JAIS
@@ -3240,10 +3802,7 @@ class JaisModel(Model):

          # Embeddings scale
          self.embeddings_scale = 1.0
-         # note: For some JAIS flavors, output is tied to (same as) wte in original model
-         self.output_is_wte = False
          if 'mup_embeddings_scale' in self.hparams:
-             self.output_is_wte = True # Hack (?)
              self.embeddings_scale = self.hparams['mup_embeddings_scale']
          elif 'embeddings_scale' in self.hparams:
              self.embeddings_scale = self.hparams['embeddings_scale']
@@ -3300,10 +3859,7 @@ class JaisModel(Model):

          if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
              tensors.append((new_name, data_torch * self.embeddings_scale))
-             if self.output_is_wte:
-                 tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
          elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
-             assert not self.output_is_wte
              tensors.append((new_name, data_torch * self.width_scale))
          else:
              tensors.append((new_name, data_torch))
@@ -3497,8 +4053,218 @@ class ChatGLMModel(Model):
          name = name.removeprefix("transformer.")
          return [(self.map_tensor_name(name), data_torch)]

- ###### CONVERSION LOGIC ######

+ @Model.register("NemotronForCausalLM")
+ class NemotronModel(Model):
+     model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+     def set_vocab(self):
+         self._set_vocab_sentencepiece()
+         self.gguf_writer.add_pad_token_id(0)
+         self.gguf_writer.add_unk_token_id(1)
+
+     def set_gguf_parameters(self):
+         super().set_gguf_parameters()
+         hparams = self.hparams
+         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+         f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+         self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+         # * Partial RoPE
+         rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+         n_embd = self.find_hparam(["hidden_size", "n_embd"])
+         n_head = self.find_hparam(["num_attention_heads", "n_head"])
+         self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+         # * RopeScaling for Nemotron
+         if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+         else:
+             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+             self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+         # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+         # model.layers.{l}.input_layernorm.weight
+         # model.layers.{l}.post_attention_layernorm.weight
+         # model.norm.weight
+         if name.endswith("norm.weight"):
+             data_torch = data_torch + 1
+
+         return [(self.map_tensor_name(name), data_torch)]
+
+
+ @Model.register("ExaoneForCausalLM")
+ class ExaoneModel(Model):
+     model_arch = gguf.MODEL_ARCH.EXAONE
+
+     def set_gguf_parameters(self):
+         hparams = self.hparams
+
+         assert (hparams["activation_function"] == "silu")
+
+         max_position_embeddings = hparams["max_position_embeddings"]
+         embed_dim = hparams["hidden_size"]
+         num_heads = hparams["num_attention_heads"]
+         num_kv_heads = hparams.get("num_key_value_heads", num_heads)
+         layer_norm_eps = hparams["layer_norm_epsilon"]
+         intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+         num_layers = hparams["num_layers"]
+         # ignore for now as EXAONE-3.0-7.8B-Instruct attention_dropout is 0.0
+         # attention_dropout_rate = hparams["attention_dropout"]
+         # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
+         # embed_dropout_rate = hparams["embed_dropout"]
+         self.gguf_writer.add_embedding_length(embed_dim)
+         self.gguf_writer.add_head_count(num_heads)
+         self.gguf_writer.add_head_count_kv(num_kv_heads)
+         self.gguf_writer.add_context_length(max_position_embeddings)
+         self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
+         self.gguf_writer.add_feed_forward_length(intermediate_size)
+         self.gguf_writer.add_block_count(num_layers)
+         self.gguf_writer.add_file_type(self.ftype)
+
+         if (rope_theta := self.hparams.get("rope_theta")) is not None:
+             self.gguf_writer.add_rope_freq_base(rope_theta)
+         rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+         rotary_factor = rotary_factor if rotary_factor is not None else 1.0
+         self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+         if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
+             if hparams["rope_scaling"].get("type") == "linear":
+                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                 self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+             if rope_scaling.get("rope_type", '').lower() == "llama3":
+                 base = self.hparams.get("rope_theta", 10000.0)
+                 dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                 factor = rope_scaling.get("factor", 8.0)
+                 low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                 high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                 low_freq_wavelen = old_context_len / low_freq_factor
+                 high_freq_wavelen = old_context_len / high_freq_factor
+                 assert low_freq_wavelen != high_freq_wavelen
+
+                 rope_factors = []
+                 for freq in freqs:
+                     wavelen = 2 * math.pi / freq
+                     if wavelen < high_freq_wavelen:
+                         rope_factors.append(1)
+                     elif wavelen > low_freq_wavelen:
+                         rope_factors.append(factor)
+                     else:
+                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
+ @Model.register("GraniteForCausalLM")
+ class GraniteModel(LlamaModel):
+     """Conversion for IBM's GraniteForCausalLM"""
+     model_arch = gguf.MODEL_ARCH.GRANITE
+
+     def set_gguf_parameters(self):
+         """Granite uses standard llama parameters with the following differences:
+
+         - No head_dim support
+         - New multiplier params:
+             - attention_scale
+             - embedding_scale
+             - residual_scale
+             - logits_scaling
+         """
+         if head_dim := self.hparams.pop("head_dim", None):
+             logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+         super().set_gguf_parameters()
+         # NOTE: Convert _multiplier params to _scale params for naming
+         # consistency
+         if attention_scale := self.hparams.get("attention_multiplier"):
+             self.gguf_writer.add_attention_scale(attention_scale)
+             logger.info("gguf: (granite) attention_scale = %s", attention_scale)
+         if embedding_scale := self.hparams.get("embedding_multiplier"):
+             self.gguf_writer.add_embedding_scale(embedding_scale)
+             logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
+         if residual_scale := self.hparams.get("residual_multiplier"):
+             self.gguf_writer.add_residual_scale(residual_scale)
+             logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+         if logits_scale := self.hparams.get("logits_scaling"):
+             self.gguf_writer.add_logit_scale(logits_scale)
+             logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+ @Model.register("GraniteMoeForCausalLM")
+ class GraniteMoeModel(GraniteModel):
+     """Conversion for IBM's GraniteMoeForCausalLM"""
+     model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+         """In modeling_granitemoe, the JetMoe implementation of parallel experts
+         is used. This essentially merges w1 and w3 into a single tensor with 2x
+         the hidden size that is then split during forward. To keep compatibility
+         with existing mixtral support, we pull them apart here.
+         """
+
+         if name.endswith("block_sparse_moe.input_linear.weight"):
+             ffn_dim = self.hparams["intermediate_size"]
+             assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+             gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+             return [
+                 (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                 (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+             ]
+
+         return super().modify_tensors(data_torch, name, bid)
+
+
+ @Model.register("ChameleonForConditionalGeneration")
+ @Model.register("ChameleonForCausalLM") # obsolete
+ class ChameleonModel(Model):
+     model_arch = gguf.MODEL_ARCH.CHAMELEON
+
+     def set_gguf_parameters(self):
+         super().set_gguf_parameters()
+         self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
+
+     def set_vocab(self):
+         self._set_vocab_gpt2()
+
+     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+         # ignore image tokenizer for now
+         # TODO: remove this once image support is implemented for Chameleon
+         if name.startswith("model.vqmodel"):
+             return []
+
+         n_head = self.hparams["num_attention_heads"]
+         n_kv_head = self.hparams.get("num_key_value_heads")
+         hidden_dim = self.hparams.get("hidden_size")
+
+         if name.endswith(("q_proj.weight", "q_proj.bias")):
+             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+         if name.endswith(("k_proj.weight", "k_proj.bias")):
+             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+         if name.endswith(("q_norm.weight", "q_norm.bias")):
+             data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
+         if name.endswith(("k_norm.weight", "k_norm.bias")):
+             data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
+
+         return [(self.map_tensor_name(name), data_torch)]
+
+     # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
+     @staticmethod
+     def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
+         head_dim = hidden_dim // n_heads
+         data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
+         data_torch = data_torch.repeat_interleave(n_heads, 0)
+         return data_torch
+
+
+ ###### CONVERSION LOGIC ######

  # tree of lazy tensors
  class LazyTorchTensor(gguf.LazyBase):
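
The GraniteMoeModel.modify_tensors code in the hunk above splits the merged JetMoe-style block_sparse_moe.input_linear.weight tensor into separate gate and up expert tensors along its second-to-last dimension. A minimal standalone sketch of that slicing, using assumed toy shapes (illustrative only, not part of the packaged converter):

    import torch

    # Assumed toy dimensions: 4 experts, intermediate (ffn) size 8, hidden size 16.
    n_expert, ffn_dim, hidden = 4, 8, 16
    merged = torch.randn(n_expert, 2 * ffn_dim, hidden)  # stand-in for input_linear.weight

    # Same slicing as the converter: first ffn_dim rows form the gate, the rest form up.
    gate, up = merged[..., :ffn_dim, :], merged[..., ffn_dim:, :]
    assert gate.shape == (n_expert, ffn_dim, hidden)
    assert up.shape == (n_expert, ffn_dim, hidden)
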
@@ -3578,8 +4344,8 @@ def parse_args() -> argparse.Namespace:
          help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
      )
      parser.add_argument(
-         "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-         help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+         "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+         help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
      )
      parser.add_argument(
          "--bigendian", action="store_true",
@@ -3666,6 +4432,8 @@ def main() -> None:
          "f16": gguf.LlamaFileType.MOSTLY_F16,
          "bf16": gguf.LlamaFileType.MOSTLY_BF16,
          "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+         "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+         "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
          "auto": gguf.LlamaFileType.GUESSED,
      }
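
The expanded --outtype choices and the new MOSTLY_TQ1_0 / MOSTLY_TQ2_0 mappings in main() mean the converter can emit ternary-quantized GGUF files directly. A hypothetical invocation (the positional model-directory argument is assumed here; only the --outtype values are taken from the diff above):

    # sketch only: convert a local Hugging Face model directory to a TQ2_0 GGUF
    python convert_hf_to_gguf.py ./my-hf-model --outtype tq2_0
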