bigdl-core-cpp 2.5.0rc1__py3-none-win_amd64.whl → 2.6.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +1673 -278
- bigdl/cpp/convert_hf_to_gguf_update.py +381 -0
- bigdl/cpp/convert_llama_ggml_to_gguf.py +450 -0
- bigdl/cpp/convert_lora_to_gguf.py +461 -0
- bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
- bigdl/cpp/gguf-py/gguf/constants.py +698 -171
- bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +108 -17
- bigdl/cpp/gguf-py/gguf/lazy.py +3 -1
- bigdl/cpp/gguf-py/gguf/metadata.py +195 -76
- bigdl/cpp/gguf-py/gguf/quants.py +1210 -64
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +262 -43
- bigdl/cpp/gguf-py/gguf/utility.py +2 -2
- bigdl/cpp/gguf-py/gguf/vocab.py +325 -3
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/ggml-base.dll +0 -0
- bigdl/cpp/libs/ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ggml.dll +0 -0
- bigdl/cpp/libs/libc++.dll +0 -0
- bigdl/cpp/libs/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama-gemma3-cli.exe +0 -0
- bigdl/cpp/libs/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama-lookup.exe +0 -0
- bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama-server.exe +0 -0
- bigdl/cpp/libs/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ollama-lib.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/ollama_ggml.dll +0 -0
- bigdl/cpp/libs/ollama_llama.dll +0 -0
- bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
- {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.bat +7 -2
- bigdl_core_cpp-2.6.0.data/scripts/init-ollama.bat +16 -0
- {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0.dist-info}/METADATA +9 -5
- bigdl_core_cpp-2.6.0.dist-info/RECORD +57 -0
- {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0.dist-info}/WHEEL +1 -1
- bigdl/cpp/convert.py +0 -1714
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/ggml_shared.dll +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- bigdl_core_cpp-2.5.0rc1.data/scripts/init-ollama.bat +0 -13
- bigdl_core_cpp-2.5.0rc1.dist-info/RECORD +0 -63
- {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0.dist-info}/top_level.txt +0 -0
bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py}

@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import ast
 import logging
 import argparse
 import contextlib
@@ -14,6 +15,7 @@ from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from itertools import chain
 
 import math
 import numpy as np
@@ -70,7 +72,8 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
@@ -85,7 +88,7 @@ class Model:
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-        self.hparams = Model.load_hparams(self.dir_model)
+        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
@@ -129,12 +132,14 @@ class Model:
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()
 
-        if
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
             self.tensor_names = set()
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            index_name += ".index.json"
             logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open(
+            with open(index_file, "r", encoding="utf-8") as f:
                 index: dict[str, Any] = json.load(f)
                 weight_map = index.get("weight_map")
                 if weight_map is None or not isinstance(weight_map, dict):
@@ -142,6 +147,7 @@ class Model:
                 self.tensor_names.update(weight_map.keys())
         else:
             self.tensor_names = tensor_names_from_parts
+            weight_map = {}
 
         for part_name in self.part_names:
             logger.info(f"gguf: loading model part '{part_name}'")
@@ -168,9 +174,17 @@ class Model:
                     data = LazyTorchTensor.from_eager(data)
                 yield name, data
 
-        #
-        if len(
-
+        # verify tensor name presence and identify potentially missing files
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                 f"Missing tensors: {missing}\n"
+                                 f"Extra tensors: {extra}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -207,17 +221,17 @@ class Model:
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")
 
-        n_embd
-
-
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")
 
         if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")
 
-        n_head
-
-
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")
 
         if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
             self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -251,20 +265,19 @@ class Model:
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims # unused
 
         return False
 
-
-
-
-        return False
+    # some models need extra generated tensors (like rope_freqs)
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        return ()
 
     def prepare_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
-        for name, data_torch in self.get_tensors():
+        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue
@@ -282,57 +295,83 @@ class Model:
                     bid = int(part)
                     break
 
-            for new_name,
-
-
-
-                data_qtype: gguf.GGMLQuantizationType | None = None
+            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                # TODO: why do we squeeze here?
+                # data = data_torch.squeeze().numpy()
+                data = data_torch.numpy()
 
-                #
-
-
+                # if data ends up empty, it means data_torch was a scalar tensor -> restore
+                if len(data.shape) == 0:
+                    data = data_torch.numpy()
+
+                n_dims = len(data.shape)
+                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-
-
-                    extra_f32,
-                    n_dims == 1,
-                    new_name.endswith("_norm.weight"),
-                ))
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                 # Some tensor types are always in float32
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if data_qtype is False and (
+                    any(
+                        self.match_model_tensor_name(new_name, key, bid)
+                        for key in (
+                            gguf.MODEL_TENSOR.FFN_GATE_INP,
+                            gguf.MODEL_TENSOR.POS_EMBD,
+                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                            gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+                            gguf.MODEL_TENSOR.POSNET_NORM1,
+                            gguf.MODEL_TENSOR.POSNET_NORM2,
+                        )
+                    )
+                    or not new_name.endswith(".weight")
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
-
-
-
-
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
 
-
-
-
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+                if isinstance(data_qtype, bool):
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                         data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
 
-
-
-
-
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
 
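For readers skimming the hunk above: the new logic first lets `tensor_force_quant` request a type, then forces 1-D and `*_norm.weight` tensors to F32, and only falls back to the requested file type when no explicit override is left. A minimal sketch of that fallback order, assuming the gguf-py enums shipped in this wheel (the helper name and simplified signature are illustrative, not part of the package):

```python
import gguf

# Rough sketch of the per-tensor type selection in Model.prepare_tensors above.
# An explicit GGMLQuantizationType from tensor_force_quant() wins unless the
# tensor is 1-D or a norm weight; a bare True/False falls through to the file type.
def pick_qtype(ftype, override, n_dims, new_name):
    if n_dims <= 1 or new_name.endswith("_norm.weight"):
        return gguf.GGMLQuantizationType.F32
    if not isinstance(override, bool):
        return override
    defaults = {
        gguf.LlamaFileType.ALL_F32:     gguf.GGMLQuantizationType.F32,
        gguf.LlamaFileType.MOSTLY_F16:  gguf.GGMLQuantizationType.F16,
        gguf.LlamaFileType.MOSTLY_BF16: gguf.GGMLQuantizationType.BF16,
        gguf.LlamaFileType.MOSTLY_Q8_0: gguf.GGMLQuantizationType.Q8_0,
    }
    return defaults[ftype]  # the TQ1_0/TQ2_0 file types are handled the same way in the real code
```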
@@ -439,6 +478,11 @@ class Model:
             return modelcls
         return func
 
+    @classmethod
+    def print_registered_models(cls):
+        for name in sorted(cls._model_classes.keys()):
+            logger.error(f"- {name}")
+
     @classmethod
     def from_model_architecture(cls, arch: str) -> type[Model]:
         try:
@@ -491,9 +535,19 @@ class Model:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                         token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
@@ -504,7 +558,7 @@ class Model:
 
     # NOTE: this function is generated by convert_hf_to_gguf_update.py
     # do not modify it manually!
-    # ref: https://github.com/
+    # ref: https://github.com/ggml-org/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -537,9 +591,15 @@ class Model:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+            res = "bert-bge-large"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/mosaicml/mpt-7b
             res = "mpt"
@@ -567,6 +627,9 @@ class Model:
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
             # ref: https://huggingface.co/databricks/dbrx-base
             res = "dbrx"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+            res = "jina-v1-en"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -585,7 +648,7 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -603,6 +666,39 @@ class Model:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
+        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+            # ref: https://huggingface.co/facebook/chameleon-7b
+            res = "chameleon"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
+        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+            res = "roberta-bpe"
+        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+            res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"
+        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+            res = "deepseek-r1-qwen"
 
         if res is None:
             logger.warning("\n")
@@ -612,7 +708,7 @@ class Model:
             logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("** - the pre-tokenization config has changed upstream")
             logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref: https://github.com/
+            logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh: {chkhsh}")
             logger.warning("**************************************************************************************")
@@ -625,6 +721,9 @@ class Model:
         return res
     # Marker: End get_vocab_base_pre
 
+    def _set_vocab_none(self) -> None:
+        self.gguf_writer.add_tokenizer_model("none")
+
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
@@ -906,7 +1005,7 @@ class GPTNeoXModel(Model):
         return tensors
 
 
-@Model.register("BloomForCausalLM")
+@Model.register("BloomForCausalLM", "BloomModel")
 class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM
 
@@ -1461,7 +1560,7 @@ class StableLMModel(Model):
                 raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
@@ -1487,6 +1586,17 @@ class LlamaModel(Model):
             special_vocab._set_special_token("eot", 32010)
             special_vocab.add_to_gguf(self.gguf_writer)
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -1503,17 +1613,6 @@ class LlamaModel(Model):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                if "add_prefix_space" in tokenizer_config_json:
-                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
-        # Apply to granite small models only
-        if self.hparams.get("vocab_size", 32000) == 49152:
-            self.gguf_writer.add_add_bos_token(False)
-
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1569,12 +1668,13 @@ class LlamaModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
                 factor = rope_scaling.get("factor", 8.0)
                 low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
                 high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
@@ -1595,8 +1695,9 @@ class LlamaModel(Model):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
+    def prepare_tensors(self):
         super().prepare_tensors()
 
         if self._experts is not None:
@@ -1606,6 +1707,178 @@ class LlamaModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention-layer***
+            # if n_heads_in_group is not None, then
+            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+            # _num_heads[il] is num_attention_head
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+                self.gguf_writer.add_rope_freq_base(rope_theta)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else: # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
@@ -1618,15 +1891,16 @@ class BitnetModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
-
-
-        scale
-
-
-
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # TODO: multiply by the scale directly instead of inverting it twice
+        # (this is also unnecessarily doubly inverted upstream)
+        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
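The rewritten `weight_quant` above is the round-and-clip ternarization from the referenced BitNet `utils_quant.py`; the `iscale` double inversion is kept only to mirror upstream. A small standalone equivalent using the direct multiply that the TODO suggests (illustrative sketch, not code shipped in the wheel):

```python
import torch

def ternarize(weight: torch.Tensor) -> torch.Tensor:
    # per-tensor scale from the mean absolute value, clamped away from zero
    scale = weight.abs().mean().clamp(min=1e-5)
    # snap each value to {-1, 0, +1} on the scaled grid, then map back
    return (weight / scale).round().clamp(-1, 1) * scale

w = torch.randn(4, 4)
print(ternarize(w))  # every entry is -scale, 0, or +scale
```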
@@ -1641,11 +1915,9 @@ class BitnetModel(Model):
             gguf.MODEL_TENSOR.FFN_GATE,
         ]):
             # transform weight into 1/0/-1 (in fp32)
-
-
-
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")
@@ -1764,7 +2036,7 @@ class DbrxModel(Model):
 
         return [(new_name, data_torch)]
 
-    def
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid # unused
 
         return n_dims > 1
@@ -1775,29 +2047,40 @@ class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
 
     def set_gguf_parameters(self):
-
-
-        self.gguf_writer.
-
-        self.
-        self.gguf_writer.
-
-        self.
-        self.gguf_writer.
-
+        super().set_gguf_parameters()
+        embedding_scale = float(self.hparams["scale_emb"])
+        self.gguf_writer.add_embedding_scale(embedding_scale)
+        logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+        residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+        self.gguf_writer.add_residual_scale(residual_scale)
+        logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+        self.gguf_writer.add_logit_scale(logit_scale)
+        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+        if self.hparams.get("rope_scaling") is not None:
+            if self.hparams["rope_scaling"].get("type") == "longrope":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+                logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 
-
-
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
 
-
-
-            n_head = n_kv_head
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
 
-
-
-
-            .
-
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -1807,13 +2090,66 @@ class MiniCPMModel(Model):
 
         # HF models permute some of the tensors, so we need to undo that
         if name.endswith(("q_proj.weight")):
-            data_torch =
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight")):
-            data_torch =
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("MiniCPM3ForCausalLM")
+class MiniCPM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.MINICPM3
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            rope_dims = self.hparams["qk_rope_head_dim"]
+
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
 @Model.register("QWenLMHeadModel")
 class QwenModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN
|
@@ -1866,6 +2202,75 @@ class Qwen2Model(Model):
|
|
1866
2202
|
except FileNotFoundError:
|
1867
2203
|
self._set_vocab_gpt2()
|
1868
2204
|
|
2205
|
+
def set_gguf_parameters(self):
|
2206
|
+
super().set_gguf_parameters()
|
2207
|
+
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
2208
|
+
if self.hparams["rope_scaling"].get("type") == "yarn":
|
2209
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
2210
|
+
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
2211
|
+
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
2212
|
+
|
2213
|
+
|
2214
|
+
@Model.register("Qwen2VLForConditionalGeneration")
|
2215
|
+
class Qwen2VLModel(Model):
|
2216
|
+
model_arch = gguf.MODEL_ARCH.QWEN2VL
|
2217
|
+
|
2218
|
+
def set_gguf_parameters(self):
|
2219
|
+
super().set_gguf_parameters()
|
2220
|
+
mrope_section = self.hparams["rope_scaling"]["mrope_section"]
|
2221
|
+
mrope_section += [0] * max(0, 4 - len(mrope_section))
|
2222
|
+
self.gguf_writer.add_rope_dimension_sections(mrope_section)
|
2223
|
+
|
2224
|
+
def set_vocab(self):
|
2225
|
+
try:
|
2226
|
+
self._set_vocab_sentencepiece()
|
2227
|
+
except FileNotFoundError:
|
2228
|
+
self._set_vocab_gpt2()
|
2229
|
+
|
2230
|
+
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
2231
|
+
for name, data in super().get_tensors():
|
2232
|
+
if name.startswith("visual."):
|
2233
|
+
continue
|
2234
|
+
yield name, data
|
2235
|
+
|
2236
|
+
|
2237
|
+
@Model.register("WavTokenizerDec")
|
2238
|
+
class WavTokenizerDecModel(Model):
|
2239
|
+
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
|
2240
|
+
|
2241
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2242
|
+
del bid # unused
|
2243
|
+
|
2244
|
+
if \
|
2245
|
+
name.endswith("codebook.cluster_size") or \
|
2246
|
+
name.endswith("codebook.embed_avg") or \
|
2247
|
+
name.endswith("codebook.inited"):
|
2248
|
+
logger.debug(f"Skipping {name!r}")
|
2249
|
+
return []
|
2250
|
+
|
2251
|
+
logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
|
2252
|
+
|
2253
|
+
return [(self.map_tensor_name(name), data_torch)]
|
2254
|
+
|
2255
|
+
def set_vocab(self):
|
2256
|
+
self._set_vocab_none()
|
2257
|
+
|
2258
|
+
def set_gguf_parameters(self):
|
2259
|
+
super().set_gguf_parameters()
|
2260
|
+
self.gguf_writer.add_vocab_size (self.hparams["vocab_size"])
|
2261
|
+
self.gguf_writer.add_features_length (self.hparams["n_embd_features"])
|
2262
|
+
self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
|
2263
|
+
self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"])
|
2264
|
+
self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"])
|
2265
|
+
|
2266
|
+
self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
|
2267
|
+
self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"])
|
2268
|
+
|
2269
|
+
self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
|
2270
|
+
self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"])
|
2271
|
+
|
2272
|
+
self.gguf_writer.add_causal_attention(False)
|
2273
|
+
|
1869
2274
|
|
1870
2275
|
@Model.register("Qwen2MoeForCausalLM")
|
1871
2276
|
class Qwen2MoeModel(Model):
|
@@ -1995,6 +2400,15 @@ class Phi3MiniModel(Model):
     model_arch = gguf.MODEL_ARCH.PHI3
 
     def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2111,7 +2525,18 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
-        self.
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2142,15 +2567,72 @@ class Phi3MiniModel(Model):
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
             raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
 
-        self.
-        self.
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 
 
-@Model.register("
-class
-    model_arch = gguf.MODEL_ARCH.
+@Model.register("PhiMoEForCausalLM")
+class PhiMoeModel(Phi3MiniModel):
+    model_arch = gguf.MODEL_ARCH.PHIMOE
 
-
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("PlamoForCausalLM")
+class PlamoModel(Model):
+    model_arch = gguf.MODEL_ARCH.PLAMO
+
+    def set_vocab(self):
         self._set_vocab_sentencepiece()
 
     def set_gguf_parameters(self):
@@ -2353,7 +2835,7 @@ class InternLM2Model(Model):
         if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
-            # https://github.com/
+            # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
             special_vocab.special_token_ids["eos"] = chat_eos_token_id
             logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
                            " in chat mode so that the conversation can end normally.")
@@ -2403,7 +2885,67 @@ class InternLM2Model(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("
+@Model.register("InternLM3ForCausalLM")
+class InternLM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
+                        if token_data.get("special"):
+                            token_id = int(token_id)
+                            token = token_data["content"]
+                            special_vocab._set_special_token(token, token_id)
+                            # update eos token
+                            if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
+                                special_vocab.special_token_ids["eos"] = token_id
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2444,7 +2986,8 @@ class BertModel(Model):
|
|
2444
2986
|
|
2445
2987
|
# we need this to validate the size of the token_type embeddings
|
2446
2988
|
# though currently we are passing all zeros to the token_type embeddings
|
2447
|
-
|
2989
|
+
# "Sequence A" or "Sequence B"
|
2990
|
+
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
2448
2991
|
|
2449
2992
|
# convert to phantom space vocab
|
2450
2993
|
def phantom(tok):
|
@@ -2468,13 +3011,73 @@ class BertModel(Model):
|
|
2468
3011
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2469
3012
|
del bid # unused
|
2470
3013
|
|
3014
|
+
if name.startswith("bert."):
|
3015
|
+
name = name[5:]
|
3016
|
+
|
3017
|
+
if name.endswith(".gamma"):
|
3018
|
+
name = name[:-6] + ".weight"
|
3019
|
+
|
3020
|
+
if name.endswith(".beta"):
|
3021
|
+
name = name[:-5] + ".bias"
|
3022
|
+
|
2471
3023
|
# we are only using BERT for embeddings so we don't need the pooling layer
|
2472
3024
|
if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
|
2473
3025
|
return [] # we don't need these
|
2474
3026
|
|
3027
|
+
if name.startswith("cls.predictions"):
|
3028
|
+
return []
|
3029
|
+
|
3030
|
+
if name.startswith("cls.seq_relationship"):
|
3031
|
+
return []
|
3032
|
+
|
2475
3033
|
return [(self.map_tensor_name(name), data_torch)]
|
2476
3034
|
|
2477
3035
|
|
3036
|
+
@Model.register("RobertaModel")
|
3037
|
+
class RobertaModel(BertModel):
|
3038
|
+
model_arch = gguf.MODEL_ARCH.BERT
|
3039
|
+
|
3040
|
+
def __init__(self, *args, **kwargs):
|
3041
|
+
super().__init__(*args, **kwargs)
|
3042
|
+
|
3043
|
+
# we need the pad_token_id to know how to chop down position_embd matrix
|
3044
|
+
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
|
3045
|
+
self._position_offset = 1 + pad_token_id
|
3046
|
+
if "max_position_embeddings" in self.hparams:
|
3047
|
+
self.hparams["max_position_embeddings"] -= self._position_offset
|
3048
|
+
else:
|
3049
|
+
self._position_offset = None
|
3050
|
+
|
3051
|
+
def set_vocab(self):
|
3052
|
+
"""Support BPE tokenizers for roberta models"""
|
3053
|
+
bpe_tok_path = self.dir_model / "tokenizer.json"
|
3054
|
+
if bpe_tok_path.exists():
|
3055
|
+
self._set_vocab_gpt2()
|
3056
|
+
self.gguf_writer.add_add_bos_token(True)
|
3057
|
+
self.gguf_writer.add_add_eos_token(True)
|
3058
|
+
|
3059
|
+
# we need this to validate the size of the token_type embeddings
|
3060
|
+
# though currently we are passing all zeros to the token_type embeddings
|
3061
|
+
# "Sequence A" or "Sequence B"
|
3062
|
+
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
3063
|
+
|
3064
|
+
else:
|
3065
|
+
return super().set_vocab()
|
3066
|
+
|
3067
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3068
|
+
# if name starts with "roberta.", remove the prefix
|
3069
|
+
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
|
3070
|
+
if name.startswith("roberta."):
|
3071
|
+
name = name[8:]
|
3072
|
+
|
3073
|
+
# position embeddings start at pad_token_id + 1, so just chop down the weight tensor
|
3074
|
+
if name == "embeddings.position_embeddings.weight":
|
3075
|
+
if self._position_offset is not None:
|
3076
|
+
data_torch = data_torch[self._position_offset:,:]
|
3077
|
+
|
3078
|
+
return super().modify_tensors(data_torch, name, bid)
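
RobertaModel (and XLMRobertaModel below) set `_position_offset = 1 + pad_token_id` because RoBERTa-style checkpoints reserve the first `pad_token_id + 1` rows of the position-embedding table, then slice those rows off so position 0 lands on row 0 of the GGUF tensor. A tiny sketch with typical RoBERTa values (sizes here are illustrative, not read from a real config):

```python
import torch

pad_token_id = 1                    # common RoBERTa value in config.json
position_offset = 1 + pad_token_id
max_position_embeddings = 514       # as stored in the HF config

position_embd = torch.randn(max_position_embeddings, 768)
# chop off the reserved rows, matching the slice in modify_tensors above
position_embd = position_embd[position_offset:, :]
print(position_embd.shape)          # torch.Size([512, 768])
```
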
|
3079
|
+
|
3080
|
+
|
2478
3081
|
@Model.register("NomicBertModel")
|
2479
3082
|
class NomicBertModel(BertModel):
|
2480
3083
|
model_arch = gguf.MODEL_ARCH.NOMIC_BERT
|
@@ -2505,6 +3108,117 @@ class NomicBertModel(BertModel):
|
|
2505
3108
|
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
2506
3109
|
|
2507
3110
|
|
3111
|
+
@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
3112
|
+
class XLMRobertaModel(BertModel):
|
3113
|
+
model_arch = gguf.MODEL_ARCH.BERT
|
3114
|
+
|
3115
|
+
def __init__(self, *args, **kwargs):
|
3116
|
+
super().__init__(*args, **kwargs)
|
3117
|
+
|
3118
|
+
# we need the pad_token_id to know how to chop down position_embd matrix
|
3119
|
+
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
|
3120
|
+
self._position_offset = 1 + pad_token_id
|
3121
|
+
if "max_position_embeddings" in self.hparams:
|
3122
|
+
self.hparams["max_position_embeddings"] -= self._position_offset
|
3123
|
+
else:
|
3124
|
+
self._position_offset = None
|
3125
|
+
|
3126
|
+
def set_vocab(self):
|
3127
|
+
# to avoid TypeError: Descriptors cannot be created directly
|
3128
|
+
# exception when importing sentencepiece_model_pb2
|
3129
|
+
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
3130
|
+
from sentencepiece import SentencePieceProcessor
|
3131
|
+
from sentencepiece import sentencepiece_model_pb2 as model
|
3132
|
+
|
3133
|
+
tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
|
3134
|
+
if not tokenizer_path.is_file():
|
3135
|
+
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
3136
|
+
|
3137
|
+
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
3138
|
+
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
3139
|
+
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
3140
|
+
|
3141
|
+
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
3142
|
+
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
|
3143
|
+
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
|
3144
|
+
|
3145
|
+
tokenizer = SentencePieceProcessor()
|
3146
|
+
tokenizer.LoadFromFile(str(tokenizer_path))
|
3147
|
+
|
3148
|
+
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
3149
|
+
|
3150
|
+
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
3151
|
+
scores: list[float] = [-10000.0] * vocab_size
|
3152
|
+
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
3153
|
+
|
3154
|
+
for token_id in range(tokenizer.vocab_size()):
|
3155
|
+
piece = tokenizer.IdToPiece(token_id)
|
3156
|
+
text = piece.encode("utf-8")
|
3157
|
+
score = tokenizer.GetScore(token_id)
|
3158
|
+
|
3159
|
+
toktype = SentencePieceTokenTypes.NORMAL
|
3160
|
+
if tokenizer.IsUnknown(token_id):
|
3161
|
+
toktype = SentencePieceTokenTypes.UNKNOWN
|
3162
|
+
elif tokenizer.IsControl(token_id):
|
3163
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
3164
|
+
elif tokenizer.IsUnused(token_id):
|
3165
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
3166
|
+
elif tokenizer.IsByte(token_id):
|
3167
|
+
toktype = SentencePieceTokenTypes.BYTE
|
3168
|
+
|
3169
|
+
tokens[token_id] = text
|
3170
|
+
scores[token_id] = score
|
3171
|
+
toktypes[token_id] = toktype
|
3172
|
+
|
3173
|
+
if vocab_size > len(tokens):
|
3174
|
+
pad_count = vocab_size - len(tokens)
|
3175
|
+
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
3176
|
+
for i in range(1, pad_count + 1):
|
3177
|
+
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
3178
|
+
scores.append(-1000.0)
|
3179
|
+
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
3180
|
+
|
3181
|
+
# realign tokens (see HF tokenizer code)
|
3182
|
+
tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
|
3183
|
+
scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
|
3184
|
+
toktypes = [
|
3185
|
+
SentencePieceTokenTypes.CONTROL,
|
3186
|
+
SentencePieceTokenTypes.CONTROL,
|
3187
|
+
SentencePieceTokenTypes.CONTROL,
|
3188
|
+
SentencePieceTokenTypes.UNKNOWN,
|
3189
|
+
] + toktypes[3:-1]
|
3190
|
+
|
3191
|
+
self.gguf_writer.add_tokenizer_model("t5")
|
3192
|
+
self.gguf_writer.add_tokenizer_pre("default")
|
3193
|
+
self.gguf_writer.add_token_list(tokens)
|
3194
|
+
self.gguf_writer.add_token_scores(scores)
|
3195
|
+
self.gguf_writer.add_token_types(toktypes)
|
3196
|
+
self.gguf_writer.add_add_space_prefix(add_prefix)
|
3197
|
+
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
3198
|
+
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
|
3199
|
+
if precompiled_charsmap:
|
3200
|
+
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
|
3201
|
+
|
3202
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
3203
|
+
special_vocab.add_to_gguf(self.gguf_writer)
|
3204
|
+
|
3205
|
+
self.gguf_writer.add_add_bos_token(True)
|
3206
|
+
self.gguf_writer.add_add_eos_token(True)
|
3207
|
+
|
3208
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3209
|
+
# if name starts with "roberta.", remove the prefix
|
3210
|
+
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
|
3211
|
+
if name.startswith("roberta."):
|
3212
|
+
name = name[8:]
|
3213
|
+
|
3214
|
+
# position embeddings start at pad_token_id + 1, so just chop down the weight tensor
|
3215
|
+
if name == "embeddings.position_embeddings.weight":
|
3216
|
+
if self._position_offset is not None:
|
3217
|
+
data_torch = data_torch[self._position_offset:,:]
|
3218
|
+
|
3219
|
+
return super().modify_tensors(data_torch, name, bid)
|
3220
|
+
|
3221
|
+
|
2508
3222
|
@Model.register("GemmaForCausalLM")
|
2509
3223
|
class GemmaModel(Model):
|
2510
3224
|
model_arch = gguf.MODEL_ARCH.GEMMA
|
@@ -2608,54 +3322,216 @@ class StarCoder2Model(Model):
|
|
2608
3322
|
model_arch = gguf.MODEL_ARCH.STARCODER2
|
2609
3323
|
|
2610
3324
|
|
2611
|
-
@Model.register("
|
2612
|
-
class
|
2613
|
-
model_arch = gguf.MODEL_ARCH.
|
3325
|
+
@Model.register("Rwkv6ForCausalLM")
|
3326
|
+
class Rwkv6Model(Model):
|
3327
|
+
model_arch = gguf.MODEL_ARCH.RWKV6
|
2614
3328
|
|
2615
3329
|
def set_vocab(self):
|
2616
|
-
|
2617
|
-
|
2618
|
-
|
2619
|
-
|
2620
|
-
|
2621
|
-
|
2622
|
-
self.
|
3330
|
+
assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
|
3331
|
+
vocab_size = self.hparams.get("vocab_size", 65536)
|
3332
|
+
|
3333
|
+
tokens: list[bytes] = ['<s>'.encode("utf-8")]
|
3334
|
+
toktypes: list[int] = [gguf.TokenType.CONTROL]
|
3335
|
+
|
3336
|
+
with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
|
3337
|
+
lines = f.readlines()
|
3338
|
+
for line in lines:
|
3339
|
+
parts = line.split(' ')
|
3340
|
+
assert len(parts) >= 3
|
3341
|
+
token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
|
3342
|
+
token = token.encode("utf-8") if isinstance(token, str) else token
|
3343
|
+
assert isinstance(token, bytes)
|
3344
|
+
assert len(token) == token_len
|
3345
|
+
token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
|
3346
|
+
tokens.append(token_text.encode("utf-8"))
|
3347
|
+
toktypes.append(gguf.TokenType.NORMAL)
|
3348
|
+
remainder = vocab_size - len(tokens)
|
3349
|
+
assert remainder >= 0
|
3350
|
+
for i in range(len(tokens), vocab_size):
|
3351
|
+
tokens.append(f"[PAD{i}]".encode("utf-8"))
|
3352
|
+
toktypes.append(gguf.TokenType.UNUSED)
|
2623
3353
|
|
2624
|
-
|
2625
|
-
|
2626
|
-
|
2627
|
-
|
2628
|
-
|
2629
|
-
|
2630
|
-
|
3354
|
+
self.gguf_writer.add_tokenizer_model("rwkv")
|
3355
|
+
self.gguf_writer.add_token_list(tokens)
|
3356
|
+
self.gguf_writer.add_token_types(toktypes)
|
3357
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
|
3358
|
+
special_vocab.chat_template = "rwkv-world"
|
3359
|
+
# hack: Add '\n\n' as the EOT token to make it chat normally
|
3360
|
+
special_vocab._set_special_token("eot", 261)
|
3361
|
+
special_vocab.add_to_gguf(self.gguf_writer)
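
Rwkv6Model.set_vocab reads `rwkv_vocab_v20230424.txt`, where each line carries a token id, a Python literal for the token, and its byte length; the literal is decoded with `ast.literal_eval` and the length column serves as a sanity check. A rough sketch of parsing one such line (the sample line is illustrative, not taken from the actual file):

```python
import ast

line = "261 '\\n\\n' 2"                            # hypothetical row: id, token literal, byte length
parts = line.split(' ')
token = ast.literal_eval(' '.join(parts[1:-1]))    # -> '\n\n'
token = token.encode("utf-8") if isinstance(token, str) else token
assert isinstance(token, bytes) and len(token) == int(parts[-1])
```

This also lines up with the `_set_special_token("eot", 261)` hack above, which makes the double newline terminate chat turns.
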
|
2631
3362
|
|
2632
3363
|
def set_gguf_parameters(self):
|
2633
|
-
|
2634
|
-
|
2635
|
-
|
2636
|
-
|
2637
|
-
|
2638
|
-
|
2639
|
-
|
2640
|
-
|
2641
|
-
|
2642
|
-
|
2643
|
-
|
2644
|
-
|
2645
|
-
|
2646
|
-
self.gguf_writer.
|
2647
|
-
self.gguf_writer.
|
2648
|
-
self.gguf_writer.
|
2649
|
-
self.gguf_writer.
|
2650
|
-
self.gguf_writer.
|
2651
|
-
self.gguf_writer.
|
2652
|
-
self.gguf_writer.add_ssm_inner_size(d_inner)
|
2653
|
-
self.gguf_writer.add_ssm_state_size(d_state)
|
2654
|
-
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
|
2655
|
-
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
3364
|
+
block_count = self.hparams["num_hidden_layers"]
|
3365
|
+
head_size = self.hparams["head_size"]
|
3366
|
+
hidden_size = self.hparams["hidden_size"]
|
3367
|
+
layer_norm_eps = self.hparams["layer_norm_epsilon"]
|
3368
|
+
rescale_every_n_layers = self.hparams["rescale_every"]
|
3369
|
+
intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
|
3370
|
+
time_mix_extra_dim = 64 if hidden_size == 4096 else 32
|
3371
|
+
time_decay_extra_dim = 128 if hidden_size == 4096 else 64
|
3372
|
+
|
3373
|
+
# RWKV isn't context limited
|
3374
|
+
self.gguf_writer.add_context_length(1048576)
|
3375
|
+
self.gguf_writer.add_embedding_length(hidden_size)
|
3376
|
+
self.gguf_writer.add_block_count(block_count)
|
3377
|
+
self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
|
3378
|
+
self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
|
3379
|
+
self.gguf_writer.add_wkv_head_size(head_size)
|
3380
|
+
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
|
3381
|
+
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
|
3382
|
+
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
2656
3383
|
self.gguf_writer.add_file_type(self.ftype)
|
2657
3384
|
|
2658
|
-
|
3385
|
+
# required by llama.cpp, unused
|
3386
|
+
self.gguf_writer.add_head_count(0)
|
3387
|
+
|
3388
|
+
lerp_weights: dict[int, dict[str, Tensor]] = {}
|
3389
|
+
|
3390
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3391
|
+
new_name = self.map_tensor_name(name)
|
3392
|
+
|
3393
|
+
if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
|
3394
|
+
new_name += ".weight"
|
3395
|
+
|
3396
|
+
if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
|
3397
|
+
data_torch = data_torch.transpose(0, 1)
|
3398
|
+
|
3399
|
+
if new_name.endswith("time_mix_w2.weight"):
|
3400
|
+
data_torch = data_torch.permute(0, 2, 1)
|
3401
|
+
|
3402
|
+
if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
|
3403
|
+
data_torch = data_torch.squeeze()
|
3404
|
+
|
3405
|
+
try:
|
3406
|
+
rescale_every_n_layers = self.hparams["rescale_every"]
|
3407
|
+
if rescale_every_n_layers > 0:
|
3408
|
+
if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
|
3409
|
+
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
|
3410
|
+
except KeyError:
|
3411
|
+
pass
|
3412
|
+
|
3413
|
+
# concat time_mix_lerp weights to reduce some cpu overhead
|
3414
|
+
# also reduces the number of tensors in the model
|
3415
|
+
if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
|
3416
|
+
try:
|
3417
|
+
self.lerp_weights[bid][new_name] = data_torch
|
3418
|
+
except KeyError:
|
3419
|
+
self.lerp_weights[bid] = {new_name: data_torch}
|
3420
|
+
if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
|
3421
|
+
new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
|
3422
|
+
data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
|
3423
|
+
yield (new_name, data)
|
3424
|
+
return
|
3425
|
+
|
3426
|
+
yield (new_name, data_torch)
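
The time_mix_lerp fusion above buffers the five per-block lerp tensors and, once all of `w/k/v/r/g` have been collected, stacks them into a single `time_mix_lerp_fused` tensor to cut the tensor count and CPU overhead at load time. A shape-only sketch of that stacking (the hidden size is made up):

```python
import torch

hidden_size = 16
# after the .squeeze() above, each lerp weight is a 1-D tensor of length hidden_size
lerp = {k: torch.randn(hidden_size) for k in ["w", "k", "v", "r", "g"]}

fused = torch.stack([lerp[k].unsqueeze(0) for k in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
print(fused.shape)  # torch.Size([5, 1, 1, 16])
```
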
|
3427
|
+
|
3428
|
+
|
3429
|
+
@Model.register("RWKV6Qwen2ForCausalLM")
|
3430
|
+
class RWKV6Qwen2Model(Rwkv6Model):
|
3431
|
+
model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
|
3432
|
+
|
3433
|
+
def set_vocab(self):
|
3434
|
+
try:
|
3435
|
+
self._set_vocab_sentencepiece()
|
3436
|
+
except FileNotFoundError:
|
3437
|
+
self._set_vocab_gpt2()
|
3438
|
+
|
3439
|
+
def set_gguf_parameters(self):
|
3440
|
+
block_count = self.hparams["num_hidden_layers"]
|
3441
|
+
num_attention_heads = self.hparams["num_attention_heads"]
|
3442
|
+
num_key_value_heads = self.hparams["num_key_value_heads"]
|
3443
|
+
hidden_size = self.hparams["hidden_size"]
|
3444
|
+
head_size = hidden_size // num_attention_heads
|
3445
|
+
rms_norm_eps = self.hparams["rms_norm_eps"]
|
3446
|
+
intermediate_size = self.hparams["intermediate_size"]
|
3447
|
+
time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
|
3448
|
+
time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
|
3449
|
+
|
3450
|
+
# RWKV isn't context limited
|
3451
|
+
self.gguf_writer.add_context_length(1048576)
|
3452
|
+
self.gguf_writer.add_embedding_length(hidden_size)
|
3453
|
+
self.gguf_writer.add_block_count(block_count)
|
3454
|
+
self.gguf_writer.add_wkv_head_size(head_size)
|
3455
|
+
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
|
3456
|
+
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
|
3457
|
+
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
3458
|
+
self.gguf_writer.add_file_type(self.ftype)
|
3459
|
+
|
3460
|
+
# special parameters for time_mixing in RWKV6QWEN2
|
3461
|
+
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
3462
|
+
self.gguf_writer.add_token_shift_count(1)
|
3463
|
+
# RWKV6QWEN2 uses grouped key/value heads like GQA

|
3464
|
+
self.gguf_writer.add_head_count_kv(num_key_value_heads)
|
3465
|
+
|
3466
|
+
# required by llama.cpp, unused
|
3467
|
+
self.gguf_writer.add_head_count(0)
|
3468
|
+
|
3469
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3470
|
+
for new_name, data in super().modify_tensors(data_torch, name, bid):
|
3471
|
+
if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
|
3472
|
+
data = data.view(5, -1, data.shape[-1])
|
3473
|
+
# rwkv6qwen2 stores these projections in rkvwg order instead of the original wkvrg order
|
3474
|
+
# permute them here to avoid code changes
|
3475
|
+
data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
|
3476
|
+
if "w2" in new_name:
|
3477
|
+
data = data.view(5, -1, data.shape[-1])
|
3478
|
+
yield (new_name, data)
|
3479
|
+
continue
|
3480
|
+
yield (new_name, data)
|
3481
|
+
|
3482
|
+
|
3483
|
+
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
|
3484
|
+
class MambaModel(Model):
|
3485
|
+
model_arch = gguf.MODEL_ARCH.MAMBA
|
3486
|
+
|
3487
|
+
def set_vocab(self):
|
3488
|
+
vocab_size = self.hparams["vocab_size"]
|
3489
|
+
# Round vocab size to next multiple of 8
|
3490
|
+
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
|
3491
|
+
# pad using ceiling division
|
3492
|
+
# ref: https://stackoverflow.com/a/17511341/22827863
|
3493
|
+
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
|
3494
|
+
self.hparams["vocab_size"] = vocab_size
|
3495
|
+
|
3496
|
+
if (self.dir_model / "tokenizer.json").is_file():
|
3497
|
+
self._set_vocab_gpt2()
|
3498
|
+
elif (self.dir_model / "tokenizer.model").is_file():
|
3499
|
+
self._set_vocab_sentencepiece()
|
3500
|
+
else:
|
3501
|
+
# Use the GPT-NeoX tokenizer when no tokenizer files are present
|
3502
|
+
self._set_vocab_builtin("gpt-neox", vocab_size)
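
MambaModel pads the vocab size up to a multiple of `pad_vocab_size_multiple` with the negative-floor-division ceiling trick cited in the comment (ref: stackoverflow.com/a/17511341). A quick worked example:

```python
def pad_vocab(vocab_size: int, pad_to: int = 8) -> int:
    # -(a // -b) is ceil(a / b) for positive ints; scale back up to the multiple
    return -(vocab_size // -pad_to) * pad_to

assert pad_vocab(50277) == 50280   # rounded up to the next multiple of 8
assert pad_vocab(50280) == 50280   # already aligned, unchanged
```
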
|
3503
|
+
|
3504
|
+
def set_gguf_parameters(self):
|
3505
|
+
d_model = self.find_hparam(["hidden_size", "d_model"])
|
3506
|
+
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
|
3507
|
+
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
|
3508
|
+
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
|
3509
|
+
# ceiling division
|
3510
|
+
# ref: https://stackoverflow.com/a/17511341/22827863
|
3511
|
+
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
|
3512
|
+
dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
|
3513
|
+
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
|
3514
|
+
use_dt_b_c_norm = False
|
3515
|
+
# For falconmamba we do apply RMS norm on B / DT and C layers
|
3516
|
+
if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
|
3517
|
+
use_dt_b_c_norm = True
|
3518
|
+
# Fail early for models which don't have a block expansion factor of 2
|
3519
|
+
assert d_inner == 2 * d_model
|
3520
|
+
|
3521
|
+
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
|
3522
|
+
self.gguf_writer.add_embedding_length(d_model)
|
3523
|
+
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
3524
|
+
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
|
3525
|
+
self.gguf_writer.add_block_count(self.block_count)
|
3526
|
+
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
3527
|
+
self.gguf_writer.add_ssm_inner_size(d_inner)
|
3528
|
+
self.gguf_writer.add_ssm_state_size(d_state)
|
3529
|
+
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
|
3530
|
+
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
3531
|
+
self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
|
3532
|
+
self.gguf_writer.add_file_type(self.ftype)
|
3533
|
+
|
3534
|
+
_tok_embd = None
|
2659
3535
|
|
2660
3536
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2661
3537
|
del bid # unused
|
@@ -2679,19 +3555,6 @@ class MambaModel(Model):
|
|
2679
3555
|
|
2680
3556
|
return [(new_name, data_torch)]
|
2681
3557
|
|
2682
|
-
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
|
2683
|
-
del n_dims # unused
|
2684
|
-
|
2685
|
-
return bid is not None and new_name in (
|
2686
|
-
self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
|
2687
|
-
gguf.MODEL_TENSOR.SSM_CONV1D,
|
2688
|
-
gguf.MODEL_TENSOR.SSM_X,
|
2689
|
-
gguf.MODEL_TENSOR.SSM_DT,
|
2690
|
-
gguf.MODEL_TENSOR.SSM_A,
|
2691
|
-
gguf.MODEL_TENSOR.SSM_D,
|
2692
|
-
]
|
2693
|
-
)
|
2694
|
-
|
2695
3558
|
|
2696
3559
|
@Model.register("CohereForCausalLM")
|
2697
3560
|
class CommandR2Model(Model):
|
@@ -2711,6 +3574,24 @@ class CommandR2Model(Model):
|
|
2711
3574
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
2712
3575
|
|
2713
3576
|
|
3577
|
+
@Model.register("Cohere2ForCausalLM")
|
3578
|
+
class Cohere2Model(Model):
|
3579
|
+
model_arch = gguf.MODEL_ARCH.COHERE2
|
3580
|
+
|
3581
|
+
def set_gguf_parameters(self):
|
3582
|
+
super().set_gguf_parameters()
|
3583
|
+
|
3584
|
+
self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
|
3585
|
+
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
3586
|
+
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
3587
|
+
|
3588
|
+
rotary_pct = self.hparams["rotary_pct"]
|
3589
|
+
hidden_size = self.hparams["hidden_size"]
|
3590
|
+
num_attention_heads = self.hparams["num_attention_heads"]
|
3591
|
+
self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
|
3592
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
3593
|
+
|
3594
|
+
|
2714
3595
|
@Model.register("OlmoForCausalLM")
|
2715
3596
|
@Model.register("OLMoForCausalLM")
|
2716
3597
|
class OlmoModel(Model):
|
@@ -2739,6 +3620,71 @@ class OlmoModel(Model):
|
|
2739
3620
|
return [(self.map_tensor_name(name), data_torch)]
|
2740
3621
|
|
2741
3622
|
|
3623
|
+
@Model.register("Olmo2ForCausalLM")
|
3624
|
+
class Olmo2Model(Model):
|
3625
|
+
model_arch = gguf.MODEL_ARCH.OLMO2
|
3626
|
+
|
3627
|
+
|
3628
|
+
@Model.register("OlmoeForCausalLM")
|
3629
|
+
class OlmoeModel(Model):
|
3630
|
+
model_arch = gguf.MODEL_ARCH.OLMOE
|
3631
|
+
|
3632
|
+
def set_gguf_parameters(self):
|
3633
|
+
super().set_gguf_parameters()
|
3634
|
+
self.gguf_writer.add_layer_norm_rms_eps(1e-5)
|
3635
|
+
if (n_experts := self.hparams.get("num_experts")) is not None:
|
3636
|
+
self.gguf_writer.add_expert_count(n_experts)
|
3637
|
+
|
3638
|
+
_experts: list[dict[str, Tensor]] | None = None
|
3639
|
+
|
3640
|
+
# Copied from: Qwen2MoeModel
|
3641
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3642
|
+
# process the experts separately
|
3643
|
+
if name.find("experts") != -1:
|
3644
|
+
n_experts = self.hparams["num_experts"]
|
3645
|
+
assert bid is not None
|
3646
|
+
|
3647
|
+
if self._experts is None:
|
3648
|
+
self._experts = [{} for _ in range(self.block_count)]
|
3649
|
+
|
3650
|
+
self._experts[bid][name] = data_torch
|
3651
|
+
|
3652
|
+
if len(self._experts[bid]) >= n_experts * 3:
|
3653
|
+
tensors: list[tuple[str, Tensor]] = []
|
3654
|
+
|
3655
|
+
# merge the experts into a single 3d tensor
|
3656
|
+
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
3657
|
+
datas: list[Tensor] = []
|
3658
|
+
|
3659
|
+
for xid in range(n_experts):
|
3660
|
+
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
3661
|
+
datas.append(self._experts[bid][ename])
|
3662
|
+
del self._experts[bid][ename]
|
3663
|
+
|
3664
|
+
data_torch = torch.stack(datas, dim=0)
|
3665
|
+
|
3666
|
+
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
3667
|
+
|
3668
|
+
new_name = self.map_tensor_name(merged_name)
|
3669
|
+
|
3670
|
+
tensors.append((new_name, data_torch))
|
3671
|
+
return tensors
|
3672
|
+
else:
|
3673
|
+
return []
|
3674
|
+
|
3675
|
+
return [(self.map_tensor_name(name), data_torch)]
|
3676
|
+
|
3677
|
+
# Copied from: Qwen2MoeModel
|
3678
|
+
def prepare_tensors(self):
|
3679
|
+
super().prepare_tensors()
|
3680
|
+
|
3681
|
+
if self._experts is not None:
|
3682
|
+
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
3683
|
+
experts = [k for d in self._experts for k in d.keys()]
|
3684
|
+
if len(experts) > 0:
|
3685
|
+
raise ValueError(f"Unprocessed experts: {experts}")
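
OlmoeModel (like the Qwen2MoeModel it is copied from, and the Deepseek converters below) buffers each expert's `down_proj`/`gate_proj`/`up_proj` weights per block and only emits them once all `n_experts * 3` tensors for that block have arrived, stacking each projection into one 3-D tensor. A stripped-down sketch of that buffering pattern (tensor names and sizes are illustrative only):

```python
import torch

n_experts, bid = 4, 0
buffered: dict[str, torch.Tensor] = {}

# pretend the per-expert tensors arrive one at a time from the checkpoint reader
for xid in range(n_experts):
    buffered[f"model.layers.{bid}.mlp.experts.{xid}.up_proj.weight"] = torch.randn(32, 16)

# once every expert for this projection is present, merge into one 3-D tensor
merged = torch.stack(
    [buffered[f"model.layers.{bid}.mlp.experts.{xid}.up_proj.weight"] for xid in range(n_experts)],
    dim=0,
)
print(merged.shape)  # torch.Size([4, 32, 16]) -> written once per block and projection
```
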
|
3686
|
+
|
3687
|
+
|
2742
3688
|
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
|
2743
3689
|
class JinaBertV2Model(BertModel):
|
2744
3690
|
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
@@ -2777,6 +3723,14 @@ class JinaBertV2Model(BertModel):
|
|
2777
3723
|
self.gguf_writer.add_add_bos_token(True)
|
2778
3724
|
self.gguf_writer.add_add_eos_token(True)
|
2779
3725
|
|
3726
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3727
|
+
# if name starts with "bert.", remove the prefix
|
3728
|
+
# e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
3729
|
+
if name.startswith("bert."):
|
3730
|
+
name = name[5:]
|
3731
|
+
|
3732
|
+
return super().modify_tensors(data_torch, name, bid)
|
3733
|
+
|
2780
3734
|
|
2781
3735
|
@Model.register("OpenELMForCausalLM")
|
2782
3736
|
class OpenELMModel(Model):
|
@@ -3004,7 +3958,99 @@ class ArcticModel(Model):
|
|
3004
3958
|
raise ValueError(f"Unprocessed experts: {experts}")
|
3005
3959
|
|
3006
3960
|
|
3961
|
+
@Model.register("DeepseekForCausalLM")
|
3962
|
+
class DeepseekModel(Model):
|
3963
|
+
model_arch = gguf.MODEL_ARCH.DEEPSEEK
|
3964
|
+
|
3965
|
+
def set_vocab(self):
|
3966
|
+
try:
|
3967
|
+
self._set_vocab_sentencepiece()
|
3968
|
+
except FileNotFoundError:
|
3969
|
+
self._set_vocab_gpt2()
|
3970
|
+
|
3971
|
+
def set_gguf_parameters(self):
|
3972
|
+
super().set_gguf_parameters()
|
3973
|
+
hparams = self.hparams
|
3974
|
+
if "head_dim" in hparams:
|
3975
|
+
rope_dim = hparams["head_dim"]
|
3976
|
+
else:
|
3977
|
+
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
3978
|
+
|
3979
|
+
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
3980
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
3981
|
+
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
3982
|
+
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
3983
|
+
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
|
3984
|
+
self.gguf_writer.add_expert_weights_scale(1.0)
|
3985
|
+
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
|
3986
|
+
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
|
3987
|
+
|
3988
|
+
_experts: list[dict[str, Tensor]] | None = None
|
3989
|
+
|
3990
|
+
@staticmethod
|
3991
|
+
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
3992
|
+
if n_head_kv is not None and n_head != n_head_kv:
|
3993
|
+
n_head = n_head_kv
|
3994
|
+
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
3995
|
+
.swapaxes(1, 2)
|
3996
|
+
.reshape(weights.shape))
|
3997
|
+
|
3998
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3999
|
+
n_head = self.hparams["num_attention_heads"]
|
4000
|
+
n_kv_head = self.hparams.get("num_key_value_heads")
|
4001
|
+
|
4002
|
+
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
4003
|
+
data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
|
4004
|
+
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
4005
|
+
data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
|
4006
|
+
|
4007
|
+
# process the experts separately
|
4008
|
+
if name.find("mlp.experts") != -1:
|
4009
|
+
n_experts = self.hparams["n_routed_experts"]
|
4010
|
+
assert bid is not None
|
4011
|
+
|
4012
|
+
if self._experts is None:
|
4013
|
+
self._experts = [{} for _ in range(self.block_count)]
|
4014
|
+
|
4015
|
+
self._experts[bid][name] = data_torch
|
4016
|
+
|
4017
|
+
if len(self._experts[bid]) >= n_experts * 3:
|
4018
|
+
tensors: list[tuple[str, Tensor]] = []
|
4019
|
+
|
4020
|
+
# merge the experts into a single 3d tensor
|
4021
|
+
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
4022
|
+
datas: list[Tensor] = []
|
4023
|
+
|
4024
|
+
for xid in range(n_experts):
|
4025
|
+
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
4026
|
+
datas.append(self._experts[bid][ename])
|
4027
|
+
del self._experts[bid][ename]
|
4028
|
+
|
4029
|
+
data_torch = torch.stack(datas, dim=0)
|
4030
|
+
|
4031
|
+
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
4032
|
+
|
4033
|
+
new_name = self.map_tensor_name(merged_name)
|
4034
|
+
|
4035
|
+
tensors.append((new_name, data_torch))
|
4036
|
+
return tensors
|
4037
|
+
else:
|
4038
|
+
return []
|
4039
|
+
|
4040
|
+
return [(self.map_tensor_name(name), data_torch)]
|
4041
|
+
|
4042
|
+
def prepare_tensors(self):
|
4043
|
+
super().prepare_tensors()
|
4044
|
+
|
4045
|
+
if self._experts is not None:
|
4046
|
+
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
4047
|
+
experts = [k for d in self._experts for k in d.keys()]
|
4048
|
+
if len(experts) > 0:
|
4049
|
+
raise ValueError(f"Unprocessed experts: {experts}")
|
4050
|
+
|
4051
|
+
|
3007
4052
|
@Model.register("DeepseekV2ForCausalLM")
|
4053
|
+
@Model.register("DeepseekV3ForCausalLM")
|
3008
4054
|
class DeepseekV2Model(Model):
|
3009
4055
|
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
3010
4056
|
|
@@ -3026,69 +4072,228 @@ class DeepseekV2Model(Model):
|
|
3026
4072
|
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
|
3027
4073
|
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
|
3028
4074
|
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
|
4075
|
+
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
|
4076
|
+
|
4077
|
+
if hparams["scoring_func"] == "sigmoid":
|
4078
|
+
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
4079
|
+
elif hparams["scoring_func"] == "softmax":
|
4080
|
+
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
|
4081
|
+
else:
|
4082
|
+
raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
|
4083
|
+
|
3029
4084
|
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
3030
4085
|
|
3031
|
-
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
3032
|
-
if self.hparams["rope_scaling"].get("type") == "yarn":
|
3033
|
-
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
3034
|
-
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
3035
|
-
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
3036
|
-
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
|
4086
|
+
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
4087
|
+
if self.hparams["rope_scaling"].get("type") == "yarn":
|
4088
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
4089
|
+
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
4090
|
+
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
4091
|
+
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
|
4092
|
+
|
4093
|
+
_experts: list[dict[str, Tensor]] | None = None
|
4094
|
+
|
4095
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
4096
|
+
# rename e_score_correction_bias tensors
|
4097
|
+
if name.endswith("e_score_correction_bias"):
|
4098
|
+
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
4099
|
+
|
4100
|
+
# skip Multi-Token Prediction (MTP) layers
|
4101
|
+
block_count = self.hparams["num_hidden_layers"]
|
4102
|
+
match = re.match(r"model.layers.(\d+)", name)
|
4103
|
+
if match and int(match.group(1)) >= block_count:
|
4104
|
+
return []
|
4105
|
+
|
4106
|
+
# process the experts separately
|
4107
|
+
if name.find("mlp.experts") != -1:
|
4108
|
+
n_experts = self.hparams["n_routed_experts"]
|
4109
|
+
assert bid is not None
|
4110
|
+
|
4111
|
+
if self._experts is None:
|
4112
|
+
self._experts = [{} for _ in range(self.block_count)]
|
4113
|
+
|
4114
|
+
self._experts[bid][name] = data_torch
|
4115
|
+
|
4116
|
+
if len(self._experts[bid]) >= n_experts * 3:
|
4117
|
+
tensors: list[tuple[str, Tensor]] = []
|
4118
|
+
|
4119
|
+
# merge the experts into a single 3d tensor
|
4120
|
+
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
4121
|
+
datas: list[Tensor] = []
|
4122
|
+
|
4123
|
+
for xid in range(n_experts):
|
4124
|
+
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
4125
|
+
datas.append(self._experts[bid][ename])
|
4126
|
+
del self._experts[bid][ename]
|
4127
|
+
|
4128
|
+
data_torch = torch.stack(datas, dim=0)
|
4129
|
+
|
4130
|
+
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
4131
|
+
|
4132
|
+
new_name = self.map_tensor_name(merged_name)
|
4133
|
+
|
4134
|
+
tensors.append((new_name, data_torch))
|
4135
|
+
return tensors
|
4136
|
+
else:
|
4137
|
+
return []
|
4138
|
+
|
4139
|
+
return [(self.map_tensor_name(name), data_torch)]
|
4140
|
+
|
4141
|
+
def prepare_tensors(self):
|
4142
|
+
super().prepare_tensors()
|
4143
|
+
|
4144
|
+
if self._experts is not None:
|
4145
|
+
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
4146
|
+
experts = [k for d in self._experts for k in d.keys()]
|
4147
|
+
if len(experts) > 0:
|
4148
|
+
raise ValueError(f"Unprocessed experts: {experts}")
|
4149
|
+
|
4150
|
+
|
4151
|
+
@Model.register("T5WithLMHeadModel")
|
4152
|
+
@Model.register("T5ForConditionalGeneration")
|
4153
|
+
@Model.register("MT5ForConditionalGeneration")
|
4154
|
+
@Model.register("UMT5ForConditionalGeneration")
|
4155
|
+
class T5Model(Model):
|
4156
|
+
model_arch = gguf.MODEL_ARCH.T5
|
4157
|
+
|
4158
|
+
def __init__(self, *args, **kwargs):
|
4159
|
+
super().__init__(*args, **kwargs)
|
4160
|
+
self.shared_token_embeddings_found = False
|
4161
|
+
|
4162
|
+
def set_vocab(self):
|
4163
|
+
# to avoid TypeError: Descriptors cannot be created directly
|
4164
|
+
# exception when importing sentencepiece_model_pb2
|
4165
|
+
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
4166
|
+
from sentencepiece import SentencePieceProcessor
|
4167
|
+
from sentencepiece import sentencepiece_model_pb2 as model
|
4168
|
+
|
4169
|
+
tokenizer_path = self.dir_model / 'tokenizer.model'
|
4170
|
+
|
4171
|
+
# many older models use spiece.model tokenizer model filename
|
4172
|
+
if not tokenizer_path.is_file():
|
4173
|
+
tokenizer_path = self.dir_model / 'spiece.model'
|
4174
|
+
|
4175
|
+
if not tokenizer_path.is_file():
|
4176
|
+
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
4177
|
+
|
4178
|
+
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
4179
|
+
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
4180
|
+
|
4181
|
+
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
|
4182
|
+
if sentencepiece_model.trainer_spec.model_type == 2: # BPE
|
4183
|
+
# ensure the tokenizer model file name is correct
|
4184
|
+
assert tokenizer_path.name == 'tokenizer.model'
|
4185
|
+
return self._set_vocab_sentencepiece()
|
4186
|
+
else:
|
4187
|
+
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
4188
|
+
|
4189
|
+
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
4190
|
+
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
|
4191
|
+
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
|
4192
|
+
|
4193
|
+
tokenizer = SentencePieceProcessor()
|
4194
|
+
tokenizer.LoadFromFile(str(tokenizer_path))
|
4195
|
+
|
4196
|
+
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
4197
|
+
|
4198
|
+
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
4199
|
+
scores: list[float] = [-10000.0] * vocab_size
|
4200
|
+
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
3037
4201
|
|
3038
|
-
|
4202
|
+
for token_id in range(tokenizer.vocab_size()):
|
4203
|
+
piece = tokenizer.IdToPiece(token_id)
|
4204
|
+
text = piece.encode("utf-8")
|
4205
|
+
score = tokenizer.GetScore(token_id)
|
3039
4206
|
|
3040
|
-
|
3041
|
-
|
3042
|
-
|
3043
|
-
|
3044
|
-
|
4207
|
+
toktype = SentencePieceTokenTypes.NORMAL
|
4208
|
+
if tokenizer.IsUnknown(token_id):
|
4209
|
+
toktype = SentencePieceTokenTypes.UNKNOWN
|
4210
|
+
elif tokenizer.IsControl(token_id):
|
4211
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
4212
|
+
elif tokenizer.IsUnused(token_id):
|
4213
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
4214
|
+
elif tokenizer.IsByte(token_id):
|
4215
|
+
toktype = SentencePieceTokenTypes.BYTE
|
3045
4216
|
|
3046
|
-
|
3047
|
-
|
4217
|
+
tokens[token_id] = text
|
4218
|
+
scores[token_id] = score
|
4219
|
+
toktypes[token_id] = toktype
|
3048
4220
|
|
3049
|
-
|
4221
|
+
added_tokens_file = self.dir_model / 'added_tokens.json'
|
4222
|
+
if added_tokens_file.is_file():
|
4223
|
+
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
4224
|
+
added_tokens_json = json.load(f)
|
4225
|
+
for key in added_tokens_json:
|
4226
|
+
token_id = added_tokens_json[key]
|
4227
|
+
if token_id >= vocab_size:
|
4228
|
+
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
4229
|
+
continue
|
3050
4230
|
|
3051
|
-
|
3052
|
-
|
4231
|
+
tokens[token_id] = key.encode("utf-8")
|
4232
|
+
scores[token_id] = -1000.0
|
4233
|
+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
3053
4234
|
|
3054
|
-
|
3055
|
-
|
3056
|
-
|
4235
|
+
if vocab_size > len(tokens):
|
4236
|
+
pad_count = vocab_size - len(tokens)
|
4237
|
+
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
4238
|
+
for i in range(1, pad_count + 1):
|
4239
|
+
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
4240
|
+
scores.append(-1000.0)
|
4241
|
+
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
3057
4242
|
|
3058
|
-
|
3059
|
-
|
3060
|
-
|
3061
|
-
|
4243
|
+
self.gguf_writer.add_tokenizer_model("t5")
|
4244
|
+
self.gguf_writer.add_tokenizer_pre("default")
|
4245
|
+
self.gguf_writer.add_token_list(tokens)
|
4246
|
+
self.gguf_writer.add_token_scores(scores)
|
4247
|
+
self.gguf_writer.add_token_types(toktypes)
|
4248
|
+
self.gguf_writer.add_add_space_prefix(add_prefix)
|
4249
|
+
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
|
4250
|
+
if precompiled_charsmap:
|
4251
|
+
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
|
3062
4252
|
|
3063
|
-
|
4253
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
4254
|
+
special_vocab.add_to_gguf(self.gguf_writer)
|
3064
4255
|
|
3065
|
-
|
4256
|
+
self.gguf_writer.add_add_bos_token(False)
|
4257
|
+
self.gguf_writer.add_add_eos_token(True)
|
3066
4258
|
|
3067
|
-
|
4259
|
+
def set_gguf_parameters(self):
|
4260
|
+
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
4261
|
+
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
4262
|
+
n_ctx = 512
|
4263
|
+
self.gguf_writer.add_context_length(n_ctx)
|
4264
|
+
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
4265
|
+
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
|
4266
|
+
self.gguf_writer.add_block_count(self.hparams["num_layers"])
|
4267
|
+
self.gguf_writer.add_head_count(self.hparams["num_heads"])
|
4268
|
+
self.gguf_writer.add_key_length(self.hparams["d_kv"])
|
4269
|
+
self.gguf_writer.add_value_length(self.hparams["d_kv"])
|
4270
|
+
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
4271
|
+
self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
|
4272
|
+
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
4273
|
+
self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
|
4274
|
+
self.gguf_writer.add_file_type(self.ftype)
|
3068
4275
|
|
3069
|
-
|
3070
|
-
|
4276
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
4277
|
+
del bid # unused
|
4278
|
+
|
4279
|
+
# T5-based models contain a shared token embeddings tensor saved under varying names, as either "encoder.embed_tokens.weight",
|
4280
|
+
# "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
|
4281
|
+
# in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
|
4282
|
+
# and decoder and ignore the remaining ones.
|
4283
|
+
if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
|
4284
|
+
if not self.shared_token_embeddings_found:
|
4285
|
+
name = "shared.weight"
|
4286
|
+
self.shared_token_embeddings_found = True
|
3071
4287
|
else:
|
4288
|
+
logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
|
3072
4289
|
return []
|
3073
4290
|
|
3074
4291
|
return [(self.map_tensor_name(name), data_torch)]
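
T5Model keeps a `shared_token_embeddings_found` flag so that whichever of `shared.weight`, `encoder.embed_tokens.weight`, or `decoder.embed_tokens.weight` shows up first is written as the single shared embedding and the later duplicates are skipped. A minimal first-one-wins sketch of the same idea, standalone and outside the converter class:

```python
shared_names = {"shared.weight", "encoder.embed_tokens.weight", "decoder.embed_tokens.weight"}
shared_found = False
kept: list[str] = []

for name in ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight",
             "shared.weight", "encoder.block.0.layer.0.SelfAttention.q.weight"]:
    if name in shared_names:
        if shared_found:
            continue                          # later copies of the shared table are dropped
        name, shared_found = "shared.weight", True
    kept.append(name)

print(kept)  # ['shared.weight', 'encoder.block.0.layer.0.SelfAttention.q.weight']
```
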
|
3075
4292
|
|
3076
|
-
def prepare_tensors(self):
|
3077
|
-
super().prepare_tensors()
|
3078
|
-
|
3079
|
-
if self._experts is not None:
|
3080
|
-
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
3081
|
-
experts = [k for d in self._experts for k in d.keys()]
|
3082
|
-
if len(experts) > 0:
|
3083
|
-
raise ValueError(f"Unprocessed experts: {experts}")
|
3084
|
-
|
3085
4293
|
|
3086
|
-
@Model.register("
|
3087
|
-
|
3088
|
-
|
3089
|
-
@Model.register("UMT5ForConditionalGeneration")
|
3090
|
-
class T5Model(Model):
|
3091
|
-
model_arch = gguf.MODEL_ARCH.T5
|
4294
|
+
@Model.register("T5EncoderModel")
|
4295
|
+
class T5EncoderModel(Model):
|
4296
|
+
model_arch = gguf.MODEL_ARCH.T5ENCODER
|
3092
4297
|
|
3093
4298
|
def __init__(self, *args, **kwargs):
|
3094
4299
|
super().__init__(*args, **kwargs)
|
@@ -3205,7 +4410,6 @@ class T5Model(Model):
|
|
3205
4410
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
3206
4411
|
self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
|
3207
4412
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
3208
|
-
self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
|
3209
4413
|
self.gguf_writer.add_file_type(self.ftype)
|
3210
4414
|
|
3211
4415
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
@@ -3240,10 +4444,7 @@ class JaisModel(Model):
|
|
3240
4444
|
|
3241
4445
|
# Embeddings scale
|
3242
4446
|
self.embeddings_scale = 1.0
|
3243
|
-
# note: For some JAIS flavors, output is tied to (same as) wte in original model
|
3244
|
-
self.output_is_wte = False
|
3245
4447
|
if 'mup_embeddings_scale' in self.hparams:
|
3246
|
-
self.output_is_wte = True # Hack (?)
|
3247
4448
|
self.embeddings_scale = self.hparams['mup_embeddings_scale']
|
3248
4449
|
elif 'embeddings_scale' in self.hparams:
|
3249
4450
|
self.embeddings_scale = self.hparams['embeddings_scale']
|
@@ -3300,10 +4501,7 @@ class JaisModel(Model):
|
|
3300
4501
|
|
3301
4502
|
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
3302
4503
|
tensors.append((new_name, data_torch * self.embeddings_scale))
|
3303
|
-
if self.output_is_wte:
|
3304
|
-
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
|
3305
4504
|
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
|
3306
|
-
assert not self.output_is_wte
|
3307
4505
|
tensors.append((new_name, data_torch * self.width_scale))
|
3308
4506
|
else:
|
3309
4507
|
tensors.append((new_name, data_torch))
|
@@ -3315,7 +4513,7 @@ class JaisModel(Model):
|
|
3315
4513
|
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
|
3316
4514
|
|
3317
4515
|
|
3318
|
-
@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
|
4516
|
+
@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
|
3319
4517
|
class ChatGLMModel(Model):
|
3320
4518
|
model_arch = gguf.MODEL_ARCH.CHATGLM
|
3321
4519
|
|
@@ -3421,47 +4619,15 @@ class ChatGLMModel(Model):
|
|
3421
4619
|
|
3422
4620
|
from transformers import AutoTokenizer
|
3423
4621
|
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
3424
|
-
vocab_size = hparams
|
4622
|
+
vocab_size = hparams.get("padded_vocab_size", hparams["vocab_size"])
|
3425
4623
|
assert max(tokenizer.get_vocab().values()) < vocab_size
|
3426
4624
|
|
3427
|
-
tokpre = self.
|
3428
|
-
|
3429
|
-
merges = []
|
3430
|
-
vocab = {}
|
3431
|
-
mergeable_ranks = tokenizer.mergeable_ranks
|
3432
|
-
for token, rank in mergeable_ranks.items():
|
3433
|
-
vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
|
3434
|
-
if len(token) == 1:
|
3435
|
-
continue
|
3436
|
-
merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
|
3437
|
-
assert len(merged) >= 2 and len(merged) <= 7
|
3438
|
-
merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
|
3439
|
-
|
3440
|
-
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
3441
|
-
added_vocab = tokenizer.get_added_vocab()
|
3442
|
-
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
|
3443
|
-
|
3444
|
-
for i in range(vocab_size):
|
3445
|
-
if i not in reverse_vocab:
|
3446
|
-
tokens.append(f"[PAD{i}]")
|
3447
|
-
toktypes.append(gguf.TokenType.UNUSED)
|
3448
|
-
elif reverse_vocab[i] in added_vocab:
|
3449
|
-
tokens.append(reverse_vocab[i])
|
3450
|
-
if tokenizer.added_tokens_decoder[i].special:
|
3451
|
-
toktypes.append(gguf.TokenType.CONTROL)
|
3452
|
-
else:
|
3453
|
-
toktypes.append(gguf.TokenType.USER_DEFINED)
|
3454
|
-
else:
|
3455
|
-
tokens.append(reverse_vocab[i])
|
3456
|
-
toktypes.append(gguf.TokenType.NORMAL)
|
3457
|
-
|
4625
|
+
tokens, toktypes, tokpre = self.get_vocab_base()
|
3458
4626
|
self.gguf_writer.add_tokenizer_model("gpt2")
|
3459
4627
|
self.gguf_writer.add_tokenizer_pre(tokpre)
|
3460
4628
|
self.gguf_writer.add_token_list(tokens)
|
3461
4629
|
self.gguf_writer.add_token_types(toktypes)
|
3462
|
-
|
3463
|
-
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
|
3464
|
-
special_vocab.merges = merges
|
4630
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
3465
4631
|
# only add special tokens when they were not already loaded from config.json
|
3466
4632
|
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
|
3467
4633
|
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
|
@@ -3472,16 +4638,20 @@ class ChatGLMModel(Model):
|
|
3472
4638
|
def set_gguf_parameters(self):
|
3473
4639
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
3474
4640
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
3475
|
-
n_head_kv = self.hparams.get("multi_query_group_num", n_head)
|
4641
|
+
n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
|
3476
4642
|
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
|
3477
4643
|
self.gguf_writer.add_embedding_length(n_embed)
|
3478
|
-
self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
|
3479
|
-
self.gguf_writer.add_block_count(self.hparams
|
4644
|
+
self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
|
4645
|
+
self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
|
3480
4646
|
self.gguf_writer.add_head_count(n_head)
|
3481
4647
|
self.gguf_writer.add_head_count_kv(n_head_kv)
|
3482
|
-
self.gguf_writer.add_layer_norm_rms_eps(self.hparams
|
4648
|
+
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
|
3483
4649
|
self.gguf_writer.add_file_type(self.ftype)
|
3484
|
-
self.
|
4650
|
+
if "attention_dim" in self.hparams:
|
4651
|
+
rope_dim = self.hparams["attention_dim"]
|
4652
|
+
else:
|
4653
|
+
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
4654
|
+
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
3485
4655
|
self.gguf_writer.add_add_bos_token(False)
|
3486
4656
|
rope_freq = 10000
|
3487
4657
|
if "rope_ratio" in self.hparams:
|
@@ -3491,14 +4661,224 @@ class ChatGLMModel(Model):
|
|
3491
4661
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3492
4662
|
del bid # unused
|
3493
4663
|
|
3494
|
-
if name.endswith(".rotary_pos_emb.inv_freq"):
|
4664
|
+
if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
|
3495
4665
|
return []
|
3496
4666
|
|
3497
4667
|
name = name.removeprefix("transformer.")
|
3498
4668
|
return [(self.map_tensor_name(name), data_torch)]
|
3499
4669
|
|
3500
|
-
###### CONVERSION LOGIC ######
|
3501
4670
|
|
4671
|
+
@Model.register("NemotronForCausalLM")
|
4672
|
+
class NemotronModel(Model):
|
4673
|
+
model_arch = gguf.MODEL_ARCH.NEMOTRON
|
4674
|
+
|
4675
|
+
def set_vocab(self):
|
4676
|
+
self._set_vocab_sentencepiece()
|
4677
|
+
self.gguf_writer.add_pad_token_id(0)
|
4678
|
+
self.gguf_writer.add_unk_token_id(1)
|
4679
|
+
|
4680
|
+
def set_gguf_parameters(self):
|
4681
|
+
super().set_gguf_parameters()
|
4682
|
+
hparams = self.hparams
|
4683
|
+
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
4684
|
+
|
4685
|
+
f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
|
4686
|
+
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
|
4687
|
+
|
4688
|
+
# * Partial RoPE
|
4689
|
+
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
|
4690
|
+
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
4691
|
+
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
4692
|
+
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
4693
|
+
|
4694
|
+
# * RopeScaling for Nemotron
|
4695
|
+
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
|
4696
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
4697
|
+
else:
|
4698
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
4699
|
+
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
|
4700
|
+
|
4701
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
4702
|
+
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
|
4703
|
+
# model.layers.{l}.input_layernorm.weight
|
4704
|
+
# model.layers.{l}.post_attention_layernorm.weight
|
4705
|
+
# model.norm.weight
|
4706
|
+
if name.endswith("norm.weight"):
|
4707
|
+
data_torch = data_torch + 1
|
4708
|
+
|
4709
|
+
return [(self.map_tensor_name(name), data_torch)]
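
Nemotron's layernorm1p is handled entirely at conversion time: adding 1 to every LayerNorm weight lets a stock LayerNorm reproduce `x * (1 + w) + b` without touching the GGML engine. A tiny equivalence check (values are random, just to show the algebra):

```python
import torch

x = torch.randn(8)                  # normalized activations inside LayerNorm
w = torch.randn(8)                  # layernorm1p weight as stored in the checkpoint
b = torch.randn(8)

w_converted = w + 1                 # the shift applied in modify_tensors above
layernorm1p_out = x * (1 + w) + b   # what the original Nemotron layer computes
standard_out = x * w_converted + b  # what a stock LayerNorm computes with the shifted weight
assert torch.allclose(layernorm1p_out, standard_out)
```
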
|
4710
|
+
|
4711
|
+
|
4712
|
+
@Model.register("ExaoneForCausalLM")
|
4713
|
+
class ExaoneModel(Model):
|
4714
|
+
model_arch = gguf.MODEL_ARCH.EXAONE
|
4715
|
+
|
4716
|
+
def set_gguf_parameters(self):
|
4717
|
+
hparams = self.hparams
|
4718
|
+
|
4719
|
+
assert (hparams["activation_function"] == "silu")
|
4720
|
+
|
4721
|
+
max_position_embeddings = hparams["max_position_embeddings"]
|
4722
|
+
embed_dim = hparams["hidden_size"]
|
4723
|
+
num_heads = hparams["num_attention_heads"]
|
4724
|
+
num_kv_heads = hparams.get("num_key_value_heads", num_heads)
|
4725
|
+
layer_norm_eps = hparams["layer_norm_epsilon"]
|
4726
|
+
intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
|
4727
|
+
num_layers = hparams["num_layers"]
|
4728
|
+
# ignore for now as EXAONE-3.0-7.8B-Instruct attention_dropout is 0.0
|
4729
|
+
# attention_dropout_rate = hparams["attention_dropout"]
|
4730
|
+
# ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
|
4731
|
+
# embed_dropout_rate = hparams["embed_dropout"]
|
4732
|
+
self.gguf_writer.add_embedding_length(embed_dim)
|
4733
|
+
self.gguf_writer.add_head_count(num_heads)
|
4734
|
+
self.gguf_writer.add_head_count_kv(num_kv_heads)
|
4735
|
+
self.gguf_writer.add_context_length(max_position_embeddings)
|
4736
|
+
self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
|
4737
|
+
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
4738
|
+
self.gguf_writer.add_block_count(num_layers)
|
4739
|
+
self.gguf_writer.add_file_type(self.ftype)
|
4740
|
+
|
4741
|
+
if (rope_theta := self.hparams.get("rope_theta")) is not None:
|
4742
|
+
self.gguf_writer.add_rope_freq_base(rope_theta)
|
4743
|
+
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
|
4744
|
+
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
|
4745
|
+
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
4746
|
+
if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
|
4747
|
+
if hparams["rope_scaling"].get("type") == "linear":
|
4748
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
4749
|
+
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
4750
|
+
|
4751
|
+
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
4752
|
+
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
4753
|
+
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
4754
|
+
base = self.hparams.get("rope_theta", 10000.0)
|
4755
|
+
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
4756
|
+
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
4757
|
+
|
4758
|
+
factor = rope_scaling.get("factor", 8.0)
|
4759
|
+
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
|
4760
|
+
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
|
4761
|
+
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
4762
|
+
|
4763
|
+
low_freq_wavelen = old_context_len / low_freq_factor
|
4764
|
+
high_freq_wavelen = old_context_len / high_freq_factor
|
4765
|
+
assert low_freq_wavelen != high_freq_wavelen
|
4766
|
+
|
4767
|
+
rope_factors = []
|
4768
|
+
for freq in freqs:
|
4769
|
+
wavelen = 2 * math.pi / freq
|
4770
|
+
if wavelen < high_freq_wavelen:
|
4771
|
+
rope_factors.append(1)
|
4772
|
+
elif wavelen > low_freq_wavelen:
|
4773
|
+
rope_factors.append(factor)
|
4774
|
+
else:
|
4775
|
+
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
4776
|
+
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
|
4777
|
+
|
4778
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
4779
|
+
|
4780
|
+
|
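The generate_extra_tensors block above is the llama3-style rope-factor computation: any rotary frequency whose wavelength exceeds the original context window is stretched by `factor`, short wavelengths are left untouched, and the band in between is smoothly interpolated. A minimal standalone sketch of the same loop, using assumed toy values rather than a real EXAONE config:

```python
import math

import torch

# Assumed toy values; a real run reads these from the model's rope_scaling config.
base, dim = 10000.0, 8                    # rope_theta and rotary dimension
factor, low_ff, high_ff = 8.0, 1.0, 4.0   # rope_scaling factor and band limits
old_ctx = 8192                            # original_max_position_embeddings

freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
low_wavelen, high_wavelen = old_ctx / low_ff, old_ctx / high_ff

rope_factors = []
for freq in freqs.tolist():
    wavelen = 2 * math.pi / freq
    if wavelen < high_wavelen:        # high-frequency band: keep as-is
        rope_factors.append(1.0)
    elif wavelen > low_wavelen:       # low-frequency band: stretch by the full factor
        rope_factors.append(factor)
    else:                             # transition band: smooth interpolation
        smooth = (old_ctx / wavelen - low_ff) / (high_ff - low_ff)
        rope_factors.append(1 / ((1 - smooth) / factor + smooth))

print(torch.tensor(rope_factors, dtype=torch.float32))  # one factor per rotary dim pair
```

The resulting tensor is what the converter emits as the ROPE_FREQS extra tensor.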
+@Model.register("GraniteForCausalLM")
+class GraniteModel(LlamaModel):
+    """Conversion for IBM's GraniteForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def set_gguf_parameters(self):
+        """Granite uses standard llama parameters with the following differences:
+
+        - No head_dim support
+        - New multiplier params:
+            - attention_scale
+            - embedding_scale
+            - residual_scale
+            - logits_scaling
+        """
+        if head_dim := self.hparams.pop("head_dim", None):
+            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+        super().set_gguf_parameters()
+        # NOTE: Convert _multiplier params to _scale params for naming
+        # consistency
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
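Apart from dropping head_dim, the Granite converter's set_gguf_parameters is a conditional rename of the config's *_multiplier fields into *_scale GGUF metadata. A small sketch of that pattern with a stand-in writer; the hparam names come from the code above, while the values are made up for illustration:

```python
# Hypothetical config snippet; real values come from the checkpoint's config.json.
hparams = {
    "attention_multiplier": 0.0078125,
    "embedding_multiplier": 12.0,
    "residual_multiplier": 0.22,
    "logits_scaling": 16.0,
}

class StubWriter:
    """Stand-in for the GGUF writer, just to show which add_* calls would fire."""
    def __getattr__(self, name):
        return lambda value: print(f"{name}({value})")

writer = StubWriter()

# Same walrus-operator pattern as GraniteModel: only write keys that are present.
if attention_scale := hparams.get("attention_multiplier"):
    writer.add_attention_scale(attention_scale)
if embedding_scale := hparams.get("embedding_multiplier"):
    writer.add_embedding_scale(embedding_scale)
if residual_scale := hparams.get("residual_multiplier"):
    writer.add_residual_scale(residual_scale)
if logits_scale := hparams.get("logits_scaling"):
    writer.add_logit_scale(logits_scale)
```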
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
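As the docstring above explains, GraniteMoE checkpoints store the per-expert gate (w1) and up (w3) projections stacked in one block_sparse_moe.input_linear.weight tensor, and the converter slices that stack back apart so the existing mixtral-style expert tensors can be reused. A shape-level sketch with assumed toy dimensions:

```python
import torch

# Assumed toy sizes; a real model takes these from its config.
n_experts, ffn_dim, hidden_size = 4, 6, 5

# Merged layout produced by modeling_granitemoe: (n_experts, 2 * ffn_dim, hidden_size)
merged = torch.randn(n_experts, 2 * ffn_dim, hidden_size)
assert merged.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"

# First half along dim -2 is the gate (w1), second half the up projection (w3).
gate, up = merged[..., :ffn_dim, :], merged[..., ffn_dim:, :]
print(gate.shape)  # torch.Size([4, 6, 5]) -> FFN_GATE_EXP
print(up.shape)    # torch.Size([4, 6, 5]) -> FFN_UP_EXP
```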
+@Model.register("ChameleonForConditionalGeneration")
+@Model.register("ChameleonForCausalLM")  # obsolete
+class ChameleonModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHAMELEON
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # ignore image tokenizer for now
+        # TODO: remove this once image support is implemented for Chameleon
+        if name.startswith("model.vqmodel"):
+            return []
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        hidden_dim = self.hparams.get("hidden_size")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if name.endswith(("q_norm.weight", "q_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
+        if name.endswith(("k_norm.weight", "k_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
+    @staticmethod
+    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
+        head_dim = hidden_dim // n_heads
+        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
+        data_torch = data_torch.repeat_interleave(n_heads, 0)
+        return data_torch
+
+
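The _reverse_hf_permute helper undoes the half-split rotary layout that the linked HF conversion script applies to Chameleon's q/k norm weights, then broadcasts the first head's values across all heads. A toy check, assuming the norm weight arrives as one row of length head_dim per head (the layout and values here are illustrative only):

```python
import torch

def reverse_hf_permute(data_torch, n_heads, hidden_dim):
    # Same steps as ChameleonModel._reverse_hf_permute above.
    head_dim = hidden_dim // n_heads
    data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
    return data_torch.repeat_interleave(n_heads, 0)

# Assumed toy weight: per-head values stored half-split ([x0, x2 | x1, x3]).
n_heads, head_dim = 2, 4
w = torch.tensor([0., 2., 1., 3.]).repeat(n_heads, 1)  # shape (2, 4)

print(reverse_hf_permute(w, n_heads, hidden_dim=n_heads * head_dim))
# tensor([[0., 1., 2., 3.],
#         [0., 1., 2., 3.]])
```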
+###### CONVERSION LOGIC ######
 
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
@@ -3578,8 +4958,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
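With this change the converter exposes the ternary file types directly on the command line. A typical invocation might look like the following; the checkpoint path is a placeholder, and per the updated help text tq1_0/tq2_0 are intended for models with ternary weights (the output path defaults to one derived from the input, as the preceding help text notes):

```
python convert_hf_to_gguf.py /path/to/ternary-model --outtype tq1_0
```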
@@ -3588,6 +4968,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "model", type=Path,
         help="directory containing model file",
+        nargs="?",
     )
     parser.add_argument(
         "--use-temp-file", action="store_true",
@@ -3625,8 +5006,15 @@ def parse_args() -> argparse.Namespace:
         "--metadata", type=Path,
         help="Specify the path for an authorship metadata override file"
     )
+    parser.add_argument(
+        "--print-supported-models", action="store_true",
+        help="Print the supported models"
+    )
 
-    return parser.parse_args()
+    args = parser.parse_args()
+    if not args.print_supported_models and args.model is None:
+        parser.error("the following arguments are required: model")
+    return args
 
 
 def split_str_to_n_bytes(split_str: str) -> int:
@@ -3650,6 +5038,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
 def main() -> None:
     args = parse_args()
 
+    if args.print_supported_models:
+        logger.error("Supported models:")
+        Model.print_registered_models()
+        sys.exit(0)
+
     if args.verbose:
         logging.basicConfig(level=logging.DEBUG)
     else:
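Taken together with the nargs="?" change above, the positional model argument is now only enforced when a conversion is actually requested, so the architecture registry can be listed without pointing at a checkpoint:

```
python convert_hf_to_gguf.py --print-supported-models
```

Passing neither this flag nor a model path still fails with the usual "required: model" error raised via parser.error.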
@@ -3666,6 +5059,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
 