bigdl-core-cpp 2.5.0b20240827__py3-none-win_amd64.whl → 2.6.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert_hf_to_gguf.py +1196 -147
- bigdl/cpp/convert_hf_to_gguf_update.py +69 -42
- bigdl/cpp/convert_llama_ggml_to_gguf.py +0 -4
- bigdl/cpp/convert_lora_to_gguf.py +82 -14
- bigdl/cpp/gguf-py/gguf/constants.py +645 -187
- bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +92 -16
- bigdl/cpp/gguf-py/gguf/lazy.py +0 -1
- bigdl/cpp/gguf-py/gguf/metadata.py +131 -19
- bigdl/cpp/gguf-py/gguf/quants.py +81 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +249 -38
- bigdl/cpp/gguf-py/gguf/utility.py +1 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +24 -2
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/ggml-base.dll +0 -0
- bigdl/cpp/libs/ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ggml.dll +0 -0
- bigdl/cpp/libs/libc++.dll +0 -0
- bigdl/cpp/libs/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama-gemma3-cli.exe +0 -0
- bigdl/cpp/libs/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama-lookup.exe +0 -0
- bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama-server.exe +0 -0
- bigdl/cpp/libs/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
- bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
- bigdl/cpp/libs/ollama-lib.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/ollama_ggml.dll +0 -0
- bigdl/cpp/libs/ollama_llama.dll +0 -0
- bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
- bigdl_core_cpp-2.6.0.data/scripts/init-ollama.bat +16 -0
- {bigdl_core_cpp-2.5.0b20240827.dist-info → bigdl_core_cpp-2.6.0.dist-info}/METADATA +9 -5
- bigdl_core_cpp-2.6.0.dist-info/RECORD +57 -0
- {bigdl_core_cpp-2.5.0b20240827.dist-info → bigdl_core_cpp-2.6.0.dist-info}/WHEEL +1 -1
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl_core_cpp-2.5.0b20240827.data/scripts/init-ollama.bat +0 -19
- bigdl_core_cpp-2.5.0b20240827.dist-info/RECORD +0 -54
- {bigdl_core_cpp-2.5.0b20240827.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240827.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0b20240827.dist-info → bigdl_core_cpp-2.6.0.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_hf_to_gguf.py
CHANGED
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import ast
 import logging
 import argparse
 import contextlib
@@ -14,6 +15,7 @@ from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from itertools import chain
 
 import math
 import numpy as np
@@ -70,7 +72,8 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
@@ -85,7 +88,7 @@ class Model:
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-        self.hparams = Model.load_hparams(self.dir_model)
+        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
@@ -129,12 +132,14 @@ class Model:
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()
 
-        if
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
             self.tensor_names = set()
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            index_name += ".index.json"
             logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open(
+            with open(index_file, "r", encoding="utf-8") as f:
                 index: dict[str, Any] = json.load(f)
                 weight_map = index.get("weight_map")
                 if weight_map is None or not isinstance(weight_map, dict):
@@ -142,6 +147,7 @@ class Model:
                 self.tensor_names.update(weight_map.keys())
         else:
             self.tensor_names = tensor_names_from_parts
+            weight_map = {}
 
         for part_name in self.part_names:
             logger.info(f"gguf: loading model part '{part_name}'")
@@ -168,9 +174,17 @@ class Model:
                 data = LazyTorchTensor.from_eager(data)
             yield name, data
 
-        #
-        if len(
-
+        # verify tensor name presence and identify potentially missing files
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                 f"Missing tensors: {missing}\n"
+                                 f"Extra tensors: {extra}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -207,17 +221,17 @@ class Model:
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")
 
-        n_embd
-
-
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")
 
         if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")
 
-        n_head
-
-
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")
 
         if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
             self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -256,10 +270,14 @@ class Model:
 
         return False
 
+    # some models need extra generated tensors (like rope_freqs)
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        return ()
+
     def prepare_tensors(self):
        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
-        for name, data_torch in self.get_tensors():
+        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                continue
@@ -277,8 +295,15 @@ class Model:
                     bid = int(part)
                     break
 
-            for new_name,
-
+            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                # TODO: why do we squeeze here?
+                # data = data_torch.squeeze().numpy()
+                data = data_torch.numpy()
+
+                # if data ends up empty, it means data_torch was a scalar tensor -> restore
+                if len(data.shape) == 0:
+                    data = data_torch.numpy()
+
                 n_dims = len(data.shape)
                 data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
@@ -296,12 +321,34 @@ class Model:
                             gguf.MODEL_TENSOR.POS_EMBD,
                             gguf.MODEL_TENSOR.TOKEN_TYPES,
                             gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+                            gguf.MODEL_TENSOR.POSNET_NORM1,
+                            gguf.MODEL_TENSOR.POSNET_NORM2,
                         )
                     )
-                    or not
+                    or not new_name.endswith(".weight")
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
                     if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -312,6 +359,10 @@ class Model:
                         data_qtype = gguf.GGMLQuantizationType.BF16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
                     else:
                         raise ValueError(f"Unknown file type: {self.ftype.name}")
 
@@ -427,6 +478,11 @@ class Model:
             return modelcls
         return func
 
+    @classmethod
+    def print_registered_models(cls):
+        for name in sorted(cls._model_classes.keys()):
+            logger.error(f"- {name}")
+
     @classmethod
     def from_model_architecture(cls, arch: str) -> type[Model]:
         try:
@@ -479,9 +535,19 @@ class Model:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                         token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
@@ -492,7 +558,7 @@ class Model:
 
     # NOTE: this function is generated by convert_hf_to_gguf_update.py
     # do not modify it manually!
-    # ref: https://github.com/
+    # ref: https://github.com/ggml-org/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -525,9 +591,15 @@ class Model:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+            res = "bert-bge-large"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/mosaicml/mpt-7b
             res = "mpt"
@@ -555,6 +627,9 @@ class Model:
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
             # ref: https://huggingface.co/databricks/dbrx-base
             res = "dbrx"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+            res = "jina-v1-en"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -573,7 +648,7 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
@@ -600,6 +675,30 @@ class Model:
         if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
             res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
+        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+            # ref: https://huggingface.co/facebook/chameleon-7b
+            res = "chameleon"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
+        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+            res = "roberta-bpe"
+        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+            res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"
+        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+            res = "deepseek-r1-qwen"
 
         if res is None:
             logger.warning("\n")
@@ -609,7 +708,7 @@ class Model:
             logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("** - the pre-tokenization config has changed upstream")
             logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref: https://github.com/
+            logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh: {chkhsh}")
             logger.warning("**************************************************************************************")
@@ -622,6 +721,9 @@ class Model:
         return res
     # Marker: End get_vocab_base_pre
 
+    def _set_vocab_none(self) -> None:
+        self.gguf_writer.add_tokenizer_model("none")
+
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
@@ -1458,7 +1560,7 @@ class StableLMModel(Model):
             raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
@@ -1484,6 +1586,17 @@ class LlamaModel(Model):
             special_vocab._set_special_token("eot", 32010)
             special_vocab.add_to_gguf(self.gguf_writer)
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -1500,17 +1613,6 @@ class LlamaModel(Model):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                if "add_prefix_space" in tokenizer_config_json:
-                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
-        # Apply to granite small models only
-        if self.hparams.get("vocab_size", 32000) == 49152:
-            self.gguf_writer.add_add_bos_token(False)
-
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1566,11 +1668,11 @@ class LlamaModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -1593,8 +1695,9 @@ class LlamaModel(Model):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
+    def prepare_tensors(self):
         super().prepare_tensors()
 
         if self._experts is not None:
@@ -1604,6 +1707,178 @@ class LlamaModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention-layer***
+            # if n_heads_in_group is not None, then
+            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+            # _num_heads[il] is num_attention_head
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+                self.gguf_writer.add_rope_freq_base(rope_theta)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else: # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
@@ -1616,15 +1891,16 @@ class BitnetModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
-
-
-        scale
-
-
-
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # TODO: multiply by the scale directly instead of inverting it twice
+        # (this is also unnecessarily doubly inverted upstream)
+        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
@@ -1639,11 +1915,9 @@ class BitnetModel(Model):
             gguf.MODEL_TENSOR.FFN_GATE,
         ]):
             # transform weight into 1/0/-1 (in fp32)
-
-
-
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")
@@ -1773,29 +2047,40 @@ class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
 
     def set_gguf_parameters(self):
-
-
-        self.gguf_writer.
-
-        self.
-        self.gguf_writer.
-
-        self.
-        self.gguf_writer.
-
+        super().set_gguf_parameters()
+        embedding_scale = float(self.hparams["scale_emb"])
+        self.gguf_writer.add_embedding_scale(embedding_scale)
+        logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+        residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+        self.gguf_writer.add_residual_scale(residual_scale)
+        logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+        self.gguf_writer.add_logit_scale(logit_scale)
+        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+        if self.hparams.get("rope_scaling") is not None:
+            if self.hparams["rope_scaling"].get("type") == "longrope":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+                logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 
-
-
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
 
-
-
-            n_head = n_kv_head
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
 
-
-
-
-            .
-
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -1805,13 +2090,66 @@ class MiniCPMModel(Model):
 
         # HF models permute some of the tensors, so we need to undo that
         if name.endswith(("q_proj.weight")):
-            data_torch =
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
        if name.endswith(("k_proj.weight")):
-            data_torch =
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("MiniCPM3ForCausalLM")
+class MiniCPM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.MINICPM3
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            rope_dims = self.hparams["qk_rope_head_dim"]
+
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
 @Model.register("QWenLMHeadModel")
 class QwenModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN
@@ -1864,6 +2202,75 @@ class Qwen2Model(Model):
         except FileNotFoundError:
             self._set_vocab_gpt2()
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
+@Model.register("Qwen2VLForConditionalGeneration")
+class Qwen2VLModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, data in super().get_tensors():
+            if name.startswith("visual."):
+                continue
+            yield name, data
+
+
+@Model.register("WavTokenizerDec")
+class WavTokenizerDecModel(Model):
+    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if \
+                name.endswith("codebook.cluster_size") or \
+                name.endswith("codebook.embed_avg") or \
+                name.endswith("codebook.inited"):
+            logger.debug(f"Skipping {name!r}")
+            return []
+
+        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size         (self.hparams["vocab_size"])
+        self.gguf_writer.add_features_length    (self.hparams["n_embd_features"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
+        self.gguf_writer.add_group_norm_eps     (self.hparams["group_norm_epsilon"])
+        self.gguf_writer.add_group_norm_groups  (self.hparams["group_norm_groups"])
+
+        self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+        self.gguf_writer.add_posnet_block_count     (self.hparams["posnet"]["n_layer"])
+
+        self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+        self.gguf_writer.add_convnext_block_count     (self.hparams["convnext"]["n_layer"])
+
+        self.gguf_writer.add_causal_attention(False)
+
 
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
@@ -1993,6 +2400,15 @@ class Phi3MiniModel(Model):
     model_arch = gguf.MODEL_ARCH.PHI3
 
     def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2109,7 +2525,18 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
-        self.
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2140,27 +2567,84 @@ class Phi3MiniModel(Model):
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
             raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
 
-        self.
-        self.
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 
 
-@Model.register("
-class
-    model_arch = gguf.MODEL_ARCH.
+@Model.register("PhiMoEForCausalLM")
+class PhiMoeModel(Phi3MiniModel):
+    model_arch = gguf.MODEL_ARCH.PHIMOE
 
-
-        self._set_vocab_sentencepiece()
+    _experts: list[dict[str, Tensor]] | None = None
 
     def set_gguf_parameters(self):
-
-
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
 
-
-
-
-
-
-
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("PlamoForCausalLM")
+class PlamoModel(Model):
+    model_arch = gguf.MODEL_ARCH.PLAMO
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_context_length(4096)  # not in config.json
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
         self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
 
@@ -2351,7 +2835,7 @@ class InternLM2Model(Model):
         if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
-            # https://github.com/
+            # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
             special_vocab.special_token_ids["eos"] = chat_eos_token_id
             logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
                            " in chat mode so that the conversation can end normally.")
@@ -2401,7 +2885,67 @@ class InternLM2Model(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("
+@Model.register("InternLM3ForCausalLM")
+class InternLM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
+                        if token_data.get("special"):
+                            token_id = int(token_id)
+                            token = token_data["content"]
+                            special_vocab._set_special_token(token, token_id)
+                            # update eos token
+                            if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
+                                special_vocab.special_token_ids["eos"] = token_id
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2442,7 +2986,8 @@ class BertModel(Model):
|
|
2442
2986
|
|
2443
2987
|
# we need this to validate the size of the token_type embeddings
|
2444
2988
|
# though currently we are passing all zeros to the token_type embeddings
|
2445
|
-
|
2989
|
+
# "Sequence A" or "Sequence B"
|
2990
|
+
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
2446
2991
|
|
2447
2992
|
# convert to phantom space vocab
|
2448
2993
|
def phantom(tok):
|
@@ -2466,13 +3011,73 @@ class BertModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
+
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return []  # we don't need these
 
+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT
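The position-embedding trimming that the new RobertaModel class performs is easy to sanity-check in isolation. The snippet below is only an illustration with made-up sizes (514 positions, 768 hidden), not part of the converter.

```python
# RoBERTa reserves positions 0..pad_token_id, so the converter drops the first
# pad_token_id + 1 rows of the position embedding and shrinks
# max_position_embeddings by the same offset.
import torch

pad_token_id = 1                     # typical RoBERTa value; hypothetical here
position_offset = 1 + pad_token_id
pos_embd = torch.randn(514, 768)     # made-up (max_position_embeddings, hidden_size)

trimmed = pos_embd[position_offset:, :]
print(trimmed.shape)                 # torch.Size([512, 768])
```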
@@ -2503,7 +3108,7 @@ class NomicBertModel(BertModel):
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
 
-@Model.register("XLMRobertaModel")
+@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2589,7 +3194,7 @@ class XLMRobertaModel(BertModel):
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
         self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(1)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
         self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
         if precompiled_charsmap:
             self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
@@ -2601,6 +3206,11 @@ class XLMRobertaModel(BertModel):
         self.gguf_writer.add_add_eos_token(True)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
         # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
         if name == "embeddings.position_embeddings.weight":
             if self._position_offset is not None:
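A minimal illustration of the prefix stripping added above, using a hypothetical bge-reranker-style tensor name:

```python
# Checkpoint tensors may be prefixed with "roberta."; the converter strips the
# prefix so the usual BERT tensor-name mapping applies.
name = "roberta.encoder.layer.0.attention.self.query.weight"  # hypothetical name
if name.startswith("roberta."):
    name = name[8:]                  # len("roberta.") == 8
print(name)                          # encoder.layer.0.attention.self.query.weight
```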
@@ -2712,6 +3322,164 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
+        special_vocab._set_special_token("eot", 261)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    lerp_weights: dict[int, dict[str, Tensor]] = {}
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
+            data_torch = data_torch.squeeze()
+
+        try:
+            rescale_every_n_layers = self.hparams["rescale_every"]
+            if rescale_every_n_layers > 0:
+                if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                    data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+        except KeyError:
+            pass
+
+        # concat time_mix_lerp weights to reduce some cpu overhead
+        # also reduces the number of tensors in the model
+        if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
+            try:
+                self.lerp_weights[bid][new_name] = data_torch
+            except KeyError:
+                self.lerp_weights[bid] = {new_name: data_torch}
+            if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
+                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
+                yield (new_name, data)
+            return
+
+        yield (new_name, data_torch)
+
+
+@Model.register("RWKV6Qwen2ForCausalLM")
+class RWKV6Qwen2Model(Rwkv6Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        num_key_value_heads = self.hparams["num_key_value_heads"]
+        hidden_size = self.hparams["hidden_size"]
+        head_size = hidden_size // num_attention_heads
+        rms_norm_eps = self.hparams["rms_norm_eps"]
+        intermediate_size = self.hparams["intermediate_size"]
+        time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # special parameters for time_mixing in RWKV6QWEN2
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_token_shift_count(1)
+        # RWKV6QWEN2 use grouped key/value like GQA
+        self.gguf_writer.add_head_count_kv(num_key_value_heads)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        for new_name, data in super().modify_tensors(data_torch, name, bid):
+            if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
+                data = data.view(5, -1, data.shape[-1])
+                # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
+                # permute them here to avoid code changes
+                data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
+                if "w2" in new_name:
+                    data = data.view(5, -1, data.shape[-1])
+                yield (new_name, data)
+                continue
+            yield (new_name, data)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
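The time_mix_lerp fusion above is easiest to see with toy shapes; this sketch assumes each squeezed lerp tensor is a 1-D vector of length hidden_size and is not taken from a real checkpoint.

```python
import torch

hidden_size = 8                                  # made-up size
parts = {k: torch.randn(hidden_size) for k in ["w", "k", "v", "r", "g"]}

# mirrors the torch.stack(...).unsqueeze(1) call used to build
# blk.{bid}.time_mix_lerp_fused.weight
fused = torch.stack([parts[k].unsqueeze(0) for k in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
print(fused.shape)                               # torch.Size([5, 1, 1, 8])
```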
@@ -2806,6 +3574,24 @@ class CommandR2Model(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 
 
+@Model.register("Cohere2ForCausalLM")
+class Cohere2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COHERE2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        rotary_pct = self.hparams["rotary_pct"]
+        hidden_size = self.hparams["hidden_size"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 @Model.register("OlmoForCausalLM")
 @Model.register("OLMoForCausalLM")
 class OlmoModel(Model):
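A worked example of the rope_dimension_count arithmetic above, with hypothetical Cohere2-style hyperparameters:

```python
rotary_pct = 0.5                 # assumed config values, not from a real model
hidden_size = 8192
num_attention_heads = 64

head_dim = hidden_size // num_attention_heads        # 128
rope_dimension_count = int(rotary_pct * head_dim)    # 64: only part of each head is rotary
print(head_dim, rope_dimension_count)
```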
@@ -2834,6 +3620,71 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2
+
+
+@Model.register("OlmoeForCausalLM")
+class OlmoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("JinaBertModel", "JinaBertForMaskedLM")
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
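The expert-merging loop above collapses n_experts separate 2-D weights into one 3-D tensor per projection; a toy version with made-up sizes:

```python
import torch

n_experts, ffn_dim, hidden = 4, 16, 8            # hypothetical sizes
per_expert = [torch.randn(ffn_dim, hidden) for _ in range(n_experts)]

merged = torch.stack(per_expert, dim=0)          # same torch.stack(datas, dim=0) as above
print(merged.shape)                              # torch.Size([4, 16, 8])
```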
@@ -2872,6 +3723,14 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "bert.", remove the prefix
+        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+        if name.startswith("bert."):
+            name = name[5:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
 
 @Model.register("OpenELMForCausalLM")
 class OpenELMModel(Model):
@@ -3099,7 +3958,99 @@ class ArcticModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeepseekForCausalLM")
+class DeepseekModel(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("DeepseekV2ForCausalLM")
+@Model.register("DeepseekV3ForCausalLM")
 class DeepseekV2Model(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
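The permute helper in DeepseekModel only reorders rows into the rotary layout llama.cpp expects; a standalone copy applied to a tiny made-up weight shows that the shape is preserved:

```python
import torch

def permute(weights: torch.Tensor, n_head: int, n_head_kv: int | None):
    # same reshape/swapaxes trick as DeepseekModel.permute above
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

w = torch.arange(8 * 4, dtype=torch.float32).reshape(8, 4)   # toy q_proj, 2 heads
print(permute(w, n_head=2, n_head_kv=2).shape)               # torch.Size([8, 4])
```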
@@ -3121,6 +4072,15 @@ class DeepseekV2Model(Model):
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
         self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -3133,6 +4093,16 @@ class DeepseekV2Model(Model):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        block_count = self.hparams["num_hidden_layers"]
+        match = re.match(r"model.layers.(\d+)", name)
+        if match and int(match.group(1)) >= block_count:
+            return []
+
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
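The regex filter added above drops DeepSeek-V3's extra multi-token-prediction layers, whose block index is greater than or equal to num_hidden_layers; a small illustration with hypothetical tensor names:

```python
import re

block_count = 61                                      # assumed num_hidden_layers
for name in ["model.layers.60.mlp.gate.weight",       # last real block: kept
             "model.layers.61.embed_tokens.weight"]:  # MTP layer: skipped
    match = re.match(r"model.layers.(\d+)", name)
    skipped = bool(match and int(match.group(1)) >= block_count)
    print(name, "skip" if skipped else "keep")
```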
@@ -3474,10 +4444,7 @@ class JaisModel(Model):
 
         # Embeddings scale
         self.embeddings_scale = 1.0
-        # note: For some JAIS flavors, output is tied to (same as) wte in original model
-        self.output_is_wte = False
         if 'mup_embeddings_scale' in self.hparams:
-            self.output_is_wte = True  # Hack (?)
             self.embeddings_scale = self.hparams['mup_embeddings_scale']
         elif 'embeddings_scale' in self.hparams:
             self.embeddings_scale = self.hparams['embeddings_scale']
@@ -3534,10 +4501,7 @@ class JaisModel(Model):
 
         if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
             tensors.append((new_name, data_torch * self.embeddings_scale))
-            if self.output_is_wte:
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
         elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
-            assert not self.output_is_wte
             tensors.append((new_name, data_torch * self.width_scale))
         else:
             tensors.append((new_name, data_torch))
@@ -3549,7 +4513,7 @@ class JaisModel(Model):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 
 
-@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(Model):
     model_arch = gguf.MODEL_ARCH.CHATGLM
 
@@ -3655,47 +4619,15 @@ class ChatGLMModel(Model):
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["padded_vocab_size"]
+        vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
         assert max(tokenizer.get_vocab().values()) < vocab_size
 
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) >= 2 and len(merged) <= 7
-            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
-
-        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.get_added_vocab()
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
+        tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         # only add special tokens when they were not already loaded from config.json
         special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
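The padded_vocab_size fallback added above can be exercised with two hypothetical config fragments (the numbers are illustrative only):

```python
for hparams in ({"padded_vocab_size": 151552, "vocab_size": 151329},  # GLM-4 style config
                {"vocab_size": 151552}):                              # plain GlmForCausalLM style
    vocab_size = hparams.get("padded_vocab_size", hparams["vocab_size"])
    print(vocab_size)
```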
@@ -3706,16 +4638,20 @@ class ChatGLMModel(Model):
     def set_gguf_parameters(self):
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
         self.gguf_writer.add_embedding_length(n_embed)
-        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
-        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
+        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_rope_dimension_count(64)
+        if "attention_dim" in self.hparams:
+            rope_dim = self.hparams["attention_dim"]
+        else:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
         self.gguf_writer.add_add_bos_token(False)
         rope_freq = 10000
         if "rope_ratio" in self.hparams:
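A worked example of the new rope-dimension computation, with made-up GLM-style hyperparameters:

```python
hparams = {"hidden_size": 4096, "num_attention_heads": 32, "partial_rotary_factor": 0.5}  # assumed values

rope_dim = hparams.get("attention_dim", hparams["hidden_size"] // hparams["num_attention_heads"])
rope_dimension_count = int(rope_dim * hparams.get("partial_rotary_factor", 0.5))
print(rope_dim, rope_dimension_count)   # 128 64
```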
@@ -3725,7 +4661,7 @@ class ChatGLMModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
-        if name.endswith(".rotary_pos_emb.inv_freq"):
+        if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
             return []
 
         name = name.removeprefix("transformer.")
@@ -3812,11 +4748,11 @@ class ExaoneModel(Model):
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
             self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
 
-    def prepare_tensors(self):
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
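generate_extra_tensors() emits one rope_freqs tensor built from the llama3 rope-scaling formula. The following self-contained sketch reproduces that computation with invented hyperparameters (base, dim, factors) purely for illustration.

```python
import math
import torch

base, dim = 500000.0, 128                                    # assumed values
factor, low_freq_factor, high_freq_factor, old_context_len = 8.0, 1.0, 4.0, 8192

freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor

rope_factors = []
for freq in freqs:
    wavelen = 2 * math.pi / freq
    if wavelen < high_freq_wavelen:
        rope_factors.append(1)
    elif wavelen > low_freq_wavelen:
        rope_factors.append(factor)
    else:
        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
        rope_factors.append(1 / ((1 - smooth) / factor + smooth))

print(len(rope_factors))   # dim // 2 == 64 per-frequency scaling factors
```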
@@ -3839,9 +4775,107 @@ class ExaoneModel(Model):
                         smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
+@Model.register("GraniteForCausalLM")
+class GraniteModel(LlamaModel):
+    """Conversion for IBM's GraniteForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def set_gguf_parameters(self):
+        """Granite uses standard llama parameters with the following differences:
+
+        - No head_dim support
+        - New multiplier params:
+            - attention_scale
+            - embedding_scale
+            - residual_scale
+            - logits_scaling
+        """
+        if head_dim := self.hparams.pop("head_dim", None):
+            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+        super().set_gguf_parameters()
+        # NOTE: Convert _multiplier params to _scale params for naming
+        # consistency
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
 
-        super().prepare_tensors()
+
+@Model.register("ChameleonForConditionalGeneration")
+@Model.register("ChameleonForCausalLM")  # obsolete
+class ChameleonModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHAMELEON
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # ignore image tokenizer for now
+        # TODO: remove this once image support is implemented for Chameleon
+        if name.startswith("model.vqmodel"):
+            return []
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        hidden_dim = self.hparams.get("hidden_size")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if name.endswith(("q_norm.weight", "q_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
+        if name.endswith(("k_norm.weight", "k_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
+    @staticmethod
+    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
+        head_dim = hidden_dim // n_heads
+        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
+        data_torch = data_torch.repeat_interleave(n_heads, 0)
+        return data_torch
 
 
 ###### CONVERSION LOGIC ######
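The GraniteMoe gate/up split above can be checked with a toy merged tensor; the shapes here are invented:

```python
import torch

n_experts, ffn_dim, hidden = 2, 6, 4                     # made-up sizes
merged = torch.randn(n_experts, 2 * ffn_dim, hidden)     # stand-in for block_sparse_moe.input_linear.weight

gate, up = merged[..., :ffn_dim, :], merged[..., ffn_dim:, :]
print(gate.shape, up.shape)   # torch.Size([2, 6, 4]) torch.Size([2, 6, 4])
```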
@@ -3924,8 +4958,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -3934,6 +4968,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "model", type=Path,
         help="directory containing model file",
+        nargs="?",
     )
     parser.add_argument(
         "--use-temp-file", action="store_true",
@@ -3971,8 +5006,15 @@ def parse_args() -> argparse.Namespace:
         "--metadata", type=Path,
         help="Specify the path for an authorship metadata override file"
     )
+    parser.add_argument(
+        "--print-supported-models", action="store_true",
+        help="Print the supported models"
+    )
 
-    return parser.parse_args()
+    args = parser.parse_args()
+    if not args.print_supported_models and args.model is None:
+        parser.error("the following arguments are required: model")
+    return args
 
 
 def split_str_to_n_bytes(split_str: str) -> int:
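Because --print-supported-models must work without a model path, the positional argument becomes optional and the requirement is enforced manually. A minimal argparse sketch of that pattern, separate from the converter itself:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("model", nargs="?")                       # now optional
parser.add_argument("--print-supported-models", action="store_true")

args = parser.parse_args(["--print-supported-models"])
if not args.print_supported_models and args.model is None:
    parser.error("the following arguments are required: model")
print(args)
```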
@@ -3996,6 +5038,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
 def main() -> None:
     args = parse_args()
 
+    if args.print_supported_models:
+        logger.error("Supported models:")
+        Model.print_registered_models()
+        sys.exit(0)
+
     if args.verbose:
         logging.basicConfig(level=logging.DEBUG)
     else:
@@ -4012,6 +5059,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
 