bigdl-core-cpp 2.1.0b20230202-py3-none-manylinux2010_x86_64.whl → 2.5.0-py3-none-manylinux2010_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert-hf-to-gguf.py +1169 -311
- bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
- bigdl/cpp/gguf-py/gguf/constants.py +463 -167
- bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +29 -8
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +475 -156
- bigdl/cpp/gguf-py/gguf/lazy.py +24 -49
- bigdl/cpp/gguf-py/gguf/metadata.py +503 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +209 -23
- bigdl/cpp/gguf-py/gguf/utility.py +69 -0
- bigdl/cpp/libs/baby-llama +0 -0
- bigdl/cpp/libs/batched +0 -0
- bigdl/cpp/libs/batched-bench +0 -0
- bigdl/cpp/libs/benchmark +0 -0
- bigdl/cpp/libs/embedding +0 -0
- bigdl/cpp/libs/gguf +0 -0
- bigdl/cpp/libs/imatrix +0 -0
- bigdl/cpp/libs/llama-bench +0 -0
- bigdl/cpp/libs/llava-cli +0 -0
- bigdl/cpp/libs/lookahead +0 -0
- bigdl/cpp/libs/lookup +0 -0
- bigdl/cpp/libs/ls-sycl-device +0 -0
- bigdl/cpp/libs/main +0 -0
- bigdl/cpp/libs/ollama +0 -0
- bigdl/cpp/libs/perplexity +0 -0
- bigdl/cpp/libs/quantize +0 -0
- bigdl/cpp/libs/quantize-stats +0 -0
- bigdl/cpp/libs/save-load-state +0 -0
- bigdl/cpp/libs/server +0 -0
- bigdl/cpp/libs/speculative +0 -0
- bigdl/cpp/libs/tokenize +0 -0
- {bigdl_core_cpp-2.1.0b20230202.dist-info → bigdl_core_cpp-2.5.0.dist-info}/METADATA +8 -8
- bigdl_core_cpp-2.5.0.dist-info/RECORD +45 -0
- {bigdl_core_cpp-2.1.0b20230202.dist-info → bigdl_core_cpp-2.5.0.dist-info}/WHEEL +1 -1
- bigdl/cpp/libs/export-lora +0 -0
- bigdl/cpp/libs/finetune +0 -0
- bigdl/cpp/libs/gritlm +0 -0
- bigdl/cpp/libs/infill +0 -0
- bigdl/cpp/libs/parallel +0 -0
- bigdl/cpp/libs/simple +0 -0
- bigdl/cpp/libs/train-text-from-scratch +0 -0
- bigdl_core_cpp-2.1.0b20230202.dist-info/RECORD +0 -50
- {bigdl_core_cpp-2.1.0b20230202.data → bigdl_core_cpp-2.5.0.data}/scripts/init-llama-cpp +0 -0
- {bigdl_core_cpp-2.1.0b20230202.data → bigdl_core_cpp-2.5.0.data}/scripts/init-ollama +0 -0
- {bigdl_core_cpp-2.1.0b20230202.dist-info → bigdl_core_cpp-2.5.0.dist-info}/top_level.txt +0 -0
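The converter refactor visible in the diff below defers the output path: the GGUFWriter is now constructed with path=None and the filename is only bound in write() via write_header_to_file(path=...), alongside new sharding options (split_max_tensors, split_max_size). A minimal sketch of that call pattern against the bundled gguf-py package follows; it assumes the package is importable as gguf and that NumPy is installed, and the architecture name and tensor name are illustrative placeholders, not values taken from this wheel.

# Sketch only: exercises the deferred-path GGUFWriter flow used by the updated
# convert-hf-to-gguf.py; "llama" and "token_embd.weight" are placeholder values.
from pathlib import Path

import numpy as np
import gguf

writer = gguf.GGUFWriter(path=None, arch="llama")      # no output path yet
writer.add_block_count(1)
writer.add_context_length(2048)
writer.add_tensor("token_embd.weight", np.zeros((8, 4), dtype=np.float32))

writer.write_header_to_file(path=Path("tiny.gguf"))    # path supplied at write time
writer.write_kv_data_to_file()
writer.write_tensors_to_file(progress=True)
writer.close()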
bigdl/cpp/convert-hf-to-gguf.py
CHANGED
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 
 from __future__ import annotations
 
@@ -12,7 +13,7 @@ import sys
 from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
 
 import math
 import numpy as np
@@ -25,10 +26,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-from convert import LlamaHfVocab
-
-logger = logging.getLogger("hf-to-gguf")
-
 logger = logging.getLogger("hf-to-gguf")
 
 
@@ -50,7 +47,8 @@ class Model:
     _model_classes: dict[str, type[Model]] = {}
 
     dir_model: Path
-    ftype:
+    ftype: gguf.LlamaFileType
+    fname_out: Path
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
@@ -61,29 +59,41 @@ class Model:
     block_count: int
     tensor_map: gguf.TensorNameMap
     tensor_names: set[str] | None
-    fname_out: Path
     gguf_writer: gguf.GGUFWriter
+    model_name: str | None
+    metadata_override: Path | None
+    dir_model_card: Path
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
+                 use_temp_file: bool = False, eager: bool = False,
+                 metadata_override: Path | None = None, model_name: str | None = None,
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
+
         self.dir_model = dir_model
         self.ftype = ftype
+        self.fname_out = fname_out
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
         self.lazy = not eager
-        self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+        self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
-            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        self.tensor_names = None
+        self.metadata_override = metadata_override
+        self.model_name = model_name
+        self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
+
+        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
             # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
             _, first_tensor = next(self.get_tensors())
@@ -93,11 +103,10 @@ class Model:
             else:
                 logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
                 self.ftype = gguf.LlamaFileType.MOSTLY_BF16
-
-
-
-
-        self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+
+        # Configure GGUF Writer
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
 
     @classmethod
     def __init_subclass__(cls):
@@ -147,9 +156,16 @@ class Model:
                 tensor_names_from_parts.update(model_part.keys())
 
                 for name in model_part.keys():
-
-
-
+                    if self.is_safetensors:
+                        if self.lazy:
+                            data = model_part.get_slice(name)
+                            data = LazyTorchTensor.from_safetensors_slice(data)
+                        else:
+                            data = model_part.get_tensor(name)
+                    else:
+                        data = model_part[name]
+                        if self.lazy:
+                            data = LazyTorchTensor.from_eager(data)
                     yield name, data
 
         # only verify tensor name presence; it doesn't matter if they are not in the right files
@@ -185,7 +201,6 @@ class Model:
         return new_name
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_block_count(self.block_count)
 
         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -224,6 +239,10 @@ class Model:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
 
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
+
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
 
@@ -242,7 +261,7 @@ class Model:
 
         return False
 
-    def
+    def prepare_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
         for name, data_torch in self.get_tensors():
@@ -264,7 +283,7 @@ class Model:
                     break
 
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray
+                data: np.ndarray # type hint
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
                 data_qtype: gguf.GGMLQuantizationType | None = None
@@ -325,23 +344,80 @@ class Model:
 
                 self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
 
+    def set_type(self):
+        self.gguf_writer.add_type(gguf.GGUFType.MODEL)
+
+    def prepare_metadata(self, vocab_only: bool):
+
+        total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
+
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
+
+        # Fallback to model directory name if metadata name is still missing
+        if self.metadata.name is None:
+            self.metadata.name = self.dir_model.name
+
+        # Generate parameter weight class (useful for leader boards) if not yet determined
+        if self.metadata.size_label is None and total_params > 0:
+            self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
+
+        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
+        output_type: str = self.ftype.name.partition("_")[2]
+
+        # Filename Output
+        if self.fname_out.is_dir():
+            # Generate default filename based on model specification and available metadata
+            if not vocab_only:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
+            else:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
+
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            # file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+
+        self.set_type()
+
+        logger.info("Set meta model")
+        self.metadata.set_gguf_meta_model(self.gguf_writer)
+
+        logger.info("Set model parameters")
+        self.set_gguf_parameters()
+
+        logger.info("Set model tokenizer")
+        self.set_vocab()
+
+        logger.info("Set model quantization version")
+        self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
     def write(self):
-        self.
-        self.
+        self.prepare_tensors()
+        self.prepare_metadata(vocab_only=False)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.write_tensors_to_file(progress=True)
         self.gguf_writer.close()
 
     def write_vocab(self):
-        self.gguf_writer.
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
+
+        self.prepare_metadata(vocab_only=True)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()
 
     @staticmethod
-    def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
         part_names: list[str] = []
         for filename in os.listdir(dir_model):
-            if filename.endswith(suffix):
+            if filename.startswith(prefix) and filename.endswith(suffix):
                 part_names.append(filename)
 
         part_names.sort()
@@ -370,6 +446,29 @@ class Model:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        seems_special = token_text in (
+            "<pad>", # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
+        )
+
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2}
+
+        return seems_special
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -388,20 +487,22 @@ class Model:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             else:
-
-
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
 
         return tokens, toktypes, tokpre
 
-    # NOTE: this function is generated by
+    # NOTE: this function is generated by convert_hf_to_gguf_update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
@@ -421,7 +522,7 @@ class Model:
 
         res = None
 
-        # NOTE: if you get an error here, you need to update the
+        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
        # or pull the latest version of the model from Huggingface
        # don't edit the hashes manually!
        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
@@ -478,15 +579,39 @@ class Model:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
+        if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
+            # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
+            res = "poro-chat"
+        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+            res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
+        if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
+            # ref: https://huggingface.co/core42/jais-13b
+            res = "jais"
+        if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+            res = "codeshell"
+        if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+            # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+            res = "tekken"
+        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+            res = "smollm"
 
         if res is None:
             logger.warning("\n")
             logger.warning("**************************************************************************************")
             logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
             logger.warning("** There are 2 possible reasons for this:")
-            logger.warning("** - the model has not been added to
+            logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("** - the pre-tokenization config has changed upstream")
-            logger.warning("** Check your model files and
+            logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
             logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh: {chkhsh}")
@@ -541,7 +666,7 @@ class Model:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.CONTROL)
@@ -564,15 +689,23 @@ class Model:
         special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_sentencepiece(self):
+    def _set_vocab_sentencepiece(self, add_to_gguf=True):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _create_vocab_sentencepiece(self):
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
 
-        tokens: list[bytes] = []
-        scores: list[float] = []
-        toktypes: list[int] = []
-
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")
 
@@ -583,7 +716,7 @@ class Model:
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -610,7 +743,7 @@ class Model:
                 added_tokens_json = json.load(f)
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if
+                    if token_id >= vocab_size:
                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue
 
@@ -618,6 +751,26 @@ class Model:
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token: str = token_data["content"]
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token.encode("utf-8"):
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
+                    if token_data.get("special") or self.does_token_look_special(token):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
+                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                    scores[token_id] = -1000.0
+                    tokens[token_id] = token.encode("utf-8")
+
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
@@ -626,17 +779,10 @@ class Model:
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)
 
-
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens, scores, toktypes
 
     def _set_vocab_llama_hf(self):
-        vocab = LlamaHfVocab(self.dir_model)
+        vocab = gguf.LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
@@ -657,6 +803,51 @@ class Model:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
+        tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
+        logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
+        vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
+
+        default_pre = "mpt" if model_name == "gpt-neox" else "default"
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
+        assert field # tokenizer model
+        self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
+        self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
+        assert field # token list
+        self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
+        if model_name == "llama-spm":
+            field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
+            assert field # token scores
+            self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
+        assert field # token types
+        self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
+        if model_name != "llama-spm":
+            field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
+            assert field # token merges
+            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
+            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
+            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
+            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
+            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
+            self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
+            self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
+
 
 @Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
@@ -665,7 +856,6 @@ class GPTNeoXModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -721,7 +911,6 @@ class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Bloom")
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
@@ -798,7 +987,6 @@ class MPTModel(Model):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layers"]
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_block_count(block_count)
@@ -837,7 +1025,6 @@ class OrionModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
 
         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -850,8 +1037,6 @@ class OrionModel(Model):
             raise ValueError("gguf: can not find ctx length parameter.")
 
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -875,7 +1060,6 @@ class BaichuanModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
 
         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -887,8 +1071,6 @@ class BaichuanModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")
 
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -962,7 +1144,11 @@ class XverseModel(Model):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model)
         vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-
+        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
+        # because vocab_size is the count of items, and indexes start at 0.
+        max_vocab_index = max(tokenizer.get_vocab().values())
+        if max_vocab_index >= vocab_size:
+            raise ValueError("Vocabulary size exceeds expected maximum size.")
 
         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
@@ -998,7 +1184,6 @@ class XverseModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
 
         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -1010,8 +1195,6 @@ class XverseModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")
 
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1070,7 +1253,6 @@ class FalconModel(Model):
         if n_head_kv is None:
             n_head_kv = self.hparams.get("n_head_kv", 1) # old name
 
-        self.gguf_writer.add_name("Falcon")
         self.gguf_writer.add_context_length(2048) # not in config.json
         self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1115,7 +1297,6 @@ class StarCoderModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]
 
-        self.gguf_writer.add_name("StarCoder")
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -1135,11 +1316,11 @@ class RefactModel(Model):
 
         # TODO: how to determine special FIM tokens automatically?
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types = ['prefix', 'suffix', 'middle', '
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
-        special_vocab.
+        special_vocab.chat_template = None # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
@@ -1151,7 +1332,6 @@ class RefactModel(Model):
 
         block_count = self.hparams["n_layer"]
 
-        self.gguf_writer.add_name("Refact")
         # refact uses Alibi. So this is from config.json which might be used by training.
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1199,14 +1379,13 @@ class StableLMModel(Model):
         if (self.dir_model / "tokenizer.json").is_file():
             self._set_vocab_gpt2()
         else:
-            # StableLM 2 1.6B
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
             self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1268,8 +1447,8 @@ class StableLMModel(Model):
 
         return [(new_name, data_torch)]
 
-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()
 
         if self._q_norms is not None or self._k_norms is not None:
             # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
@@ -1281,85 +1460,6 @@ class StableLMModel(Model):
             if len(norms) > 0:
                 raise ValueError(f"Unprocessed norms: {norms}")
 
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_head = self.hparams.get("num_attention_heads")
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        q_norms = dict()
-        k_norms = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-            n_dims = len(data.shape)
-            if name.find("q_layernorm.norms") != -1:
-                q_norms[name] = data
-                if len(q_norms) >= (block_count * n_head):
-                    self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
-                continue
-            if name.find("k_layernorm.norms") != -1:
-                k_norms[name] = data
-                if len(k_norms) >= (block_count * n_kv_head):
-                    self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
-                continue
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-    def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
-        for bid in range(block_count):
-            datas = []
-            for xid in range(n_head):
-                ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
-                datas.append(norms[ename])
-                del norms[ename]
-            data = np.stack(datas, axis=0)
-            data_dtype = data.dtype
-            merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
-            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
 
 @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
@@ -1367,7 +1467,7 @@ class LlamaModel(Model):
 
     def set_vocab(self):
         try:
-            self.
+            self._set_vocab_sentencepiece()
         except FileNotFoundError:
             try:
                 self._set_vocab_llama_hf()
@@ -1391,13 +1491,29 @@ class LlamaModel(Model):
         super().set_gguf_parameters()
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1412,9 +1528,9 @@ class LlamaModel(Model):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
-        if name.endswith("q_proj.weight"):
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately
@@ -1453,8 +1569,35 @@ class LlamaModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def
-
+    def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
+        super().prepare_tensors()
 
         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1463,6 +1606,48 @@ class LlamaModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("BitnetForCausalLM")
+class BitnetModel(Model):
+    model_arch = gguf.MODEL_ARCH.BITNET
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+        self.gguf_writer.add_rope_scaling_factor(1.0)
+
+    def weight_quant(self, weight):
+        dtype = weight.dtype
+        weight = weight.float()
+        s = 1 / weight.abs().mean().clamp(min=1e-5)
+        weight = (weight * s).round().clamp(-1, 1) / s
+        scale = weight.abs().max().unsqueeze(0)
+        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
+        weight = torch.sign(weight).type(dtype)
+        return weight.type(dtype), scale.type(torch.float32)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
+            gguf.MODEL_TENSOR.ATTN_Q,
+            gguf.MODEL_TENSOR.ATTN_K,
+            gguf.MODEL_TENSOR.ATTN_V,
+            gguf.MODEL_TENSOR.ATTN_OUT,
+            gguf.MODEL_TENSOR.FFN_UP,
+            gguf.MODEL_TENSOR.FFN_DOWN,
+            gguf.MODEL_TENSOR.FFN_GATE,
+        ]):
+            # transform weight into 1/0/-1 (in fp32)
+            weight_torch, scale_torch = self.weight_quant(data_torch)
+            yield (new_name, weight_torch)
+            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+        else:
+            yield (new_name, data_torch)
+
+
 @Model.register("GrokForCausalLM")
 class GrokModel(Model):
     model_arch = gguf.MODEL_ARCH.GROK
@@ -1475,7 +1660,6 @@ class GrokModel(Model):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_name("Grok")
 
     _experts: list[dict[str, Tensor]] | None = None
 
@@ -1524,7 +1708,6 @@ class DbrxModel(Model):
     def set_gguf_parameters(self):
         ffn_config = self.hparams["ffn_config"]
         attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_name(self.hparams["model_type"])
         self.gguf_writer.add_block_count(self.hparams["n_layers"])
 
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
@@ -1537,7 +1720,6 @@ class DbrxModel(Model):
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
 
         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)
 
         self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
         self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1594,7 +1776,6 @@ class MiniCPMModel(Model):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
-        self.gguf_writer.add_name("MiniCPM")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1610,7 +1791,7 @@ class MiniCPMModel(Model):
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
-            n_head
+            n_head = n_kv_head
 
         return (
             weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -1664,7 +1845,6 @@ class QwenModel(Model):
         self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Qwen")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1695,6 +1875,12 @@ class Qwen2MoeModel(Model):
         super().set_gguf_parameters()
         if (n_experts := self.hparams.get("num_experts")) is not None:
             self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
 
     _experts: list[dict[str, Tensor]] | None = None
 
@@ -1734,8 +1920,8 @@ class Qwen2MoeModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()
 
         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1749,7 +1935,6 @@ class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
         self.gguf_writer.add_context_length(self.hparams["n_ctx"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1792,7 +1977,6 @@ class Phi2Model(Model):
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
 
-        self.gguf_writer.add_name("Phi2")
         self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
 
         self.gguf_writer.add_embedding_length(n_embd)
@@ -1825,7 +2009,7 @@ class Phi3MiniModel(Model):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
 
@@ -1854,7 +2038,7 @@ class Phi3MiniModel(Model):
 
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if
+                    if token_id >= vocab_size:
                         logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue
 
@@ -1870,8 +2054,9 @@ class Phi3MiniModel(Model):
                 for token_id, foken_data in added_tokens_decoder.items():
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.
-
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1886,8 +2071,9 @@ class Phi3MiniModel(Model):
                 for foken_data in added_tokens:
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.
-
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1914,7 +2100,6 @@ class Phi3MiniModel(Model):
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
         rope_dims = n_embd // n_head
 
-        self.gguf_writer.add_name("Phi3")
         self.gguf_writer.add_context_length(max_pos_embds)
         self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
         self.gguf_writer.add_embedding_length(n_embd)
@@ -1926,10 +2111,11 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if
+        if rope_scaling is None:
             return
 
         scale = max_pos_embds / orig_max_pos_embds
@@ -1938,7 +2124,7 @@ class Phi3MiniModel(Model):
         if len(rope_scaling_type) == 0:
             raise KeyError('Missing the required key rope_scaling.type')
 
-        if rope_scaling_type == 'su':
+        if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
             attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
         elif rope_scaling_type == 'yarn':
             attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@@ -1971,7 +2157,6 @@ class PlamoModel(Model):
|
|
1971
2157
|
hparams = self.hparams
|
1972
2158
|
block_count = hparams["num_hidden_layers"]
|
1973
2159
|
|
1974
|
-
self.gguf_writer.add_name("PLaMo")
|
1975
2160
|
self.gguf_writer.add_context_length(4096) # not in config.json
|
1976
2161
|
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
1977
2162
|
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
@@ -2016,7 +2201,6 @@ class CodeShellModel(Model):
|
|
2016
2201
|
def set_gguf_parameters(self):
|
2017
2202
|
block_count = self.hparams["n_layer"]
|
2018
2203
|
|
2019
|
-
self.gguf_writer.add_name("CodeShell")
|
2020
2204
|
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
2021
2205
|
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
2022
2206
|
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
@@ -2068,7 +2252,7 @@ class InternLM2Model(Model):
|
|
2068
2252
|
logger.error(f'Error: Missing {tokenizer_path}')
|
2069
2253
|
sys.exit(1)
|
2070
2254
|
|
2071
|
-
sentencepiece_model = model.ModelProto()
|
2255
|
+
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
2072
2256
|
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
2073
2257
|
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
2074
2258
|
|
@@ -2096,6 +2280,9 @@ class InternLM2Model(Model):
|
|
2096
2280
|
toktype = SentencePieceTokenTypes.UNUSED
|
2097
2281
|
elif tokenizer.IsByte(token_id):
|
2098
2282
|
toktype = SentencePieceTokenTypes.BYTE
|
2283
|
+
# take care of ununsed raw token
|
2284
|
+
if piece.startswith('[UNUSED'):
|
2285
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
2099
2286
|
|
2100
2287
|
tokens.append(text)
|
2101
2288
|
scores.append(score)
|
@@ -2111,6 +2298,49 @@ class InternLM2Model(Model):
                     scores.append(-1000.0)
                     toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
@@ -2120,37 +2350,17 @@ class InternLM2Model(Model):

         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         old_eos = special_vocab.special_token_ids["eos"]
-        if
+        if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
             # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] =
-            logger.warning(f"Replace eos:{old_eos} with a special token:{
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")

         special_vocab.add_to_gguf(self.gguf_writer)

-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
-
-    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("InternLM2")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -2160,30 +2370,30 @@ in chat mode so that the conversation can end normally.")
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
-
+        n_embd = self.hparams["hidden_size"]
         q_per_kv = num_heads // num_kv_heads
-        head_dim =
+        head_dim = n_embd // num_heads
         num_groups = num_heads // q_per_kv

-
-
-        if re.match(qkv_pattern, name):
-            bid = re.findall(qkv_pattern, name)[0]
+        if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
             qkv = data_torch
-
-            qkv = qkv.
-            q, k, v = qkv[
+
+            qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
+            q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
+
             # The model weights of q and k equire additional reshape.
-
-
-
-
-            # v = rearrange(v, " o g n i -> o (g n i)").T
-            v = v.reshape((v.shape[0], -1)).T
+            q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
+            k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
+            v = v.reshape((-1, v.shape[-1]))
+
             return [
                 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
                 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
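The rewritten InternLM2 `modify_tensors` above splits the fused `wqkv` weight by KV groups instead of regex-matching layer names. A toy sketch of that reshape/split, with small made-up dimensions (8 query heads, 2 KV heads, head_dim 4), might look like the following; it only mirrors the indexing, not the converter's permutation helpers.

```python
import torch

# Hypothetical dimensions for illustration only.
num_heads, num_kv_heads, head_dim = 8, 2, 4
n_embd = num_heads * head_dim
q_per_kv = num_heads // num_kv_heads
num_groups = num_heads // q_per_kv

# A fake fused wqkv weight: each group holds q_per_kv query heads plus one K and one V head.
wqkv = torch.randn((num_groups * (q_per_kv + 2) * head_dim, n_embd))
qkv = wqkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]

print(q.reshape((-1, n_embd)).shape)  # (num_heads * head_dim, n_embd)
print(k.reshape((-1, n_embd)).shape)  # (num_kv_heads * head_dim, n_embd)
print(v.reshape((-1, n_embd)).shape)  # (num_kv_heads * head_dim, n_embd)
```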
@@ -2310,13 +2520,55 @@ class GemmaModel(Model):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+        # To prevent errors, skip loading lm_head.weight.
+        if name == "lm_head.weight":
+            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("Gemma2ForCausalLM")
+class Gemma2Model(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA2
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+        self.gguf_writer.add_add_space_prefix(False)
+
     def set_gguf_parameters(self):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -2327,6 +2579,13 @@ class GemmaModel(Model):
         self.gguf_writer.add_key_length(hparams["head_dim"])
         self.gguf_writer.add_value_length(hparams["head_dim"])
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_attn_logit_softcapping(
+            self.hparams["attn_logit_softcapping"]
+        )
+        self.gguf_writer.add_final_logit_softcapping(
+            self.hparams["final_logit_softcapping"]
+        )
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
@@ -2368,39 +2627,7 @@ class MambaModel(Model):
             self._set_vocab_sentencepiece()
         else:
             # Use the GPT-NeoX tokenizer when no tokenizer files are present
-
-            logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
-            neox_reader = gguf.GGUFReader(tokenizer_path, "r")
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
-            self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
-            self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
-            assert field
-            self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
-            assert field
-            self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
-            assert field
-            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
-            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
-            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
-            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
-            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
+            self._set_vocab_builtin("gpt-neox", vocab_size)

     def set_gguf_parameters(self):
         d_model = self.find_hparam(["hidden_size", "d_model"])
@@ -2416,7 +2643,6 @@ class MambaModel(Model):
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model

-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2523,18 +2749,20 @@ class JinaBertV2Model(BertModel):

     def get_tensors(self):
         for name, data in super().get_tensors():
-            if '
+            if 'gated_layer' in name:
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
+                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
+                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
                 yield name1, d1
                 yield name2, d2
                 continue

             yield name, data

-    def set_vocab(self
+    def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
             tokenizer_class = json.load(f)['tokenizer_class']
@@ -2550,19 +2778,94 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_eos_token(True)


-@Model.register("
-class
-    model_arch = gguf.MODEL_ARCH.
+@Model.register("OpenELMForCausalLM")
+class OpenELMModel(Model):
+    model_arch = gguf.MODEL_ARCH.OPENELM
+
+    @staticmethod
+    def _make_divisible(v: float | int, divisor: int) -> int:
+        # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
+        new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
+        # Make sure that round down does not go down by more than 10%.
+        if new_v < 0.9 * v:
+            new_v += divisor
+        return new_v
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)

+        ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
+        ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
+        self._n_embd: int = self.hparams["model_dim"]
+        self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
+        self._num_query_heads: list[int] = self.hparams["num_query_heads"]
+        self._ffn_dims: list[int] = [
+            OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
+            for multiplier in ffn_multipliers
+        ]
+        assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+        assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
+
+    # Uses the tokenizer from meta-llama/Llama-2-7b-hf
     def set_vocab(self):
-
-
-
-
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])

-
+    def set_gguf_parameters(self):
+        n_embd = self._n_embd
+        head_dim = self.hparams["head_dim"]
+        rot_pct = 1.0
+        assert self.block_count == len(self._num_kv_heads)
+        assert self.block_count == len(self._num_query_heads)
+        assert self.block_count == len(self._ffn_dims)

-
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams["max_context_length"])
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+        self.gguf_writer.add_head_count(self._num_query_heads)
+        self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
+        # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
+        self.gguf_writer.add_layer_norm_rms_eps(1e-6)
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
+        self.gguf_writer.add_key_length(head_dim)
+        self.gguf_writer.add_value_length(head_dim)
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        if "n_layers" in keys:
+            return self.hparams["num_transformer_layers"]
+
+        return super().find_hparam(keys, optional)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        # split ff
+        if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
+            ff_dim = self._ffn_dims[bid]
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
+            return
+
+        yield (self.map_tensor_name(name), data_torch)
+
+
+@Model.register("ArcticForCausalLM")
+class ArcticModel(Model):
+    model_arch = gguf.MODEL_ARCH.ARCTIC
+
+    def set_vocab(self):
+        # The reason for using a custom implementation here is that the
+        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
             logger.error(f'Error: Missing {tokenizer_path}')
             sys.exit(1)

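The new OpenELM class above derives per-layer FFN widths with a `_make_divisible` helper taken from Apple's configuration code. A small self-contained sketch of that rounding rule (the `model_dim` of 1280 and the multipliers are hypothetical, not read from any checkpoint):

```python
# Round v to the nearest multiple of divisor, but never drop below 90% of v.
def make_divisible(v: float, divisor: int) -> int:
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

# Example FFN widths for a hypothetical model_dim of 1280 and a few multipliers.
for mult in (0.5, 1.0, 2.77, 4.0):
    print(mult, make_divisible(mult * 1280, 256))
```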
@@ -2574,7 +2877,7 @@ class ArcticModel(Model):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -2607,7 +2910,7 @@ class ArcticModel(Model):
                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
                    for token_id, token_json in added_tokens_decoder.items():
                        token_id = int(token_id)
-                        if
+                        if token_id >= vocab_size:
                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                            continue

@@ -2691,8 +2994,8 @@ class ArcticModel(Model):

         return [(self.map_tensor_name(name), data_torch)]

-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()

         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -2701,6 +3004,499 @@ class ArcticModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("DeepseekV2ForCausalLM")
+class DeepseekV2Model(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("T5WithLMHeadModel")
+@Model.register("T5ForConditionalGeneration")
+@Model.register("MT5ForConditionalGeneration")
+@Model.register("UMT5ForConditionalGeneration")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.shared_token_embeddings_found = False
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        # many older models use spiece.model tokenizer model filename
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
+        if sentencepiece_model.trainer_spec.model_type == 2: # BPE
+            # assure the tokenizer model file name is correct
+            assert tokenizer_path.name == 'tokenizer.model'
+            return self._set_vocab_sentencepiece()
+        else:
+            assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+            n_ctx = 512
+        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+        # and decoder and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("JAISLMHeadModel")
+class JaisModel(Model):
+    model_arch = gguf.MODEL_ARCH.JAIS
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # ALiBi position embedding
+        assert self.hparams["position_embedding_type"] == "alibi"
+
+        # Embeddings scale
+        self.embeddings_scale = 1.0
+        # note: For some JAIS flavors, output is tied to (same as) wte in original model
+        self.output_is_wte = False
+        if 'mup_embeddings_scale' in self.hparams:
+            self.output_is_wte = True # Hack (?)
+            self.embeddings_scale = self.hparams['mup_embeddings_scale']
+        elif 'embeddings_scale' in self.hparams:
+            self.embeddings_scale = self.hparams['embeddings_scale']
+        else:
+            assert False
+
+        self.width_scale = 1.0
+        if 'mup_output_alpha' in self.hparams:
+            assert 'mup_width_scale' in self.hparams
+            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
+        elif 'width_scale' in self.hparams:
+            self.width_scale = self.hparams['width_scale']
+        else:
+            assert False
+
+        self.max_alibi_bias = 8.0
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        # we don't need these
+        if name.endswith((".attn.bias")):
+            return tensors
+
+        if name.endswith(("relative_pe.slopes")):
+            # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
+            # Some other models has max_alibi_bias spelled out explicitly in the hyperparams,
+            # but Jais's PyTorch model simply precalculates the slope values and places them
+            # in relative_pes.slopes
+            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
+            first_val = float(data_torch[0].item())
+            self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
+
+            return tensors
+
+        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
+            data_torch = data_torch.transpose(1, 0)
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
+            tensors.append((new_name, data_torch * self.embeddings_scale))
+            if self.output_is_wte:
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
+        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            assert not self.output_is_wte
+            tensors.append((new_name, data_torch * self.width_scale))
+        else:
+            tensors.append((new_name, data_torch))
+
+        return tensors
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+
+
+@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytes] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
+                else:
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+
 ###### CONVERSION LOGIC ######


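Both the new DeepseekV2 converter above and the Arctic one use the same expert-buffering pattern: per-expert 2-D projection weights are collected per layer and stacked into a single 3-D tensor per projection before being written. A toy sketch of that step with made-up sizes (4 experts, 8x16 weights); the tensor names are illustrative only:

```python
import torch

n_experts, rows, cols = 4, 8, 16
# Fake per-expert weights keyed the way the converter buffers them.
experts = {f"model.layers.0.mlp.experts.{xid}.gate_proj.weight": torch.randn(rows, cols)
           for xid in range(n_experts)}

datas = [experts[f"model.layers.0.mlp.experts.{xid}.gate_proj.weight"] for xid in range(n_experts)]
merged = torch.stack(datas, dim=0)
print(merged.shape)  # torch.Size([4, 8, 16]) -- one 3-D tensor per projection
```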
@@ -2717,19 +3513,46 @@ class LazyTorchTensor(gguf.LazyBase):
         torch.float32: np.float32,
     }

+    # used for safetensors slices
+    # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
+    # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
+    _dtype_str_map: dict[str, torch.dtype] = {
+        "F64": torch.float64,
+        "F32": torch.float32,
+        "BF16": torch.bfloat16,
+        "F16": torch.float16,
+        # "U64": torch.uint64,
+        "I64": torch.int64,
+        # "U32": torch.uint32,
+        "I32": torch.int32,
+        # "U16": torch.uint16,
+        "I16": torch.int16,
+        "U8": torch.uint8,
+        "I8": torch.int8,
+        "BOOL": torch.bool,
+        "F8_E4M3": torch.float8_e4m3fn,
+        "F8_E5M2": torch.float8_e5m2,
+    }
+
     def numpy(self) -> gguf.LazyNumpyTensor:
         dtype = self._dtype_map[self.dtype]
         return gguf.LazyNumpyTensor(
             meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
-            lazy=self._lazy,
             args=(self,),
-            func=(lambda s: s
+            func=(lambda s: s.numpy())
         )

     @classmethod
-    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape:
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
         return torch.empty(size=shape, dtype=dtype, device="meta")

+    @classmethod
+    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
+        dtype = cls._dtype_str_map[st_slice.get_dtype()]
+        shape: tuple[int, ...] = tuple(st_slice.get_shape())
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
+        return cast(torch.Tensor, lazy)
+
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
         del types # unused
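The LazyTorchTensor changes above add a safetensors dtype-string table and build lazy tensors on PyTorch's `meta` device so nothing is materialized until it is actually written. A toy illustration of that idea (only a subset of the dtype table, and a hypothetical shape):

```python
import torch

# safetensors reports dtypes as short strings; the converter maps them back to torch dtypes.
dtype_str_map = {"F32": torch.float32, "F16": torch.float16, "BF16": torch.bfloat16}

def meta_with_dtype_and_shape(dtype: torch.dtype, shape: tuple[int, ...]) -> torch.Tensor:
    # "meta" tensors carry shape/dtype only, so no storage is allocated yet
    return torch.empty(size=shape, dtype=dtype, device="meta")

t = meta_with_dtype_and_shape(dtype_str_map["BF16"], (4096, 4096))
print(t.shape, t.dtype, t.device)  # described, but the ~32 MiB buffer was never allocated
```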
@@ -2740,7 +3563,7 @@ class LazyTorchTensor(gguf.LazyBase):
         if func is torch.Tensor.numpy:
             return args[0].numpy()

-        return
+        return cls._wrap_fn(func)(*args, **kwargs)


 def parse_args() -> argparse.Namespace:
@@ -2750,10 +3573,6 @@ def parse_args() -> argparse.Namespace:
         "--vocab-only", action="store_true",
         help="extract only the vocab",
     )
-    parser.add_argument(
-        "--awq-path", type=Path, default=None,
-        help="Path to scale awq cache file",
-    )
     parser.add_argument(
         "--outfile", type=Path,
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -2786,30 +3605,58 @@ def parse_args() -> argparse.Namespace:
         "--verbose", action="store_true",
         help="increase output verbosity",
     )
+    parser.add_argument(
+        "--split-max-tensors", type=int, default=0,
+        help="max tensors in each split",
+    )
+    parser.add_argument(
+        "--split-max-size", type=str, default="0",
+        help="max size per split N(M|G)",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out a split plan and exit, without writing any new files",
+    )
+    parser.add_argument(
+        "--no-tensor-first-split", action="store_true",
+        help="do not add tensors to the first split (disabled by default)"
+    )
+    parser.add_argument(
+        "--metadata", type=Path,
+        help="Specify the path for an authorship metadata override file"
+    )

     return parser.parse_args()


+def split_str_to_n_bytes(split_str: str) -> int:
+    if split_str.endswith("K"):
+        n = int(split_str[:-1]) * 1000
+    elif split_str.endswith("M"):
+        n = int(split_str[:-1]) * 1000 * 1000
+    elif split_str.endswith("G"):
+        n = int(split_str[:-1]) * 1000 * 1000 * 1000
+    elif split_str.isnumeric():
+        n = int(split_str)
+    else:
+        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+    if n < 0:
+        raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+    return n
+
+
 def main() -> None:
     args = parse_args()

-
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)

     dir_model = args.model

-    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
-        tmp_model_path = args.model / "weighted_model"
-        dir_model = tmp_model_path
-        if tmp_model_path.is_dir():
-            logger.info(f"{tmp_model_path} exists as a weighted model.")
-        else:
-            tmp_model_path.mkdir(parents=True, exist_ok=True)
-            logger.info("Saving new weighted model ...")
-            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
-            logger.info(f"Saved weighted model at {tmp_model_path}.")
-
     if not dir_model.is_dir():
         logger.error(f'Error: {args.model} is not a directory')
         sys.exit(1)
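The `--split-max-size` option added above is parsed by the new `split_str_to_n_bytes` helper, which uses decimal (SI) units rather than binary ones. A quick usage sketch, repeating the helper so it runs standalone:

```python
def split_str_to_n_bytes(split_str: str) -> int:
    if split_str.endswith("K"):
        n = int(split_str[:-1]) * 1000
    elif split_str.endswith("M"):
        n = int(split_str[:-1]) * 1000 * 1000
    elif split_str.endswith("G"):
        n = int(split_str[:-1]) * 1000 * 1000 * 1000
    elif split_str.isnumeric():
        n = int(split_str)
    else:
        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
    if n < 0:
        raise ValueError(f"Invalid split size: {split_str}, must be positive")
    return n

assert split_str_to_n_bytes("0") == 0            # splitting disabled
assert split_str_to_n_bytes("500M") == 500_000_000
assert split_str_to_n_bytes("2G") == 2_000_000_000
```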
@@ -2822,36 +3669,47 @@ def main() -> None:
         "auto": gguf.LlamaFileType.GUESSED,
     }

+    is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
+    if args.use_temp_file and is_split:
+        logger.error("Error: Cannot use temp file when splitting")
+        sys.exit(1)
+
     if args.outfile is not None:
         fname_out = args.outfile
     else:
-
-        fname_out = dir_model / 'ggml-model-{ftype}.gguf'
+        fname_out = dir_model

     logger.info(f"Loading model: {dir_model.name}")

     hparams = Model.load_hparams(dir_model)

     with torch.inference_mode():
-
-
+        output_type = ftype_map[args.outtype]
+        model_architecture = hparams["architectures"][0]

-
-
-
-
-
+        try:
+            model_class = Model.from_model_architecture(model_architecture)
+        except NotImplementedError:
+            logger.error(f"Model {model_architecture} is not supported")
+            sys.exit(1)

-        model_instance
+        model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
+                                     is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
+                                     eager=args.no_lazy,
+                                     metadata_override=args.metadata, model_name=args.model_name,
+                                     split_max_tensors=args.split_max_tensors,
+                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+                                     small_first_shard=args.no_tensor_first_split)

         if args.vocab_only:
-            logger.info(
+            logger.info("Exporting model vocab...")
             model_instance.write_vocab()
+            logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
         else:
-            logger.info(
+            logger.info("Exporting model...")
             model_instance.write()
-
-
+            out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
+            logger.info(f"Model successfully exported to {out_path}")


 if __name__ == '__main__':