bigdl-core-cpp 2.1.0b2__py3-none-manylinux2010_x86_64.whl → 2.1.0b20240820.post1__py3-none-manylinux2010_x86_64.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- bigdl/cpp/convert-hf-to-gguf.py +1174 -314
- bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
- bigdl/cpp/gguf-py/gguf/constants.py +463 -167
- bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +29 -8
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +475 -156
- bigdl/cpp/gguf-py/gguf/lazy.py +24 -49
- bigdl/cpp/gguf-py/gguf/metadata.py +503 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +209 -23
- bigdl/cpp/gguf-py/gguf/utility.py +69 -0
- bigdl/cpp/libs/baby-llama +0 -0
- bigdl/cpp/libs/batched +0 -0
- bigdl/cpp/libs/batched-bench +0 -0
- bigdl/cpp/libs/benchmark +0 -0
- bigdl/cpp/libs/embedding +0 -0
- bigdl/cpp/libs/gguf +0 -0
- bigdl/cpp/libs/imatrix +0 -0
- bigdl/cpp/libs/llama-bench +0 -0
- bigdl/cpp/libs/llava-cli +0 -0
- bigdl/cpp/libs/lookahead +0 -0
- bigdl/cpp/libs/lookup +0 -0
- bigdl/cpp/libs/ls-sycl-device +0 -0
- bigdl/cpp/libs/main +0 -0
- bigdl/cpp/libs/ollama +0 -0
- bigdl/cpp/libs/perplexity +0 -0
- bigdl/cpp/libs/quantize +0 -0
- bigdl/cpp/libs/quantize-stats +0 -0
- bigdl/cpp/libs/save-load-state +0 -0
- bigdl/cpp/libs/server +0 -0
- bigdl/cpp/libs/speculative +0 -0
- bigdl/cpp/libs/tokenize +0 -0
- {bigdl_core_cpp-2.1.0b2.dist-info → bigdl_core_cpp-2.1.0b20240820.post1.dist-info}/METADATA +8 -8
- bigdl_core_cpp-2.1.0b20240820.post1.dist-info/RECORD +45 -0
- {bigdl_core_cpp-2.1.0b2.dist-info → bigdl_core_cpp-2.1.0b20240820.post1.dist-info}/WHEEL +1 -1
- bigdl_core_cpp-2.1.0b2.dist-info/RECORD +0 -43
- {bigdl_core_cpp-2.1.0b2.data → bigdl_core_cpp-2.1.0b20240820.post1.data}/scripts/init-llama-cpp +0 -0
- {bigdl_core_cpp-2.1.0b2.data → bigdl_core_cpp-2.1.0b20240820.post1.data}/scripts/init-ollama +0 -0
- {bigdl_core_cpp-2.1.0b2.dist-info → bigdl_core_cpp-2.1.0b20240820.post1.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert-hf-to-gguf.py
CHANGED
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 
 from __future__ import annotations
 
@@ -12,7 +13,7 @@ import sys
 from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
 
 import math
 import numpy as np
@@ -25,10 +26,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-from convert import LlamaHfVocab
-
-logger = logging.getLogger("hf-to-gguf")
-
 logger = logging.getLogger("hf-to-gguf")
 
 
@@ -50,7 +47,8 @@ class Model:
     _model_classes: dict[str, type[Model]] = {}
 
     dir_model: Path
-    ftype:
+    ftype: gguf.LlamaFileType
+    fname_out: Path
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
@@ -61,29 +59,41 @@ class Model:
     block_count: int
     tensor_map: gguf.TensorNameMap
     tensor_names: set[str] | None
-    fname_out: Path
     gguf_writer: gguf.GGUFWriter
+    model_name: str | None
+    metadata_override: Path | None
+    dir_model_card: Path
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
+                 use_temp_file: bool = False, eager: bool = False,
+                 metadata_override: Path | None = None, model_name: str | None = None,
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
+
         self.dir_model = dir_model
         self.ftype = ftype
+        self.fname_out = fname_out
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.use_temp_file = use_temp_file
         self.lazy = not eager
-        self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+        self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
-            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
+        self.metadata_override = metadata_override
+        self.model_name = model_name
+        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+
+        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
             # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
             _, first_tensor = next(self.get_tensors())
@@ -93,11 +103,10 @@ class Model:
             else:
                 logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
                 self.ftype = gguf.LlamaFileType.MOSTLY_BF16
-
-
-
-
-        self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+
+        # Configure GGUF Writer
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
 
     @classmethod
     def __init_subclass__(cls):
@@ -147,9 +156,16 @@ class Model:
                 tensor_names_from_parts.update(model_part.keys())
 
                 for name in model_part.keys():
-
-
-
+                    if self.is_safetensors:
+                        if self.lazy:
+                            data = model_part.get_slice(name)
+                            data = LazyTorchTensor.from_safetensors_slice(data)
+                        else:
+                            data = model_part.get_tensor(name)
+                    else:
+                        data = model_part[name]
+                        if self.lazy:
+                            data = LazyTorchTensor.from_eager(data)
                     yield name, data
 
         # only verify tensor name presence; it doesn't matter if they are not in the right files
@@ -185,7 +201,6 @@ class Model:
         return new_name
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_block_count(self.block_count)
 
         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -224,6 +239,10 @@ class Model:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
 
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
+
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
 
@@ -242,7 +261,7 @@ class Model:
 
         return False
 
-    def
+    def prepare_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
         for name, data_torch in self.get_tensors():
@@ -264,7 +283,7 @@ class Model:
                     break
 
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray
+                data: np.ndarray  # type hint
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
                 data_qtype: gguf.GGMLQuantizationType | None = None
@@ -325,23 +344,80 @@ class Model:
 
                 self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
 
+    def set_type(self):
+        self.gguf_writer.add_type(gguf.GGUFType.MODEL)
+
+    def prepare_metadata(self, vocab_only: bool):
+
+        total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
+
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
+
+        # Fallback to model directory name if metadata name is still missing
+        if self.metadata.name is None:
+            self.metadata.name = self.dir_model.name
+
+        # Generate parameter weight class (useful for leader boards) if not yet determined
+        if self.metadata.size_label is None and total_params > 0:
+            self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
+
+        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
+        output_type: str = self.ftype.name.partition("_")[2]
+
+        # Filename Output
+        if self.fname_out.is_dir():
+            # Generate default filename based on model specification and available metadata
+            if not vocab_only:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
+            else:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
+
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            # file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+
+        self.set_type()
+
+        logger.info("Set meta model")
+        self.metadata.set_gguf_meta_model(self.gguf_writer)
+
+        logger.info("Set model parameters")
+        self.set_gguf_parameters()
+
+        logger.info("Set model tokenizer")
+        self.set_vocab()
+
+        logger.info("Set model quantization version")
+        self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
     def write(self):
-        self.
-        self.
+        self.prepare_tensors()
+        self.prepare_metadata(vocab_only=False)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.write_tensors_to_file(progress=True)
         self.gguf_writer.close()
 
     def write_vocab(self):
-        self.gguf_writer.
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
+
+        self.prepare_metadata(vocab_only=True)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()
 
     @staticmethod
-    def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
         part_names: list[str] = []
         for filename in os.listdir(dir_model):
-            if filename.endswith(suffix):
+            if filename.startswith(prefix) and filename.endswith(suffix):
                 part_names.append(filename)
 
         part_names.sort()
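Note (illustration, not part of the diff): the hunk above extends get_model_part_names() to filter on a filename prefix as well as a suffix. A minimal standalone sketch of that filtering, using a made-up directory listing and a simplified stand-in for the static method:

```python
# Sketch of the prefix+suffix filtering added to get_model_part_names() above.
# The listing below is a made-up example; the real method walks os.listdir(dir_model).
def get_model_part_names(filenames: list[str], prefix: str, suffix: str) -> list[str]:
    part_names = [f for f in filenames if f.startswith(prefix) and f.endswith(suffix)]
    part_names.sort()
    return part_names

listing = [
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
    "training_args.bin",   # would have matched the old bare ".bin" suffix check
    "tokenizer.model",
]
print(get_model_part_names(listing, "model", ".safetensors"))  # the two shards, sorted
print(get_model_part_names(listing, "pytorch_model", ".bin"))  # [] - no PyTorch shards here
```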
@@ -370,6 +446,29 @@ class Model:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        seems_special = token_text in (
+            "<pad>",  # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+        )
+
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
+
+        return seems_special
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -388,20 +487,22 @@ class Model:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             else:
-
-
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
 
         return tokens, toktypes, tokpre
 
-    # NOTE: this function is generated by
+    # NOTE: this function is generated by convert_hf_to_gguf_update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
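Note (illustration, not part of the diff): the b"\xe2\x96\x81" byte string used above when pre-normalizing user-defined tokens is the UTF-8 encoding of U+2581 ("▁"), the marker SentencePiece-style tokenizers use for a leading space. A minimal sketch of what the replace() call does:

```python
# The escape sequence decodes to the SentencePiece word-boundary marker "▁" (U+2581);
# user-defined tokens such as "▁hello" are normalized back to " hello".
marker = b"\xe2\x96\x81".decode("utf-8")
token = marker + "hello"
print(repr(token), "->", repr(token.replace(marker, " ")))  # '▁hello' -> ' hello'
```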
@@ -421,7 +522,7 @@ class Model:
 
         res = None
 
-        # NOTE: if you get an error here, you need to update the
+        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
         # or pull the latest version of the model from Huggingface
         # don't edit the hashes manually!
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
@@ -478,15 +579,39 @@ class Model:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
+        if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
+            # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
+            res = "poro-chat"
+        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+            res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
+        if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
+            # ref: https://huggingface.co/core42/jais-13b
+            res = "jais"
+        if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+            res = "codeshell"
+        if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+            # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+            res = "tekken"
+        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+            res = "smollm"
 
         if res is None:
             logger.warning("\n")
             logger.warning("**************************************************************************************")
             logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
             logger.warning("** There are 2 possible reasons for this:")
-            logger.warning("** - the model has not been added to
+            logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("** - the pre-tokenization config has changed upstream")
-            logger.warning("** Check your model files and
+            logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
             logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh: {chkhsh}")
@@ -541,7 +666,7 @@ class Model:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.CONTROL)
@@ -564,15 +689,23 @@ class Model:
         special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_sentencepiece(self):
+    def _set_vocab_sentencepiece(self, add_to_gguf=True):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _create_vocab_sentencepiece(self):
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
 
-        tokens: list[bytes] = []
-        scores: list[float] = []
-        toktypes: list[int] = []
-
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")
 
@@ -583,7 +716,7 @@ class Model:
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -610,7 +743,7 @@ class Model:
                 added_tokens_json = json.load(f)
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if
+                    if token_id >= vocab_size:
                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue
 
@@ -618,6 +751,26 @@ class Model:
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token: str = token_data["content"]
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token.encode("utf-8"):
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
+                    if token_data.get("special") or self.does_token_look_special(token):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                    scores[token_id] = -1000.0
+                    tokens[token_id] = token.encode("utf-8")
+
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
@@ -626,17 +779,10 @@ class Model:
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)
 
-
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens, scores, toktypes
 
     def _set_vocab_llama_hf(self):
-        vocab = LlamaHfVocab(self.dir_model)
+        vocab = gguf.LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
@@ -657,6 +803,51 @@ class Model:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
+        tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
+        logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
+        vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
+
+        default_pre = "mpt" if model_name == "gpt-neox" else "default"
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
+        assert field  # tokenizer model
+        self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
+        self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
+        assert field  # token list
+        self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
+        if model_name == "llama-spm":
+            field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
+            assert field  # token scores
+            self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
+        assert field  # token types
+        self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
+        if model_name != "llama-spm":
+            field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
+            assert field  # token merges
+            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
+            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
+            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
+            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
+            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
+            self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
+            self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
+
 
 @Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
@@ -665,7 +856,6 @@ class GPTNeoXModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -721,7 +911,6 @@ class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Bloom")
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
@@ -798,7 +987,6 @@ class MPTModel(Model):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layers"]
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_block_count(block_count)
@@ -837,7 +1025,6 @@ class OrionModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
 
         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -850,8 +1037,6 @@ class OrionModel(Model):
             raise ValueError("gguf: can not find ctx length parameter.")
 
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -875,7 +1060,6 @@ class BaichuanModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
 
         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -887,8 +1071,6 @@ class BaichuanModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")
 
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -962,7 +1144,11 @@ class XverseModel(Model):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model)
         vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-
+        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
+        # because vocab_size is the count of items, and indexes start at 0.
+        max_vocab_index = max(tokenizer.get_vocab().values())
+        if max_vocab_index >= vocab_size:
+            raise ValueError("Vocabulary size exceeds expected maximum size.")
 
         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
@@ -998,7 +1184,6 @@ class XverseModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
 
         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -1010,8 +1195,6 @@ class XverseModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")
 
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1070,7 +1253,6 @@ class FalconModel(Model):
         if n_head_kv is None:
             n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
 
-        self.gguf_writer.add_name("Falcon")
         self.gguf_writer.add_context_length(2048)  # not in config.json
         self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1115,7 +1297,6 @@ class StarCoderModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]
 
-        self.gguf_writer.add_name("StarCoder")
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -1135,11 +1316,11 @@ class RefactModel(Model):
 
         # TODO: how to determine special FIM tokens automatically?
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types = ['prefix', 'suffix', 'middle', '
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
-        special_vocab.
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
@@ -1151,7 +1332,6 @@ class RefactModel(Model):
 
         block_count = self.hparams["n_layer"]
 
-        self.gguf_writer.add_name("Refact")
         # refact uses Alibi. So this is from config.json which might be used by training.
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1199,14 +1379,13 @@ class StableLMModel(Model):
         if (self.dir_model / "tokenizer.json").is_file():
             self._set_vocab_gpt2()
         else:
-            # StableLM 2 1.6B
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
             self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1268,8 +1447,8 @@ class StableLMModel(Model):
 
         return [(new_name, data_torch)]
 
-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()
 
         if self._q_norms is not None or self._k_norms is not None:
             # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
@@ -1281,85 +1460,6 @@ class StableLMModel(Model):
             if len(norms) > 0:
                 raise ValueError(f"Unprocessed norms: {norms}")
 
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_head = self.hparams.get("num_attention_heads")
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        q_norms = dict()
-        k_norms = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-            n_dims = len(data.shape)
-            if name.find("q_layernorm.norms") != -1:
-                q_norms[name] = data
-                if len(q_norms) >= (block_count * n_head):
-                    self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
-                continue
-            if name.find("k_layernorm.norms") != -1:
-                k_norms[name] = data
-                if len(k_norms) >= (block_count * n_kv_head):
-                    self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
-                continue
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-    def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
-        for bid in range(block_count):
-            datas = []
-            for xid in range(n_head):
-                ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
-                datas.append(norms[ename])
-                del norms[ename]
-            data = np.stack(datas, axis=0)
-            data_dtype = data.dtype
-            merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
-            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
 
 @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
@@ -1367,7 +1467,7 @@ class LlamaModel(Model):
 
     def set_vocab(self):
         try:
-            self.
+            self._set_vocab_sentencepiece()
         except FileNotFoundError:
             try:
                 self._set_vocab_llama_hf()
@@ -1391,13 +1491,29 @@ class LlamaModel(Model):
         super().set_gguf_parameters()
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1412,9 +1528,9 @@ class LlamaModel(Model):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
-        if name.endswith("q_proj.weight"):
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately
@@ -1453,8 +1569,35 @@ class LlamaModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def
-
+    def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
+        super().prepare_tensors()
 
         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
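Note (illustration, not part of the diff): the new LlamaModel.prepare_tensors() override above precomputes llama3-style rope_freqs correction factors. A plain-Python sketch of the same computation; the default hyperparameters below are illustrative assumptions, not values read from any real config:

```python
import math

# Recomputes the llama3 rope factor schedule from the hunk above without torch.
# All defaults here are example values chosen for demonstration only.
def llama3_rope_factors(base=500000.0, dim=128, factor=8.0,
                        low_freq_factor=1.0, high_freq_factor=4.0,
                        old_context_len=8192):
    freqs = [1.0 / (base ** (i / dim)) for i in range(0, dim, 2)]
    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    rope_factors = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:      # short wavelengths: leave unscaled
            rope_factors.append(1.0)
        elif wavelen > low_freq_wavelen:     # long wavelengths: apply the full factor
            rope_factors.append(factor)
        else:                                # smooth interpolation in between
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            rope_factors.append(1 / ((1 - smooth) / factor + smooth))
    return rope_factors

print(llama3_rope_factors()[:4])  # the first (highest-frequency) bands stay at 1.0
```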
@@ -1463,6 +1606,48 @@ class LlamaModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("BitnetForCausalLM")
+class BitnetModel(Model):
+    model_arch = gguf.MODEL_ARCH.BITNET
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+        self.gguf_writer.add_rope_scaling_factor(1.0)
+
+    def weight_quant(self, weight):
+        dtype = weight.dtype
+        weight = weight.float()
+        s = 1 / weight.abs().mean().clamp(min=1e-5)
+        weight = (weight * s).round().clamp(-1, 1) / s
+        scale = weight.abs().max().unsqueeze(0)
+        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
+        weight = torch.sign(weight).type(dtype)
+        return weight.type(dtype), scale.type(torch.float32)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
+            gguf.MODEL_TENSOR.ATTN_Q,
+            gguf.MODEL_TENSOR.ATTN_K,
+            gguf.MODEL_TENSOR.ATTN_V,
+            gguf.MODEL_TENSOR.ATTN_OUT,
+            gguf.MODEL_TENSOR.FFN_UP,
+            gguf.MODEL_TENSOR.FFN_DOWN,
+            gguf.MODEL_TENSOR.FFN_GATE,
+        ]):
+            # transform weight into 1/0/-1 (in fp32)
+            weight_torch, scale_torch = self.weight_quant(data_torch)
+            yield (new_name, weight_torch)
+            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+        else:
+            yield (new_name, data_torch)
+
+
 @Model.register("GrokForCausalLM")
 class GrokModel(Model):
     model_arch = gguf.MODEL_ARCH.GROK
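Note (illustration, not part of the diff): BitnetModel.weight_quant() above rounds eligible weight matrices to a ternary {-1, 0, +1} tensor and emits a separate fp32 ".scale" tensor. A small usage sketch of the same routine on a random matrix (output values are illustrative):

```python
import torch

# Same rounding as BitnetModel.weight_quant in the hunk above, applied to a toy tensor.
def weight_quant(weight: torch.Tensor):
    dtype = weight.dtype
    weight = weight.float()
    s = 1 / weight.abs().mean().clamp(min=1e-5)      # per-tensor scale from mean magnitude
    weight = (weight * s).round().clamp(-1, 1) / s   # snap to the nearest of {-1/s, 0, +1/s}
    scale = weight.abs().max().unsqueeze(0)          # stored alongside as the ".scale" tensor
    weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
    weight = torch.sign(weight).type(dtype)          # final weights hold only -1, 0, +1
    return weight.type(dtype), scale.type(torch.float32)

w = torch.randn(4, 8)
q, scale = weight_quant(w)
print(sorted(q.unique().tolist()), scale.shape)  # e.g. [-1.0, 0.0, 1.0] and torch.Size([1])
```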
@@ -1475,7 +1660,6 @@ class GrokModel(Model):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_name("Grok")
 
     _experts: list[dict[str, Tensor]] | None = None
 
@@ -1524,7 +1708,6 @@ class DbrxModel(Model):
     def set_gguf_parameters(self):
         ffn_config = self.hparams["ffn_config"]
         attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_name(self.hparams["model_type"])
         self.gguf_writer.add_block_count(self.hparams["n_layers"])
 
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
@@ -1537,7 +1720,6 @@ class DbrxModel(Model):
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
 
         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)
 
         self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
         self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1594,7 +1776,6 @@ class MiniCPMModel(Model):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
-        self.gguf_writer.add_name("MiniCPM")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1612,9 +1793,11 @@ class MiniCPMModel(Model):
         if n_kv_head is not None and n_head != n_kv_head:
             n_head = n_kv_head
 
-        return (
-
-
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -1662,7 +1845,6 @@ class QwenModel(Model):
         self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Qwen")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1693,6 +1875,12 @@ class Qwen2MoeModel(Model):
         super().set_gguf_parameters()
         if (n_experts := self.hparams.get("num_experts")) is not None:
             self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
 
     _experts: list[dict[str, Tensor]] | None = None
 
@@ -1732,8 +1920,8 @@ class Qwen2MoeModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()
 
         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1747,7 +1935,6 @@ class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
         self.gguf_writer.add_context_length(self.hparams["n_ctx"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1790,7 +1977,6 @@ class Phi2Model(Model):
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
 
-        self.gguf_writer.add_name("Phi2")
         self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
 
         self.gguf_writer.add_embedding_length(n_embd)
@@ -1823,7 +2009,7 @@ class Phi3MiniModel(Model):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
 
@@ -1852,7 +2038,7 @@ class Phi3MiniModel(Model):
 
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if
+                    if token_id >= vocab_size:
                        logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue
 
@@ -1868,8 +2054,9 @@ class Phi3MiniModel(Model):
                 for token_id, foken_data in added_tokens_decoder.items():
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.
-
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1884,8 +2071,9 @@ class Phi3MiniModel(Model):
                 for foken_data in added_tokens:
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.
-
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1912,7 +2100,6 @@ class Phi3MiniModel(Model):
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
         rope_dims = n_embd // n_head
 
-        self.gguf_writer.add_name("Phi3")
         self.gguf_writer.add_context_length(max_pos_embds)
         self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
         self.gguf_writer.add_embedding_length(n_embd)
@@ -1924,10 +2111,11 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if
+        if rope_scaling is None:
            return
 
         scale = max_pos_embds / orig_max_pos_embds
@@ -1936,7 +2124,7 @@ class Phi3MiniModel(Model):
         if len(rope_scaling_type) == 0:
             raise KeyError('Missing the required key rope_scaling.type')
 
-        if rope_scaling_type == 'su':
+        if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
             attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
         elif rope_scaling_type == 'yarn':
             attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@@ -1969,7 +2157,6 @@ class PlamoModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name("PLaMo")
         self.gguf_writer.add_context_length(4096)  # not in config.json
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
@@ -2014,7 +2201,6 @@ class CodeShellModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]
 
-        self.gguf_writer.add_name("CodeShell")
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -2066,7 +2252,7 @@ class InternLM2Model(Model):
             logger.error(f'Error: Missing {tokenizer_path}')
             sys.exit(1)
 
-        sentencepiece_model = model.ModelProto()
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
         add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
 
@@ -2094,6 +2280,9 @@ class InternLM2Model(Model):
|
|
2094
2280
|
toktype = SentencePieceTokenTypes.UNUSED
|
2095
2281
|
elif tokenizer.IsByte(token_id):
|
2096
2282
|
toktype = SentencePieceTokenTypes.BYTE
|
2283
|
+
# take care of ununsed raw token
|
2284
|
+
if piece.startswith('[UNUSED'):
|
2285
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
2097
2286
|
|
2098
2287
|
tokens.append(text)
|
2099
2288
|
scores.append(score)
|
@@ -2109,6 +2298,49 @@ class InternLM2Model(Model):
             scores.append(-1000.0)
             toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
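The added block above scans `tokenizer_config.json` and `tokenizer.json` for added tokens so that `<|im_end|>` can later replace the eos token. A stripped-down sketch of the lookup (file layout assumed to follow the usual Hugging Face format; the helper name is hypothetical):

import json
from pathlib import Path

def find_chat_eos_id(model_dir: Path, chat_eos_token: str = '<|im_end|>') -> int | None:
    # tokenizer_config.json maps added token ids (as strings) to {"content": ..., "special": ...}
    cfg = model_dir / 'tokenizer_config.json'
    if cfg.is_file():
        added = json.loads(cfg.read_text(encoding='utf-8')).get('added_tokens_decoder', {})
        for token_id, data in added.items():
            if data.get('content') == chat_eos_token:
                return int(token_id)
    return None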
@@ -2118,37 +2350,17 @@ class InternLM2Model(Model):

         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         old_eos = special_vocab.special_token_ids["eos"]
-        if
+        if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
             # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] =
-            logger.warning(f"Replace eos:{old_eos} with a special token:{
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")

         special_vocab.add_to_gguf(self.gguf_writer)

-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
-
-    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("InternLM2")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -2158,30 +2370,30 @@ in chat mode so that the conversation can end normally.")
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
-
+        n_embd = self.hparams["hidden_size"]
         q_per_kv = num_heads // num_kv_heads
-        head_dim =
+        head_dim = n_embd // num_heads
         num_groups = num_heads // q_per_kv

-
-
-        if re.match(qkv_pattern, name):
-            bid = re.findall(qkv_pattern, name)[0]
+        if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
             qkv = data_torch
-
-            qkv = qkv.
-            q, k, v = qkv[
+
+            qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
+            q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
+
             # The model weights of q and k equire additional reshape.
-
-
-
-
-            # v = rearrange(v, " o g n i -> o (g n i)").T
-            v = v.reshape((v.shape[0], -1)).T
+            q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
+            k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
+            v = v.reshape((-1, v.shape[-1]))
+
             return [
                 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
                 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
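The rewritten `modify_tensors` above unpacks InternLM2's fused `wqkv` weight by reshaping it to (groups, q_per_kv + 2, head_dim, n_embd) and slicing out q, k and v. A shape-only sketch with made-up dimensions (8 query heads, 4 KV heads, hidden size 32; not values from this package):

import torch

num_heads, num_kv_heads, n_embd = 8, 4, 32
head_dim = n_embd // num_heads              # 4
q_per_kv = num_heads // num_kv_heads        # 2
num_groups = num_heads // q_per_kv          # 4 (== num_kv_heads)

# fused projection: rows = (num_heads + 2 * num_kv_heads) * head_dim
wqkv = torch.randn((num_heads + 2 * num_kv_heads) * head_dim, n_embd)

qkv = wqkv.reshape(num_groups, q_per_kv + 2, head_dim, n_embd)
q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1]

print(q.reshape(-1, n_embd).shape)  # torch.Size([32, 32]) -> ATTN_Q rows
print(k.reshape(-1, n_embd).shape)  # torch.Size([16, 32]) -> ATTN_K rows
print(v.reshape(-1, n_embd).shape)  # torch.Size([16, 32]) -> ATTN_V rows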
@@ -2308,13 +2520,55 @@ class GemmaModel(Model):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+        # To prevent errors, skip loading lm_head.weight.
+        if name == "lm_head.weight":
+            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("Gemma2ForCausalLM")
+class Gemma2Model(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA2
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+        self.gguf_writer.add_add_space_prefix(False)
+
     def set_gguf_parameters(self):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -2325,6 +2579,13 @@ class GemmaModel(Model):
         self.gguf_writer.add_key_length(hparams["head_dim"])
         self.gguf_writer.add_value_length(hparams["head_dim"])
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_attn_logit_softcapping(
+            self.hparams["attn_logit_softcapping"]
+        )
+        self.gguf_writer.add_final_logit_softcapping(
+            self.hparams["final_logit_softcapping"]
+        )
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
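For context on the hunk above: the two softcapping values recorded into the GGUF correspond to the tanh-based capping Gemma-2 applies to attention and final logits at inference time. A minimal illustration (the 30.0/50.0 caps here are placeholders, the real values come from the model's config.json):

import torch

def soft_cap(logits: torch.Tensor, cap: float) -> torch.Tensor:
    # squashes logits smoothly into (-cap, cap) instead of hard clipping
    return cap * torch.tanh(logits / cap)

x = torch.tensor([-100.0, -10.0, 0.0, 10.0, 100.0])
print(soft_cap(x, 50.0))   # attn_logit_softcapping-style cap
print(soft_cap(x, 30.0))   # final_logit_softcapping-style cap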
@@ -2366,39 +2627,7 @@ class MambaModel(Model):
             self._set_vocab_sentencepiece()
         else:
             # Use the GPT-NeoX tokenizer when no tokenizer files are present
-
-            logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
-            neox_reader = gguf.GGUFReader(tokenizer_path, "r")
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
-            self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
-            self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
-            assert field
-            self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
-            assert field
-            self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
-            assert field
-            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
-            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
-            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
-            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
-            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
+            self._set_vocab_builtin("gpt-neox", vocab_size)

     def set_gguf_parameters(self):
         d_model = self.find_hparam(["hidden_size", "d_model"])
@@ -2414,7 +2643,6 @@ class MambaModel(Model):
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model

-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
@@ -2521,18 +2749,20 @@ class JinaBertV2Model(BertModel):

     def get_tensors(self):
         for name, data in super().get_tensors():
-            if '
+            if 'gated_layer' in name:
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
+                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
+                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
                 yield name1, d1
                 yield name2, d2
                 continue

         yield name, data

-    def set_vocab(self
+    def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
             tokenizer_class = json.load(f)['tokenizer_class']
@@ -2548,17 +2778,92 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_eos_token(True)


-@Model.register("
-class
-    model_arch = gguf.MODEL_ARCH.
-
-    def set_vocab(self):
-        # The reason for using a custom implementation here is that the
-        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
-        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
-        from sentencepiece import SentencePieceProcessor
+@Model.register("OpenELMForCausalLM")
+class OpenELMModel(Model):
+    model_arch = gguf.MODEL_ARCH.OPENELM

-
+    @staticmethod
+    def _make_divisible(v: float | int, divisor: int) -> int:
+        # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
+        new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
+        # Make sure that round down does not go down by more than 10%.
+        if new_v < 0.9 * v:
+            new_v += divisor
+        return new_v
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
+        ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
+        self._n_embd: int = self.hparams["model_dim"]
+        self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
+        self._num_query_heads: list[int] = self.hparams["num_query_heads"]
+        self._ffn_dims: list[int] = [
+            OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
+            for multiplier in ffn_multipliers
+        ]
+        assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+        assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
+
+    # Uses the tokenizer from meta-llama/Llama-2-7b-hf
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
+
+    def set_gguf_parameters(self):
+        n_embd = self._n_embd
+        head_dim = self.hparams["head_dim"]
+        rot_pct = 1.0
+        assert self.block_count == len(self._num_kv_heads)
+        assert self.block_count == len(self._num_query_heads)
+        assert self.block_count == len(self._ffn_dims)
+
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams["max_context_length"])
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+        self.gguf_writer.add_head_count(self._num_query_heads)
+        self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
+        # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
+        self.gguf_writer.add_layer_norm_rms_eps(1e-6)
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
+        self.gguf_writer.add_key_length(head_dim)
+        self.gguf_writer.add_value_length(head_dim)
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        if "n_layers" in keys:
+            return self.hparams["num_transformer_layers"]
+
+        return super().find_hparam(keys, optional)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        # split ff
+        if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
+            ff_dim = self._ffn_dims[bid]
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
+            return
+
+        yield (self.map_tensor_name(name), data_torch)
+
+
+@Model.register("ArcticForCausalLM")
+class ArcticModel(Model):
+    model_arch = gguf.MODEL_ARCH.ARCTIC
+
+    def set_vocab(self):
+        # The reason for using a custom implementation here is that the
+        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'

         if not tokenizer_path.is_file():
             logger.error(f'Error: Missing {tokenizer_path}')
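The `_make_divisible` helper added above rounds each OpenELM FFN width to a multiple of `ffn_dim_divisor` without shrinking it by more than 10%. A quick, self-contained check of that rounding behaviour (the 1280/256 values are illustrative):

def make_divisible(v: float, divisor: int) -> int:
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:   # never round down by more than 10%
        new_v += divisor
    return new_v

# OpenELM-style ffn dims: multiplier * model_dim rounded to a multiple of the divisor
print(make_divisible(0.5 * 1280, 256))   # 640  -> 768
print(make_divisible(4.0 * 1280, 256))   # 5120 -> 5120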
@@ -2572,7 +2877,7 @@ class ArcticModel(Model):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -2605,7 +2910,7 @@ class ArcticModel(Model):
             added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
             for token_id, token_json in added_tokens_decoder.items():
                 token_id = int(token_id)
-                if
+                if token_id >= vocab_size:
                     logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue

@@ -2689,8 +2994,8 @@ class ArcticModel(Model):

         return [(self.map_tensor_name(name), data_torch)]

-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()

         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -2699,6 +3004,499 @@ class ArcticModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("DeepseekV2ForCausalLM")
+class DeepseekV2Model(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
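As in the Arctic converter above it, the new DeepseekV2 class buffers per-expert weights and only emits them once all `n_experts * 3` tensors for a layer have been seen, stacking them into one 3-D tensor per projection. A toy version of that merge (names and sizes are illustrative, not from the package):

import torch

n_experts, ff, hidden = 4, 6, 8
experts = {f"model.layers.0.mlp.experts.{x}.gate_proj.weight": torch.randn(ff, hidden)
           for x in range(n_experts)}

# gather the per-expert matrices in expert order and stack along a new leading dim
datas = [experts[f"model.layers.0.mlp.experts.{x}.gate_proj.weight"] for x in range(n_experts)]
merged = torch.stack(datas, dim=0)
print(merged.shape)  # torch.Size([4, 6, 8]) -> one (n_expert, ff, hidden) tensor per projection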
+@Model.register("T5WithLMHeadModel")
+@Model.register("T5ForConditionalGeneration")
+@Model.register("MT5ForConditionalGeneration")
+@Model.register("UMT5ForConditionalGeneration")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.shared_token_embeddings_found = False
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        # many older models use spiece.model tokenizer model filename
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
+        if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
+            # assure the tokenizer model file name is correct
+            assert tokenizer_path.name == 'tokenizer.model'
+            return self._set_vocab_sentencepiece()
+        else:
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+            n_ctx = 512
+        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+        # and decoder and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("JAISLMHeadModel")
+class JaisModel(Model):
+    model_arch = gguf.MODEL_ARCH.JAIS
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # ALiBi position embedding
+        assert self.hparams["position_embedding_type"] == "alibi"
+
+        # Embeddings scale
+        self.embeddings_scale = 1.0
+        # note: For some JAIS flavors, output is tied to (same as) wte in original model
+        self.output_is_wte = False
+        if 'mup_embeddings_scale' in self.hparams:
+            self.output_is_wte = True  # Hack (?)
+            self.embeddings_scale = self.hparams['mup_embeddings_scale']
+        elif 'embeddings_scale' in self.hparams:
+            self.embeddings_scale = self.hparams['embeddings_scale']
+        else:
+            assert False
+
+        self.width_scale = 1.0
+        if 'mup_output_alpha' in self.hparams:
+            assert 'mup_width_scale' in self.hparams
+            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
+        elif 'width_scale' in self.hparams:
+            self.width_scale = self.hparams['width_scale']
+        else:
+            assert False
+
+        self.max_alibi_bias = 8.0
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        # we don't need these
+        if name.endswith((".attn.bias")):
+            return tensors
+
+        if name.endswith(("relative_pe.slopes")):
+            # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
+            # Some other models has max_alibi_bias spelled out explicitly in the hyperparams,
+            # but Jais's PyTorch model simply precalculates the slope values and places them
+            # in relative_pes.slopes
+            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
+            first_val = float(data_torch[0].item())
+            self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
+
+            return tensors
+
+        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
+            data_torch = data_torch.transpose(1, 0)
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
+            tensors.append((new_name, data_torch * self.embeddings_scale))
+            if self.output_is_wte:
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
+        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            assert not self.output_is_wte
+            tensors.append((new_name, data_torch * self.width_scale))
+        else:
+            tensors.append((new_name, data_torch))
+
+        return tensors
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+
+
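The JAIS handler above recovers `max_alibi_bias` from the first precomputed slope rather than from a hyperparameter. A small sketch of that inversion, assuming the conventional ALiBi slope layout where the first slope is 2**(-bias / n_head):

import math

n_head = 32
n_head_closest_log2 = 2 ** math.floor(math.log2(n_head))      # 32
max_alibi_bias = 8.0
first_slope = 2 ** (-max_alibi_bias / n_head_closest_log2)    # what the checkpoint stores

# the converter's inverse: recover the bias from the first slope
recovered = -round(math.log2(first_slope) * n_head_closest_log2)
print(first_slope, recovered)  # 0.8408964152537145 8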
+@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytes] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
+                else:
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+
 ###### CONVERSION LOGIC ######


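`ChatGLMModel.bpe` above re-derives the merge list from tiktoken-style `mergeable_ranks` by splitting each multi-byte token at the merge that produced it. A self-contained demo of the same routine on a tiny, made-up rank table:

def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
    # repeatedly apply the lowest-rank adjacent merge whose rank is below max_rank
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx, min_rank = i, rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}
print(bpe(ranks, b"abc", max_rank=ranks[b"abc"]))  # [b'ab', b'c'] -> merge entry "ab c"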
@@ -2715,19 +3513,46 @@ class LazyTorchTensor(gguf.LazyBase):
         torch.float32: np.float32,
     }

+    # used for safetensors slices
+    # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
+    # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
+    _dtype_str_map: dict[str, torch.dtype] = {
+        "F64": torch.float64,
+        "F32": torch.float32,
+        "BF16": torch.bfloat16,
+        "F16": torch.float16,
+        # "U64": torch.uint64,
+        "I64": torch.int64,
+        # "U32": torch.uint32,
+        "I32": torch.int32,
+        # "U16": torch.uint16,
+        "I16": torch.int16,
+        "U8": torch.uint8,
+        "I8": torch.int8,
+        "BOOL": torch.bool,
+        "F8_E4M3": torch.float8_e4m3fn,
+        "F8_E5M2": torch.float8_e5m2,
+    }
+
     def numpy(self) -> gguf.LazyNumpyTensor:
         dtype = self._dtype_map[self.dtype]
         return gguf.LazyNumpyTensor(
             meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
-            lazy=self._lazy,
             args=(self,),
-            func=(lambda s: s
+            func=(lambda s: s.numpy())
         )

     @classmethod
-    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape:
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
         return torch.empty(size=shape, dtype=dtype, device="meta")

+    @classmethod
+    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
+        dtype = cls._dtype_str_map[st_slice.get_dtype()]
+        shape: tuple[int, ...] = tuple(st_slice.get_shape())
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
+        return cast(torch.Tensor, lazy)
+
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
         del types  # unused
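The `_dtype_str_map` and `from_safetensors_slice` additions above describe a safetensors slice as a zero-cost "meta" tensor and defer the actual read until the data is needed. A sketch of the underlying PyTorch mechanism only (no gguf or safetensors objects involved; the `load` lambda stands in for `st_slice[:]`):

import torch

dtype_str_map = {"F32": torch.float32, "F16": torch.float16, "BF16": torch.bfloat16}

# a meta tensor carries dtype/shape but allocates no storage
meta = torch.empty(size=(4096, 4096), dtype=dtype_str_map["F16"], device="meta")
print(meta.shape, meta.dtype, meta.device)     # torch.Size([4096, 4096]) torch.float16 meta

# materialization is deferred: only run the loader when the bytes are actually written out
load = lambda: torch.zeros(meta.shape, dtype=meta.dtype)
real = load()
print(meta.is_meta, real.is_meta)              # True False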
@@ -2738,7 +3563,7 @@ class LazyTorchTensor(gguf.LazyBase):
         if func is torch.Tensor.numpy:
             return args[0].numpy()

-        return
+        return cls._wrap_fn(func)(*args, **kwargs)


 def parse_args() -> argparse.Namespace:
@@ -2748,10 +3573,6 @@ def parse_args() -> argparse.Namespace:
         "--vocab-only", action="store_true",
         help="extract only the vocab",
     )
-    parser.add_argument(
-        "--awq-path", type=Path, default=None,
-        help="Path to scale awq cache file",
-    )
     parser.add_argument(
         "--outfile", type=Path,
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -2784,30 +3605,58 @@ def parse_args() -> argparse.Namespace:
         "--verbose", action="store_true",
         help="increase output verbosity",
     )
+    parser.add_argument(
+        "--split-max-tensors", type=int, default=0,
+        help="max tensors in each split",
+    )
+    parser.add_argument(
+        "--split-max-size", type=str, default="0",
+        help="max size per split N(M|G)",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out a split plan and exit, without writing any new files",
+    )
+    parser.add_argument(
+        "--no-tensor-first-split", action="store_true",
+        help="do not add tensors to the first split (disabled by default)"
+    )
+    parser.add_argument(
+        "--metadata", type=Path,
+        help="Specify the path for an authorship metadata override file"
+    )

     return parser.parse_args()


+def split_str_to_n_bytes(split_str: str) -> int:
+    if split_str.endswith("K"):
+        n = int(split_str[:-1]) * 1000
+    elif split_str.endswith("M"):
+        n = int(split_str[:-1]) * 1000 * 1000
+    elif split_str.endswith("G"):
+        n = int(split_str[:-1]) * 1000 * 1000 * 1000
+    elif split_str.isnumeric():
+        n = int(split_str)
+    else:
+        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+    if n < 0:
+        raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+    return n
+
+
 def main() -> None:
     args = parse_args()

-
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)

     dir_model = args.model

-    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
-        tmp_model_path = args.model / "weighted_model"
-        dir_model = tmp_model_path
-        if tmp_model_path.is_dir():
-            logger.info(f"{tmp_model_path} exists as a weighted model.")
-        else:
-            tmp_model_path.mkdir(parents=True, exist_ok=True)
-            logger.info("Saving new weighted model ...")
-            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
-            logger.info(f"Saved weighted model at {tmp_model_path}.")
-
     if not dir_model.is_dir():
         logger.error(f'Error: {args.model} is not a directory')
         sys.exit(1)
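The new `--split-max-size` flag is parsed by `split_str_to_n_bytes` above, which treats the K/M/G suffixes as decimal (powers of 1000). A tiny usage illustration (the helper name here is a stand-in for the function in the diff):

def parse_split_size(s: str) -> int:
    # mirrors split_str_to_n_bytes: decimal K/M/G suffixes, plain numbers pass through
    units = {"K": 10**3, "M": 10**6, "G": 10**9}
    if s and s[-1] in units:
        return int(s[:-1]) * units[s[-1]]
    return int(s)

for example in ["0", "250K", "300M", "5G"]:
    print(example, "->", parse_split_size(example), "bytes")
# 0 -> 0, 250K -> 250000, 300M -> 300000000, 5G -> 5000000000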
@@ -2820,36 +3669,47 @@ def main() -> None:
         "auto": gguf.LlamaFileType.GUESSED,
     }

+    is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
+    if args.use_temp_file and is_split:
+        logger.error("Error: Cannot use temp file when splitting")
+        sys.exit(1)
+
     if args.outfile is not None:
         fname_out = args.outfile
     else:
-
-        fname_out = dir_model / 'ggml-model-{ftype}.gguf'
+        fname_out = dir_model

     logger.info(f"Loading model: {dir_model.name}")

     hparams = Model.load_hparams(dir_model)

     with torch.inference_mode():
-
-
+        output_type = ftype_map[args.outtype]
+        model_architecture = hparams["architectures"][0]

-
-
-
-
-
+        try:
+            model_class = Model.from_model_architecture(model_architecture)
+        except NotImplementedError:
+            logger.error(f"Model {model_architecture} is not supported")
+            sys.exit(1)

-        model_instance
+        model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
+                                     is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
+                                     eager=args.no_lazy,
+                                     metadata_override=args.metadata, model_name=args.model_name,
+                                     split_max_tensors=args.split_max_tensors,
+                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+                                     small_first_shard=args.no_tensor_first_split)

     if args.vocab_only:
-        logger.info(
+        logger.info("Exporting model vocab...")
         model_instance.write_vocab()
+        logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
     else:
-        logger.info(
+        logger.info("Exporting model...")
         model_instance.write()
-
-
+        out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
+        logger.info(f"Model successfully exported to {out_path}")


 if __name__ == '__main__':
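A closing note on the rewritten `main()` above: the converter class is resolved from `hparams["architectures"][0]` through the registry that the `@Model.register(...)` decorators populate. A minimal standalone version of that registry pattern (the DemoModel class name is purely illustrative):

class Model:
    _model_classes: dict[str, type["Model"]] = {}

    @classmethod
    def register(cls, *names: str):
        # decorator: map each architecture name to the decorated subclass
        def func(modelcls: type["Model"]) -> type["Model"]:
            for name in names:
                cls._model_classes[name] = modelcls
            return modelcls
        return func

    @classmethod
    def from_model_architecture(cls, arch: str) -> type["Model"]:
        try:
            return cls._model_classes[arch]
        except KeyError:
            raise NotImplementedError(f"Architecture {arch!r} not supported!") from None

@Model.register("DemoForCausalLM")
class DemoModel(Model):
    pass

print(Model.from_model_architecture("DemoForCausalLM"))  # <class '__main__.DemoModel'>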