bigdl-core-cpp 2.5.0b20240724__py3-none-manylinux2010_x86_64.whl → 2.5.0b20240726__py3-none-manylinux2010_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert-hf-to-gguf.py +1148 -315
- bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
- bigdl/cpp/gguf-py/gguf/constants.py +463 -167
- bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +29 -8
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +475 -156
- bigdl/cpp/gguf-py/gguf/lazy.py +24 -49
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +209 -23
- bigdl/cpp/libs/baby-llama +0 -0
- bigdl/cpp/libs/batched +0 -0
- bigdl/cpp/libs/batched-bench +0 -0
- bigdl/cpp/libs/benchmark +0 -0
- bigdl/cpp/libs/embedding +0 -0
- bigdl/cpp/libs/gguf +0 -0
- bigdl/cpp/libs/imatrix +0 -0
- bigdl/cpp/libs/llama-bench +0 -0
- bigdl/cpp/libs/llava-cli +0 -0
- bigdl/cpp/libs/lookahead +0 -0
- bigdl/cpp/libs/lookup +0 -0
- bigdl/cpp/libs/ls-sycl-device +0 -0
- bigdl/cpp/libs/main +0 -0
- bigdl/cpp/libs/ollama +0 -0
- bigdl/cpp/libs/perplexity +0 -0
- bigdl/cpp/libs/quantize +0 -0
- bigdl/cpp/libs/quantize-stats +0 -0
- bigdl/cpp/libs/save-load-state +0 -0
- bigdl/cpp/libs/server +0 -0
- bigdl/cpp/libs/speculative +0 -0
- bigdl/cpp/libs/tokenize +0 -0
- {bigdl_core_cpp-2.5.0b20240724.dist-info → bigdl_core_cpp-2.5.0b20240726.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.5.0b20240726.dist-info/RECORD +43 -0
- bigdl_core_cpp-2.5.0b20240724.dist-info/RECORD +0 -43
- {bigdl_core_cpp-2.5.0b20240724.data → bigdl_core_cpp-2.5.0b20240726.data}/scripts/init-llama-cpp +0 -0
- {bigdl_core_cpp-2.5.0b20240724.data → bigdl_core_cpp-2.5.0b20240726.data}/scripts/init-ollama +0 -0
- {bigdl_core_cpp-2.5.0b20240724.dist-info → bigdl_core_cpp-2.5.0b20240726.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0b20240724.dist-info → bigdl_core_cpp-2.5.0b20240726.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert-hf-to-gguf.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
2
3
|
|
3
4
|
from __future__ import annotations
|
4
5
|
|
@@ -12,7 +13,7 @@ import sys
|
|
12
13
|
from enum import IntEnum
|
13
14
|
from pathlib import Path
|
14
15
|
from hashlib import sha256
|
15
|
-
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
|
16
|
+
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
|
16
17
|
|
17
18
|
import math
|
18
19
|
import numpy as np
|
@@ -25,10 +26,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
|
25
26
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
26
27
|
import gguf
|
27
28
|
|
28
|
-
from convert import LlamaHfVocab
|
29
|
-
|
30
|
-
logger = logging.getLogger("hf-to-gguf")
|
31
|
-
|
32
29
|
logger = logging.getLogger("hf-to-gguf")
|
33
30
|
|
34
31
|
|
@@ -50,7 +47,8 @@ class Model:
|
|
50
47
|
_model_classes: dict[str, type[Model]] = {}
|
51
48
|
|
52
49
|
dir_model: Path
|
53
|
-
ftype:
|
50
|
+
ftype: gguf.LlamaFileType
|
51
|
+
fname_out: Path
|
54
52
|
is_big_endian: bool
|
55
53
|
endianess: gguf.GGUFEndian
|
56
54
|
use_temp_file: bool
|
@@ -61,29 +59,41 @@ class Model:
|
|
61
59
|
block_count: int
|
62
60
|
tensor_map: gguf.TensorNameMap
|
63
61
|
tensor_names: set[str] | None
|
64
|
-
fname_out: Path
|
65
62
|
gguf_writer: gguf.GGUFWriter
|
63
|
+
model_name: str | None
|
64
|
+
metadata_override: Path | None
|
65
|
+
dir_model_card: Path
|
66
66
|
|
67
67
|
# subclasses should define this!
|
68
68
|
model_arch: gguf.MODEL_ARCH
|
69
69
|
|
70
|
-
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool
|
70
|
+
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
71
|
+
use_temp_file: bool = False, eager: bool = False,
|
72
|
+
metadata_override: Path | None = None, model_name: str | None = None,
|
73
|
+
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
71
74
|
if type(self) is Model:
|
72
75
|
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
76
|
+
|
73
77
|
self.dir_model = dir_model
|
74
78
|
self.ftype = ftype
|
79
|
+
self.fname_out = fname_out
|
75
80
|
self.is_big_endian = is_big_endian
|
76
81
|
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
|
77
82
|
self.use_temp_file = use_temp_file
|
78
83
|
self.lazy = not eager
|
79
|
-
self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
|
84
|
+
self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
|
80
85
|
self.is_safetensors = len(self.part_names) > 0
|
81
86
|
if not self.is_safetensors:
|
82
|
-
self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
|
87
|
+
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
83
88
|
self.hparams = Model.load_hparams(self.dir_model)
|
84
|
-
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
|
89
|
+
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
85
90
|
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
86
91
|
self.tensor_names = None
|
92
|
+
self.metadata_override = metadata_override
|
93
|
+
self.model_name = model_name
|
94
|
+
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
95
|
+
|
96
|
+
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
87
97
|
if self.ftype == gguf.LlamaFileType.GUESSED:
|
88
98
|
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
89
99
|
_, first_tensor = next(self.get_tensors())
|
@@ -93,11 +103,10 @@ class Model:
|
|
93
103
|
else:
|
94
104
|
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
|
95
105
|
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
|
106
|
+
|
107
|
+
# Configure GGUF Writer
|
108
|
+
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
|
109
|
+
split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
|
101
110
|
|
102
111
|
@classmethod
|
103
112
|
def __init_subclass__(cls):
|
@@ -147,9 +156,16 @@ class Model:
|
|
147
156
|
tensor_names_from_parts.update(model_part.keys())
|
148
157
|
|
149
158
|
for name in model_part.keys():
|
150
|
-
|
151
|
-
|
152
|
-
|
159
|
+
if self.is_safetensors:
|
160
|
+
if self.lazy:
|
161
|
+
data = model_part.get_slice(name)
|
162
|
+
data = LazyTorchTensor.from_safetensors_slice(data)
|
163
|
+
else:
|
164
|
+
data = model_part.get_tensor(name)
|
165
|
+
else:
|
166
|
+
data = model_part[name]
|
167
|
+
if self.lazy:
|
168
|
+
data = LazyTorchTensor.from_eager(data)
|
153
169
|
yield name, data
|
154
170
|
|
155
171
|
# only verify tensor name presence; it doesn't matter if they are not in the right files
|
@@ -185,7 +201,6 @@ class Model:
|
|
185
201
|
return new_name
|
186
202
|
|
187
203
|
def set_gguf_parameters(self):
|
188
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
189
204
|
self.gguf_writer.add_block_count(self.block_count)
|
190
205
|
|
191
206
|
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
|
@@ -224,6 +239,10 @@ class Model:
|
|
224
239
|
self.gguf_writer.add_expert_used_count(n_experts_used)
|
225
240
|
logger.info(f"gguf: experts used count = {n_experts_used}")
|
226
241
|
|
242
|
+
if (head_dim := self.hparams.get("head_dim")) is not None:
|
243
|
+
self.gguf_writer.add_key_length(head_dim)
|
244
|
+
self.gguf_writer.add_value_length(head_dim)
|
245
|
+
|
227
246
|
self.gguf_writer.add_file_type(self.ftype)
|
228
247
|
logger.info(f"gguf: file type = {self.ftype}")
|
229
248
|
|
@@ -242,7 +261,7 @@ class Model:
|
|
242
261
|
|
243
262
|
return False
|
244
263
|
|
245
|
-
def
|
264
|
+
def prepare_tensors(self):
|
246
265
|
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
247
266
|
|
248
267
|
for name, data_torch in self.get_tensors():
|
@@ -264,7 +283,7 @@ class Model:
|
|
264
283
|
break
|
265
284
|
|
266
285
|
for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
|
267
|
-
data: np.ndarray
|
286
|
+
data: np.ndarray # type hint
|
268
287
|
n_dims = len(data.shape)
|
269
288
|
data_dtype = data.dtype
|
270
289
|
data_qtype: gguf.GGMLQuantizationType | None = None
|
@@ -325,23 +344,80 @@ class Model:
|
|
325
344
|
|
326
345
|
self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
|
327
346
|
|
347
|
+
def set_type(self):
|
348
|
+
self.gguf_writer.add_type(gguf.GGUFType.MODEL)
|
349
|
+
|
350
|
+
def prepare_metadata(self, vocab_only: bool):
|
351
|
+
|
352
|
+
total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
|
353
|
+
|
354
|
+
self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
|
355
|
+
|
356
|
+
# Fallback to model directory name if metadata name is still missing
|
357
|
+
if self.metadata.name is None:
|
358
|
+
self.metadata.name = self.dir_model.name
|
359
|
+
|
360
|
+
# Generate parameter weight class (useful for leader boards) if not yet determined
|
361
|
+
if self.metadata.size_label is None and total_params > 0:
|
362
|
+
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
|
363
|
+
|
364
|
+
# Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
|
365
|
+
output_type: str = self.ftype.name.partition("_")[2]
|
366
|
+
|
367
|
+
# Filename Output
|
368
|
+
if self.fname_out.is_dir():
|
369
|
+
# Generate default filename based on model specification and available metadata
|
370
|
+
if not vocab_only:
|
371
|
+
fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
|
372
|
+
else:
|
373
|
+
fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
|
374
|
+
|
375
|
+
# Use the default filename
|
376
|
+
self.fname_out = self.fname_out / f"{fname_default}.gguf"
|
377
|
+
else:
|
378
|
+
# Output path is a custom defined templated filename
|
379
|
+
# Note: `not is_dir()` is used because `.is_file()` will not detect
|
380
|
+
# file template strings as it doesn't actually exist as a file
|
381
|
+
|
382
|
+
# Process templated file name with the output ftype, useful with the "auto" ftype
|
383
|
+
self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
|
384
|
+
|
385
|
+
self.set_type()
|
386
|
+
|
387
|
+
logger.info("Set meta model")
|
388
|
+
self.metadata.set_gguf_meta_model(self.gguf_writer)
|
389
|
+
|
390
|
+
logger.info("Set model parameters")
|
391
|
+
self.set_gguf_parameters()
|
392
|
+
|
393
|
+
logger.info("Set model tokenizer")
|
394
|
+
self.set_vocab()
|
395
|
+
|
396
|
+
logger.info("Set model quantization version")
|
397
|
+
self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
|
398
|
+
|
328
399
|
def write(self):
|
329
|
-
self.
|
330
|
-
self.
|
400
|
+
self.prepare_tensors()
|
401
|
+
self.prepare_metadata(vocab_only=False)
|
402
|
+
self.gguf_writer.write_header_to_file(path=self.fname_out)
|
331
403
|
self.gguf_writer.write_kv_data_to_file()
|
332
404
|
self.gguf_writer.write_tensors_to_file(progress=True)
|
333
405
|
self.gguf_writer.close()
|
334
406
|
|
335
407
|
def write_vocab(self):
|
336
|
-
self.gguf_writer.
|
408
|
+
if len(self.gguf_writer.tensors) != 1:
|
409
|
+
raise ValueError('Splitting the vocabulary is not supported')
|
410
|
+
|
411
|
+
self.prepare_metadata(vocab_only=True)
|
412
|
+
self.gguf_writer.write_header_to_file(path=self.fname_out)
|
337
413
|
self.gguf_writer.write_kv_data_to_file()
|
338
414
|
self.gguf_writer.close()
|
339
415
|
|
340
416
|
@staticmethod
|
341
|
-
def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
|
417
|
+
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
|
342
418
|
part_names: list[str] = []
|
343
419
|
for filename in os.listdir(dir_model):
|
344
|
-
if filename.endswith(suffix):
|
420
|
+
if filename.startswith(prefix) and filename.endswith(suffix):
|
345
421
|
part_names.append(filename)
|
346
422
|
|
347
423
|
part_names.sort()
|
@@ -370,6 +446,29 @@ class Model:
|
|
370
446
|
except KeyError:
|
371
447
|
raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
|
372
448
|
|
449
|
+
def does_token_look_special(self, token: str | bytes) -> bool:
|
450
|
+
if isinstance(token, (bytes, bytearray)):
|
451
|
+
token_text = token.decode(encoding="utf-8")
|
452
|
+
elif isinstance(token, memoryview):
|
453
|
+
token_text = token.tobytes().decode(encoding="utf-8")
|
454
|
+
else:
|
455
|
+
token_text = token
|
456
|
+
|
457
|
+
# Some models mark some added tokens which ought to be control tokens as not special.
|
458
|
+
# (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
|
459
|
+
seems_special = token_text in (
|
460
|
+
"<pad>", # deepseek-coder
|
461
|
+
"<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
|
462
|
+
)
|
463
|
+
|
464
|
+
seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
|
465
|
+
seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) # deepseek-coder
|
466
|
+
|
467
|
+
# TODO: should these be marked as UNUSED instead? (maybe not)
|
468
|
+
seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2}
|
469
|
+
|
470
|
+
return seems_special
|
471
|
+
|
373
472
|
# used for GPT-2 BPE and WordPiece vocabs
|
374
473
|
def get_vocab_base(self) -> tuple[list[str], list[int], str]:
|
375
474
|
tokens: list[str] = []
|
@@ -388,20 +487,22 @@ class Model:
|
|
388
487
|
for i in range(vocab_size):
|
389
488
|
if i not in reverse_vocab:
|
390
489
|
tokens.append(f"[PAD{i}]")
|
391
|
-
toktypes.append(gguf.TokenType.
|
392
|
-
elif reverse_vocab[i] in added_vocab:
|
393
|
-
tokens.append(reverse_vocab[i])
|
394
|
-
if tokenizer.added_tokens_decoder[i].special:
|
395
|
-
toktypes.append(gguf.TokenType.CONTROL)
|
396
|
-
else:
|
397
|
-
toktypes.append(gguf.TokenType.USER_DEFINED)
|
490
|
+
toktypes.append(gguf.TokenType.UNUSED)
|
398
491
|
else:
|
399
|
-
|
400
|
-
|
492
|
+
token: str = reverse_vocab[i]
|
493
|
+
if token in added_vocab:
|
494
|
+
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
|
495
|
+
toktypes.append(gguf.TokenType.CONTROL)
|
496
|
+
else:
|
497
|
+
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
|
498
|
+
toktypes.append(gguf.TokenType.USER_DEFINED)
|
499
|
+
else:
|
500
|
+
toktypes.append(gguf.TokenType.NORMAL)
|
501
|
+
tokens.append(token)
|
401
502
|
|
402
503
|
return tokens, toktypes, tokpre
|
403
504
|
|
404
|
-
# NOTE: this function is generated by
|
505
|
+
# NOTE: this function is generated by convert_hf_to_gguf_update.py
|
405
506
|
# do not modify it manually!
|
406
507
|
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
407
508
|
# Marker: Start get_vocab_base_pre
|
@@ -421,7 +522,7 @@ class Model:
|
|
421
522
|
|
422
523
|
res = None
|
423
524
|
|
424
|
-
# NOTE: if you get an error here, you need to update the
|
525
|
+
# NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
|
425
526
|
# or pull the latest version of the model from Huggingface
|
426
527
|
# don't edit the hashes manually!
|
427
528
|
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
@@ -478,15 +579,39 @@ class Model:
|
|
478
579
|
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
479
580
|
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
480
581
|
res = "smaug-bpe"
|
582
|
+
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
|
583
|
+
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
|
584
|
+
res = "poro-chat"
|
585
|
+
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
|
586
|
+
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
|
587
|
+
res = "jina-v2-code"
|
588
|
+
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
|
589
|
+
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
|
590
|
+
res = "chatglm-bpe"
|
591
|
+
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
|
592
|
+
# ref: https://huggingface.co/LumiOpen/Viking-7B
|
593
|
+
res = "viking"
|
594
|
+
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
|
595
|
+
# ref: https://huggingface.co/core42/jais-13b
|
596
|
+
res = "jais"
|
597
|
+
if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
|
598
|
+
# ref: https://huggingface.co/WisdomShell/CodeShell-7B
|
599
|
+
res = "codeshell"
|
600
|
+
if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
|
601
|
+
# ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
|
602
|
+
res = "tekken"
|
603
|
+
if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
|
604
|
+
# ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
|
605
|
+
res = "smollm"
|
481
606
|
|
482
607
|
if res is None:
|
483
608
|
logger.warning("\n")
|
484
609
|
logger.warning("**************************************************************************************")
|
485
610
|
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
|
486
611
|
logger.warning("** There are 2 possible reasons for this:")
|
487
|
-
logger.warning("** - the model has not been added to
|
612
|
+
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
488
613
|
logger.warning("** - the pre-tokenization config has changed upstream")
|
489
|
-
logger.warning("** Check your model files and
|
614
|
+
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
490
615
|
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
491
616
|
logger.warning("**")
|
492
617
|
logger.warning(f"** chkhsh: {chkhsh}")
|
@@ -541,7 +666,7 @@ class Model:
|
|
541
666
|
for i in range(vocab_size):
|
542
667
|
if i not in reverse_vocab:
|
543
668
|
tokens.append(f"[PAD{i}]")
|
544
|
-
toktypes.append(gguf.TokenType.
|
669
|
+
toktypes.append(gguf.TokenType.UNUSED)
|
545
670
|
elif reverse_vocab[i] in added_vocab:
|
546
671
|
tokens.append(reverse_vocab[i])
|
547
672
|
toktypes.append(gguf.TokenType.CONTROL)
|
@@ -564,15 +689,23 @@ class Model:
|
|
564
689
|
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
|
565
690
|
special_vocab.add_to_gguf(self.gguf_writer)
|
566
691
|
|
567
|
-
def _set_vocab_sentencepiece(self):
|
692
|
+
def _set_vocab_sentencepiece(self, add_to_gguf=True):
|
693
|
+
tokens, scores, toktypes = self._create_vocab_sentencepiece()
|
694
|
+
|
695
|
+
self.gguf_writer.add_tokenizer_model("llama")
|
696
|
+
self.gguf_writer.add_tokenizer_pre("default")
|
697
|
+
self.gguf_writer.add_token_list(tokens)
|
698
|
+
self.gguf_writer.add_token_scores(scores)
|
699
|
+
self.gguf_writer.add_token_types(toktypes)
|
700
|
+
|
701
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
702
|
+
special_vocab.add_to_gguf(self.gguf_writer)
|
703
|
+
|
704
|
+
def _create_vocab_sentencepiece(self):
|
568
705
|
from sentencepiece import SentencePieceProcessor
|
569
706
|
|
570
707
|
tokenizer_path = self.dir_model / 'tokenizer.model'
|
571
708
|
|
572
|
-
tokens: list[bytes] = []
|
573
|
-
scores: list[float] = []
|
574
|
-
toktypes: list[int] = []
|
575
|
-
|
576
709
|
if not tokenizer_path.is_file():
|
577
710
|
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
578
711
|
|
@@ -583,7 +716,7 @@ class Model:
|
|
583
716
|
|
584
717
|
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
585
718
|
scores: list[float] = [-10000.0] * vocab_size
|
586
|
-
toktypes: list[int] = [SentencePieceTokenTypes.
|
719
|
+
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
587
720
|
|
588
721
|
for token_id in range(tokenizer.vocab_size()):
|
589
722
|
piece = tokenizer.IdToPiece(token_id)
|
@@ -610,7 +743,7 @@ class Model:
|
|
610
743
|
added_tokens_json = json.load(f)
|
611
744
|
for key in added_tokens_json:
|
612
745
|
token_id = added_tokens_json[key]
|
613
|
-
if
|
746
|
+
if token_id >= vocab_size:
|
614
747
|
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
615
748
|
continue
|
616
749
|
|
@@ -618,6 +751,26 @@ class Model:
|
|
618
751
|
scores[token_id] = -1000.0
|
619
752
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
620
753
|
|
754
|
+
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
755
|
+
if tokenizer_config_file.is_file():
|
756
|
+
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
757
|
+
tokenizer_config_json = json.load(f)
|
758
|
+
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
|
759
|
+
for token_id, token_data in added_tokens_decoder.items():
|
760
|
+
token_id = int(token_id)
|
761
|
+
token: str = token_data["content"]
|
762
|
+
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
763
|
+
if tokens[token_id] != token.encode("utf-8"):
|
764
|
+
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
|
765
|
+
if token_data.get("special") or self.does_token_look_special(token):
|
766
|
+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
767
|
+
else:
|
768
|
+
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
|
769
|
+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
770
|
+
|
771
|
+
scores[token_id] = -1000.0
|
772
|
+
tokens[token_id] = token.encode("utf-8")
|
773
|
+
|
621
774
|
if vocab_size > len(tokens):
|
622
775
|
pad_count = vocab_size - len(tokens)
|
623
776
|
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
@@ -626,17 +779,10 @@ class Model:
|
|
626
779
|
scores.append(-1000.0)
|
627
780
|
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
628
781
|
|
629
|
-
|
630
|
-
self.gguf_writer.add_tokenizer_pre("default")
|
631
|
-
self.gguf_writer.add_token_list(tokens)
|
632
|
-
self.gguf_writer.add_token_scores(scores)
|
633
|
-
self.gguf_writer.add_token_types(toktypes)
|
634
|
-
|
635
|
-
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
636
|
-
special_vocab.add_to_gguf(self.gguf_writer)
|
782
|
+
return tokens, scores, toktypes
|
637
783
|
|
638
784
|
def _set_vocab_llama_hf(self):
|
639
|
-
vocab = LlamaHfVocab(self.dir_model)
|
785
|
+
vocab = gguf.LlamaHfVocab(self.dir_model)
|
640
786
|
tokens = []
|
641
787
|
scores = []
|
642
788
|
toktypes = []
|
@@ -657,6 +803,51 @@ class Model:
|
|
657
803
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
658
804
|
special_vocab.add_to_gguf(self.gguf_writer)
|
659
805
|
|
806
|
+
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
|
807
|
+
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
|
808
|
+
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
809
|
+
vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
|
810
|
+
|
811
|
+
default_pre = "mpt" if model_name == "gpt-neox" else "default"
|
812
|
+
|
813
|
+
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
|
814
|
+
assert field # tokenizer model
|
815
|
+
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
|
816
|
+
|
817
|
+
field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
|
818
|
+
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
|
819
|
+
|
820
|
+
field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
|
821
|
+
assert field # token list
|
822
|
+
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
823
|
+
|
824
|
+
if model_name == "llama-spm":
|
825
|
+
field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
|
826
|
+
assert field # token scores
|
827
|
+
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
828
|
+
|
829
|
+
field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
|
830
|
+
assert field # token types
|
831
|
+
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
832
|
+
|
833
|
+
if model_name != "llama-spm":
|
834
|
+
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
835
|
+
assert field # token merges
|
836
|
+
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
837
|
+
|
838
|
+
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
|
839
|
+
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
|
840
|
+
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
|
841
|
+
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
|
842
|
+
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
|
843
|
+
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
|
844
|
+
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
|
845
|
+
self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
|
846
|
+
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
|
847
|
+
self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
|
848
|
+
if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
|
849
|
+
self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
|
850
|
+
|
660
851
|
|
661
852
|
@Model.register("GPTNeoXForCausalLM")
|
662
853
|
class GPTNeoXModel(Model):
|
@@ -665,7 +856,6 @@ class GPTNeoXModel(Model):
|
|
665
856
|
def set_gguf_parameters(self):
|
666
857
|
block_count = self.hparams["num_hidden_layers"]
|
667
858
|
|
668
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
669
859
|
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
670
860
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
671
861
|
self.gguf_writer.add_block_count(block_count)
|
@@ -721,7 +911,6 @@ class BloomModel(Model):
|
|
721
911
|
model_arch = gguf.MODEL_ARCH.BLOOM
|
722
912
|
|
723
913
|
def set_gguf_parameters(self):
|
724
|
-
self.gguf_writer.add_name("Bloom")
|
725
914
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
726
915
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
727
916
|
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
|
@@ -798,7 +987,6 @@ class MPTModel(Model):
|
|
798
987
|
|
799
988
|
def set_gguf_parameters(self):
|
800
989
|
block_count = self.hparams["n_layers"]
|
801
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
802
990
|
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
|
803
991
|
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
804
992
|
self.gguf_writer.add_block_count(block_count)
|
@@ -837,7 +1025,6 @@ class OrionModel(Model):
|
|
837
1025
|
block_count = self.hparams["num_hidden_layers"]
|
838
1026
|
head_count = self.hparams["num_attention_heads"]
|
839
1027
|
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
840
|
-
hf_repo = self.hparams.get("_name_or_path", "")
|
841
1028
|
|
842
1029
|
ctx_length = 0
|
843
1030
|
if "max_sequence_length" in self.hparams:
|
@@ -850,8 +1037,6 @@ class OrionModel(Model):
|
|
850
1037
|
raise ValueError("gguf: can not find ctx length parameter.")
|
851
1038
|
|
852
1039
|
self.gguf_writer.add_file_type(self.ftype)
|
853
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
854
|
-
self.gguf_writer.add_source_hf_repo(hf_repo)
|
855
1040
|
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
856
1041
|
self.gguf_writer.add_context_length(ctx_length)
|
857
1042
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
@@ -875,7 +1060,6 @@ class BaichuanModel(Model):
|
|
875
1060
|
block_count = self.hparams["num_hidden_layers"]
|
876
1061
|
head_count = self.hparams["num_attention_heads"]
|
877
1062
|
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
878
|
-
hf_repo = self.hparams.get("_name_or_path", "")
|
879
1063
|
|
880
1064
|
ctx_length = 0
|
881
1065
|
if "max_sequence_length" in self.hparams:
|
@@ -887,8 +1071,6 @@ class BaichuanModel(Model):
|
|
887
1071
|
else:
|
888
1072
|
raise ValueError("gguf: can not find ctx length parameter.")
|
889
1073
|
|
890
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
891
|
-
self.gguf_writer.add_source_hf_repo(hf_repo)
|
892
1074
|
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
893
1075
|
self.gguf_writer.add_context_length(ctx_length)
|
894
1076
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
@@ -962,7 +1144,11 @@ class XverseModel(Model):
|
|
962
1144
|
from transformers import AutoTokenizer
|
963
1145
|
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
964
1146
|
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
965
|
-
|
1147
|
+
# Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
|
1148
|
+
# because vocab_size is the count of items, and indexes start at 0.
|
1149
|
+
max_vocab_index = max(tokenizer.get_vocab().values())
|
1150
|
+
if max_vocab_index >= vocab_size:
|
1151
|
+
raise ValueError("Vocabulary size exceeds expected maximum size.")
|
966
1152
|
|
967
1153
|
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
968
1154
|
added_vocab = tokenizer.get_added_vocab()
|
@@ -998,7 +1184,6 @@ class XverseModel(Model):
|
|
998
1184
|
block_count = self.hparams["num_hidden_layers"]
|
999
1185
|
head_count = self.hparams["num_attention_heads"]
|
1000
1186
|
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
1001
|
-
hf_repo = self.hparams.get("_name_or_path", "")
|
1002
1187
|
|
1003
1188
|
ctx_length = 0
|
1004
1189
|
if "max_sequence_length" in self.hparams:
|
@@ -1010,8 +1195,6 @@ class XverseModel(Model):
|
|
1010
1195
|
else:
|
1011
1196
|
raise ValueError("gguf: can not find ctx length parameter.")
|
1012
1197
|
|
1013
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
1014
|
-
self.gguf_writer.add_source_hf_repo(hf_repo)
|
1015
1198
|
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
1016
1199
|
self.gguf_writer.add_context_length(ctx_length)
|
1017
1200
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
@@ -1070,7 +1253,6 @@ class FalconModel(Model):
|
|
1070
1253
|
if n_head_kv is None:
|
1071
1254
|
n_head_kv = self.hparams.get("n_head_kv", 1) # old name
|
1072
1255
|
|
1073
|
-
self.gguf_writer.add_name("Falcon")
|
1074
1256
|
self.gguf_writer.add_context_length(2048) # not in config.json
|
1075
1257
|
self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
1076
1258
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
@@ -1115,7 +1297,6 @@ class StarCoderModel(Model):
|
|
1115
1297
|
def set_gguf_parameters(self):
|
1116
1298
|
block_count = self.hparams["n_layer"]
|
1117
1299
|
|
1118
|
-
self.gguf_writer.add_name("StarCoder")
|
1119
1300
|
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
1120
1301
|
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
1121
1302
|
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
@@ -1135,11 +1316,11 @@ class RefactModel(Model):
|
|
1135
1316
|
|
1136
1317
|
# TODO: how to determine special FIM tokens automatically?
|
1137
1318
|
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
|
1138
|
-
special_token_types = ['prefix', 'suffix', 'middle', '
|
1319
|
+
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
|
1139
1320
|
special_vocab._set_special_token("prefix", 1)
|
1140
1321
|
special_vocab._set_special_token("suffix", 3)
|
1141
1322
|
special_vocab._set_special_token("middle", 2)
|
1142
|
-
special_vocab.
|
1323
|
+
special_vocab.chat_template = None # do not add it twice
|
1143
1324
|
special_vocab.add_to_gguf(self.gguf_writer)
|
1144
1325
|
|
1145
1326
|
def set_gguf_parameters(self):
|
@@ -1151,7 +1332,6 @@ class RefactModel(Model):
|
|
1151
1332
|
|
1152
1333
|
block_count = self.hparams["n_layer"]
|
1153
1334
|
|
1154
|
-
self.gguf_writer.add_name("Refact")
|
1155
1335
|
# refact uses Alibi. So this is from config.json which might be used by training.
|
1156
1336
|
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
1157
1337
|
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
@@ -1199,14 +1379,13 @@ class StableLMModel(Model):
|
|
1199
1379
|
if (self.dir_model / "tokenizer.json").is_file():
|
1200
1380
|
self._set_vocab_gpt2()
|
1201
1381
|
else:
|
1202
|
-
# StableLM 2 1.6B
|
1382
|
+
# StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
|
1203
1383
|
self._set_vocab_qwen()
|
1204
1384
|
|
1205
1385
|
def set_gguf_parameters(self):
|
1206
1386
|
hparams = self.hparams
|
1207
1387
|
block_count = hparams["num_hidden_layers"]
|
1208
1388
|
|
1209
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
1210
1389
|
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
1211
1390
|
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
1212
1391
|
self.gguf_writer.add_block_count(block_count)
|
@@ -1268,8 +1447,8 @@ class StableLMModel(Model):
|
|
1268
1447
|
|
1269
1448
|
return [(new_name, data_torch)]
|
1270
1449
|
|
1271
|
-
def
|
1272
|
-
super().
|
1450
|
+
def prepare_tensors(self):
|
1451
|
+
super().prepare_tensors()
|
1273
1452
|
|
1274
1453
|
if self._q_norms is not None or self._k_norms is not None:
|
1275
1454
|
# flatten two `list[dict[str, Tensor]]` into a single `list[str]`
|
@@ -1281,85 +1460,6 @@ class StableLMModel(Model):
|
|
1281
1460
|
if len(norms) > 0:
|
1282
1461
|
raise ValueError(f"Unprocessed norms: {norms}")
|
1283
1462
|
|
1284
|
-
def write_tensors(self):
|
1285
|
-
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
1286
|
-
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
1287
|
-
n_head = self.hparams.get("num_attention_heads")
|
1288
|
-
n_kv_head = self.hparams.get("num_key_value_heads")
|
1289
|
-
q_norms = dict()
|
1290
|
-
k_norms = dict()
|
1291
|
-
for name, data_torch in self.get_tensors():
|
1292
|
-
# we don't need these
|
1293
|
-
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
|
1294
|
-
continue
|
1295
|
-
|
1296
|
-
old_dtype = data_torch.dtype
|
1297
|
-
|
1298
|
-
# convert any unsupported data types to float32
|
1299
|
-
if data_torch.dtype not in (torch.float16, torch.float32):
|
1300
|
-
data_torch = data_torch.to(torch.float32)
|
1301
|
-
|
1302
|
-
data = data_torch.squeeze().numpy()
|
1303
|
-
n_dims = len(data.shape)
|
1304
|
-
if name.find("q_layernorm.norms") != -1:
|
1305
|
-
q_norms[name] = data
|
1306
|
-
if len(q_norms) >= (block_count * n_head):
|
1307
|
-
self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
|
1308
|
-
continue
|
1309
|
-
if name.find("k_layernorm.norms") != -1:
|
1310
|
-
k_norms[name] = data
|
1311
|
-
if len(k_norms) >= (block_count * n_kv_head):
|
1312
|
-
self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
|
1313
|
-
continue
|
1314
|
-
|
1315
|
-
# map tensor names
|
1316
|
-
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
1317
|
-
if new_name is None:
|
1318
|
-
raise ValueError(f"Can not map tensor {name!r}")
|
1319
|
-
|
1320
|
-
n_dims = len(data.shape)
|
1321
|
-
data_dtype = data.dtype
|
1322
|
-
|
1323
|
-
# if f32 desired, convert any float16 to float32
|
1324
|
-
if self.ftype == 0 and data_dtype == np.float16:
|
1325
|
-
data = data.astype(np.float32)
|
1326
|
-
|
1327
|
-
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
1328
|
-
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
|
1329
|
-
data = data.astype(np.float32)
|
1330
|
-
|
1331
|
-
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
1332
|
-
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
|
1333
|
-
data = data.astype(np.float16)
|
1334
|
-
|
1335
|
-
logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
1336
|
-
|
1337
|
-
self.gguf_writer.add_tensor(new_name, data)
|
1338
|
-
|
1339
|
-
def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
|
1340
|
-
for bid in range(block_count):
|
1341
|
-
datas = []
|
1342
|
-
for xid in range(n_head):
|
1343
|
-
ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
|
1344
|
-
datas.append(norms[ename])
|
1345
|
-
del norms[ename]
|
1346
|
-
data = np.stack(datas, axis=0)
|
1347
|
-
data_dtype = data.dtype
|
1348
|
-
merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
|
1349
|
-
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
|
1350
|
-
if new_name is None:
|
1351
|
-
raise ValueError(f"Can not map tensor {name!r}")
|
1352
|
-
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
|
1353
|
-
data = data.astype(np.float32)
|
1354
|
-
|
1355
|
-
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
1356
|
-
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
|
1357
|
-
data = data.astype(np.float16)
|
1358
|
-
|
1359
|
-
logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
|
1360
|
-
|
1361
|
-
self.gguf_writer.add_tensor(new_name, data)
|
1362
|
-
|
1363
1463
|
|
1364
1464
|
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
1365
1465
|
class LlamaModel(Model):
|
@@ -1367,7 +1467,7 @@ class LlamaModel(Model):
|
|
1367
1467
|
|
1368
1468
|
def set_vocab(self):
|
1369
1469
|
try:
|
1370
|
-
self.
|
1470
|
+
self._set_vocab_sentencepiece()
|
1371
1471
|
except FileNotFoundError:
|
1372
1472
|
try:
|
1373
1473
|
self._set_vocab_llama_hf()
|
@@ -1391,13 +1491,29 @@ class LlamaModel(Model):
|
|
1391
1491
|
super().set_gguf_parameters()
|
1392
1492
|
hparams = self.hparams
|
1393
1493
|
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
1394
|
-
|
1494
|
+
|
1495
|
+
if "head_dim" in hparams:
|
1496
|
+
rope_dim = hparams["head_dim"]
|
1497
|
+
else:
|
1498
|
+
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
1499
|
+
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
1395
1500
|
|
1396
1501
|
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
1397
1502
|
if self.hparams["rope_scaling"].get("type") == "linear":
|
1398
1503
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
1399
1504
|
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
1400
1505
|
|
1506
|
+
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
1507
|
+
if tokenizer_config_file.is_file():
|
1508
|
+
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
1509
|
+
tokenizer_config_json = json.load(f)
|
1510
|
+
if "add_prefix_space" in tokenizer_config_json:
|
1511
|
+
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
1512
|
+
|
1513
|
+
# Apply to granite small models only
|
1514
|
+
if self.hparams.get("vocab_size", 32000) == 49152:
|
1515
|
+
self.gguf_writer.add_add_bos_token(False)
|
1516
|
+
|
1401
1517
|
@staticmethod
|
1402
1518
|
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
1403
1519
|
if n_head_kv is not None and n_head != n_head_kv:
|
@@ -1412,9 +1528,9 @@ class LlamaModel(Model):
|
|
1412
1528
|
n_head = self.hparams["num_attention_heads"]
|
1413
1529
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
1414
1530
|
|
1415
|
-
if name.endswith("q_proj.weight"):
|
1531
|
+
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
1416
1532
|
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
1417
|
-
if name.endswith("k_proj.weight"):
|
1533
|
+
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
1418
1534
|
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
1419
1535
|
|
1420
1536
|
# process the experts separately
|
@@ -1453,8 +1569,8 @@ class LlamaModel(Model):
|
|
1453
1569
|
|
1454
1570
|
return [(self.map_tensor_name(name), data_torch)]
|
1455
1571
|
|
1456
|
-
def
|
1457
|
-
super().
|
1572
|
+
def prepare_tensors(self):
|
1573
|
+
super().prepare_tensors()
|
1458
1574
|
|
1459
1575
|
if self._experts is not None:
|
1460
1576
|
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
@@ -1463,6 +1579,48 @@ class LlamaModel(Model):
|
|
1463
1579
|
raise ValueError(f"Unprocessed experts: {experts}")
|
1464
1580
|
|
1465
1581
|
|
1582
|
+
@Model.register("BitnetForCausalLM")
|
1583
|
+
class BitnetModel(Model):
|
1584
|
+
model_arch = gguf.MODEL_ARCH.BITNET
|
1585
|
+
|
1586
|
+
def set_vocab(self):
|
1587
|
+
self._set_vocab_sentencepiece()
|
1588
|
+
|
1589
|
+
def set_gguf_parameters(self):
|
1590
|
+
super().set_gguf_parameters()
|
1591
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
1592
|
+
self.gguf_writer.add_rope_scaling_factor(1.0)
|
1593
|
+
|
1594
|
+
def weight_quant(self, weight):
|
1595
|
+
dtype = weight.dtype
|
1596
|
+
weight = weight.float()
|
1597
|
+
s = 1 / weight.abs().mean().clamp(min=1e-5)
|
1598
|
+
weight = (weight * s).round().clamp(-1, 1) / s
|
1599
|
+
scale = weight.abs().max().unsqueeze(0)
|
1600
|
+
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
|
1601
|
+
weight = torch.sign(weight).type(dtype)
|
1602
|
+
return weight.type(dtype), scale.type(torch.float32)
|
1603
|
+
|
1604
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
1605
|
+
new_name = self.map_tensor_name(name)
|
1606
|
+
|
1607
|
+
if any(self.match_model_tensor_name(new_name, key, bid) for key in [
|
1608
|
+
gguf.MODEL_TENSOR.ATTN_Q,
|
1609
|
+
gguf.MODEL_TENSOR.ATTN_K,
|
1610
|
+
gguf.MODEL_TENSOR.ATTN_V,
|
1611
|
+
gguf.MODEL_TENSOR.ATTN_OUT,
|
1612
|
+
gguf.MODEL_TENSOR.FFN_UP,
|
1613
|
+
gguf.MODEL_TENSOR.FFN_DOWN,
|
1614
|
+
gguf.MODEL_TENSOR.FFN_GATE,
|
1615
|
+
]):
|
1616
|
+
# transform weight into 1/0/-1 (in fp32)
|
1617
|
+
weight_torch, scale_torch = self.weight_quant(data_torch)
|
1618
|
+
yield (new_name, weight_torch)
|
1619
|
+
yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
|
1620
|
+
else:
|
1621
|
+
yield (new_name, data_torch)
|
1622
|
+
|
1623
|
+
|
1466
1624
|
@Model.register("GrokForCausalLM")
|
1467
1625
|
class GrokModel(Model):
|
1468
1626
|
model_arch = gguf.MODEL_ARCH.GROK
|
@@ -1475,7 +1633,6 @@ class GrokModel(Model):
|
|
1475
1633
|
|
1476
1634
|
def set_gguf_parameters(self):
|
1477
1635
|
super().set_gguf_parameters()
|
1478
|
-
self.gguf_writer.add_name("Grok")
|
1479
1636
|
|
1480
1637
|
_experts: list[dict[str, Tensor]] | None = None
|
1481
1638
|
|
@@ -1524,7 +1681,6 @@ class DbrxModel(Model):
|
|
1524
1681
|
def set_gguf_parameters(self):
|
1525
1682
|
ffn_config = self.hparams["ffn_config"]
|
1526
1683
|
attn_config = self.hparams["attn_config"]
|
1527
|
-
self.gguf_writer.add_name(self.hparams["model_type"])
|
1528
1684
|
self.gguf_writer.add_block_count(self.hparams["n_layers"])
|
1529
1685
|
|
1530
1686
|
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
|
@@ -1537,7 +1693,6 @@ class DbrxModel(Model):
|
|
1537
1693
|
self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
|
1538
1694
|
|
1539
1695
|
self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
|
1540
|
-
self.gguf_writer.add_file_type(self.ftype)
|
1541
1696
|
|
1542
1697
|
self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
|
1543
1698
|
self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
|
@@ -1594,7 +1749,6 @@ class MiniCPMModel(Model):
|
|
1594
1749
|
|
1595
1750
|
def set_gguf_parameters(self):
|
1596
1751
|
block_count = self.hparams["num_hidden_layers"]
|
1597
|
-
self.gguf_writer.add_name("MiniCPM")
|
1598
1752
|
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
1599
1753
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
1600
1754
|
self.gguf_writer.add_block_count(block_count)
|
@@ -1612,9 +1766,11 @@ class MiniCPMModel(Model):
|
|
1612
1766
|
if n_kv_head is not None and n_head != n_kv_head:
|
1613
1767
|
n_head = n_kv_head
|
1614
1768
|
|
1615
|
-
return (
|
1616
|
-
|
1617
|
-
|
1769
|
+
return (
|
1770
|
+
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
1771
|
+
.swapaxes(1, 2)
|
1772
|
+
.reshape(weights.shape)
|
1773
|
+
)
|
1618
1774
|
|
1619
1775
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
1620
1776
|
del bid # unused
|
@@ -1662,7 +1818,6 @@ class QwenModel(Model):
|
|
1662
1818
|
self._set_vocab_qwen()
|
1663
1819
|
|
1664
1820
|
def set_gguf_parameters(self):
|
1665
|
-
self.gguf_writer.add_name("Qwen")
|
1666
1821
|
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
1667
1822
|
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
|
1668
1823
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
@@ -1693,6 +1848,12 @@ class Qwen2MoeModel(Model):
|
|
1693
1848
|
super().set_gguf_parameters()
|
1694
1849
|
if (n_experts := self.hparams.get("num_experts")) is not None:
|
1695
1850
|
self.gguf_writer.add_expert_count(n_experts)
|
1851
|
+
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
|
1852
|
+
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
|
1853
|
+
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
|
1854
|
+
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
|
1855
|
+
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
|
1856
|
+
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
|
1696
1857
|
|
1697
1858
|
_experts: list[dict[str, Tensor]] | None = None
|
1698
1859
|
|
@@ -1732,8 +1893,8 @@ class Qwen2MoeModel(Model):
|
|
1732
1893
|
|
1733
1894
|
return [(self.map_tensor_name(name), data_torch)]
|
1734
1895
|
|
1735
|
-
def
|
1736
|
-
super().
|
1896
|
+
def prepare_tensors(self):
|
1897
|
+
super().prepare_tensors()
|
1737
1898
|
|
1738
1899
|
if self._experts is not None:
|
1739
1900
|
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
@@ -1747,7 +1908,6 @@ class GPT2Model(Model):
|
|
1747
1908
|
model_arch = gguf.MODEL_ARCH.GPT2
|
1748
1909
|
|
1749
1910
|
def set_gguf_parameters(self):
|
1750
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
1751
1911
|
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
1752
1912
|
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
|
1753
1913
|
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
@@ -1790,7 +1950,6 @@ class Phi2Model(Model):
|
|
1790
1950
|
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
1791
1951
|
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
1792
1952
|
|
1793
|
-
self.gguf_writer.add_name("Phi2")
|
1794
1953
|
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
|
1795
1954
|
|
1796
1955
|
self.gguf_writer.add_embedding_length(n_embd)
|
@@ -1823,7 +1982,7 @@ class Phi3MiniModel(Model):
|
|
1823
1982
|
|
1824
1983
|
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
1825
1984
|
scores: list[float] = [-10000.0] * vocab_size
|
1826
|
-
toktypes: list[int] = [SentencePieceTokenTypes.
|
1985
|
+
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
1827
1986
|
|
1828
1987
|
for token_id in range(tokenizer.vocab_size()):
|
1829
1988
|
|
@@ -1852,7 +2011,7 @@ class Phi3MiniModel(Model):
|
|
1852
2011
|
|
1853
2012
|
for key in added_tokens_json:
|
1854
2013
|
token_id = added_tokens_json[key]
|
1855
|
-
if
|
2014
|
+
if token_id >= vocab_size:
|
1856
2015
|
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
1857
2016
|
continue
|
1858
2017
|
|
@@ -1868,8 +2027,9 @@ class Phi3MiniModel(Model):
|
|
1868
2027
|
for token_id, foken_data in added_tokens_decoder.items():
|
1869
2028
|
token_id = int(token_id)
|
1870
2029
|
token = foken_data["content"].encode("utf-8")
|
1871
|
-
if toktypes[token_id] != SentencePieceTokenTypes.
|
1872
|
-
|
2030
|
+
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
2031
|
+
if tokens[token_id] != token:
|
2032
|
+
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
|
1873
2033
|
tokens[token_id] = token
|
1874
2034
|
scores[token_id] = -1000.0
|
1875
2035
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
@@ -1884,8 +2044,9 @@ class Phi3MiniModel(Model):
|
|
1884
2044
|
for foken_data in added_tokens:
|
1885
2045
|
token_id = int(foken_data["id"])
|
1886
2046
|
token = foken_data["content"].encode("utf-8")
|
1887
|
-
if toktypes[token_id] != SentencePieceTokenTypes.
|
1888
|
-
|
2047
|
+
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
2048
|
+
if tokens[token_id] != token:
|
2049
|
+
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
|
1889
2050
|
tokens[token_id] = token
|
1890
2051
|
scores[token_id] = -1000.0
|
1891
2052
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
@@ -1912,7 +2073,6 @@ class Phi3MiniModel(Model):
|
|
1912
2073
|
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
1913
2074
|
rope_dims = n_embd // n_head
|
1914
2075
|
|
1915
|
-
self.gguf_writer.add_name("Phi3")
|
1916
2076
|
self.gguf_writer.add_context_length(max_pos_embds)
|
1917
2077
|
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
|
1918
2078
|
self.gguf_writer.add_embedding_length(n_embd)
|
@@ -1924,10 +2084,11 @@ class Phi3MiniModel(Model):
|
|
1924
2084
|
self.gguf_writer.add_rope_dimension_count(rope_dims)
|
1925
2085
|
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
|
1926
2086
|
self.gguf_writer.add_file_type(self.ftype)
|
2087
|
+
self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
|
1927
2088
|
|
1928
2089
|
# write rope scaling for long context (128k) model
|
1929
2090
|
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
1930
|
-
if
|
2091
|
+
if rope_scaling is None:
|
1931
2092
|
return
|
1932
2093
|
|
1933
2094
|
scale = max_pos_embds / orig_max_pos_embds
|
@@ -1936,7 +2097,7 @@ class Phi3MiniModel(Model):
|
|
1936
2097
|
if len(rope_scaling_type) == 0:
|
1937
2098
|
raise KeyError('Missing the required key rope_scaling.type')
|
1938
2099
|
|
1939
|
-
if rope_scaling_type == 'su':
|
2100
|
+
if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
|
1940
2101
|
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
|
1941
2102
|
elif rope_scaling_type == 'yarn':
|
1942
2103
|
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
|
@@ -1969,7 +2130,6 @@ class PlamoModel(Model):
|
|
1969
2130
|
hparams = self.hparams
|
1970
2131
|
block_count = hparams["num_hidden_layers"]
|
1971
2132
|
|
1972
|
-
self.gguf_writer.add_name("PLaMo")
|
1973
2133
|
self.gguf_writer.add_context_length(4096) # not in config.json
|
1974
2134
|
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
1975
2135
|
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
@@ -2014,7 +2174,6 @@ class CodeShellModel(Model):
|
|
2014
2174
|
def set_gguf_parameters(self):
|
2015
2175
|
block_count = self.hparams["n_layer"]
|
2016
2176
|
|
2017
|
-
self.gguf_writer.add_name("CodeShell")
|
2018
2177
|
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
2019
2178
|
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
2020
2179
|
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
@@ -2066,7 +2225,7 @@ class InternLM2Model(Model):
|
|
2066
2225
|
logger.error(f'Error: Missing {tokenizer_path}')
|
2067
2226
|
sys.exit(1)
|
2068
2227
|
|
2069
|
-
sentencepiece_model = model.ModelProto()
|
2228
|
+
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
2070
2229
|
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
2071
2230
|
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
2072
2231
|
|
@@ -2094,6 +2253,9 @@ class InternLM2Model(Model):
|
|
2094
2253
|
toktype = SentencePieceTokenTypes.UNUSED
|
2095
2254
|
elif tokenizer.IsByte(token_id):
|
2096
2255
|
toktype = SentencePieceTokenTypes.BYTE
|
2256
|
+
# take care of ununsed raw token
|
2257
|
+
if piece.startswith('[UNUSED'):
|
2258
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
2097
2259
|
|
2098
2260
|
tokens.append(text)
|
2099
2261
|
scores.append(score)
|
@@ -2109,6 +2271,49 @@ class InternLM2Model(Model):
|
|
2109
2271
|
scores.append(-1000.0)
|
2110
2272
|
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
2111
2273
|
|
2274
|
+
chat_eos_token = '<|im_end|>'
|
2275
|
+
chat_eos_token_id = None
|
2276
|
+
|
2277
|
+
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
2278
|
+
if tokenizer_config_file.is_file():
|
2279
|
+
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
2280
|
+
tokenizer_config_json = json.load(f)
|
2281
|
+
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
|
2282
|
+
for token_id, foken_data in added_tokens_decoder.items():
|
2283
|
+
token_id = int(token_id)
|
2284
|
+
token = foken_data["content"]
|
2285
|
+
if token == chat_eos_token:
|
2286
|
+
chat_eos_token_id = token_id
|
2287
|
+
token = token.encode("utf-8")
|
2288
|
+
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
2289
|
+
if tokens[token_id] != token:
|
2290
|
+
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
|
2291
|
+
tokens[token_id] = token
|
2292
|
+
scores[token_id] = -1000.0
|
2293
|
+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
2294
|
+
if foken_data.get("special"):
|
2295
|
+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
2296
|
+
|
2297
|
+
tokenizer_file = self.dir_model / 'tokenizer.json'
|
2298
|
+
if tokenizer_file.is_file():
|
2299
|
+
with open(tokenizer_file, "r", encoding="utf-8") as f:
|
2300
|
+
tokenizer_json = json.load(f)
|
2301
|
+
added_tokens = tokenizer_json.get("added_tokens", [])
|
2302
|
+
for foken_data in added_tokens:
|
2303
|
+
token_id = int(foken_data["id"])
|
2304
|
+
token = foken_data["content"]
|
2305
|
+
if token == chat_eos_token:
|
2306
|
+
chat_eos_token_id = token_id
|
2307
|
+
token = token.encode("utf-8")
|
2308
|
+
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
2309
|
+
if tokens[token_id] != token:
|
2310
|
+
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
|
2311
|
+
tokens[token_id] = token
|
2312
|
+
scores[token_id] = -1000.0
|
2313
|
+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
2314
|
+
if foken_data.get("special"):
|
2315
|
+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
2316
|
+
|
2112
2317
|
self.gguf_writer.add_tokenizer_model("llama")
|
2113
2318
|
self.gguf_writer.add_tokenizer_pre("default")
|
2114
2319
|
self.gguf_writer.add_token_list(tokens)
|
@@ -2118,37 +2323,17 @@ class InternLM2Model(Model):
|
|
2118
2323
|
|
2119
2324
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
2120
2325
|
old_eos = special_vocab.special_token_ids["eos"]
|
2121
|
-
if
|
2326
|
+
if chat_eos_token_id is not None:
|
2122
2327
|
# For the chat model, we replace the eos with '<|im_end|>'.
|
2123
2328
|
# TODO: this is a hack, should be fixed
|
2124
2329
|
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
2125
|
-
special_vocab.special_token_ids["eos"] =
|
2126
|
-
logger.warning(f"Replace eos:{old_eos} with a special token:{
|
2127
|
-
in chat mode so that the conversation can end normally.")
|
2330
|
+
special_vocab.special_token_ids["eos"] = chat_eos_token_id
|
2331
|
+
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
|
2332
|
+
" in chat mode so that the conversation can end normally.")
|
2128
2333
|
|
2129
2334
|
special_vocab.add_to_gguf(self.gguf_writer)
|
2130
2335
|
|
2131
|
-
def _try_get_sft_eos(self, tokenizer):
|
2132
|
-
unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
|
2133
|
-
im_end_list = tokenizer.Encode('<|im_end|>')
|
2134
|
-
eos_token = None
|
2135
|
-
assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
|
2136
|
-
if len(unused_145_list) == 1:
|
2137
|
-
eos_token = unused_145_list[0]
|
2138
|
-
if len(im_end_list) == 1:
|
2139
|
-
eos_token = im_end_list[0]
|
2140
|
-
assert eos_token
|
2141
|
-
return eos_token
|
2142
|
-
|
2143
|
-
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
|
2144
|
-
if n_head_kv is not None and n_head != n_head_kv:
|
2145
|
-
n_head = n_head_kv
|
2146
|
-
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
2147
|
-
.swapaxes(1, 2)
|
2148
|
-
.reshape(weights.shape))
|
2149
|
-
|
2150
2336
|
def set_gguf_parameters(self):
|
2151
|
-
self.gguf_writer.add_name("InternLM2")
|
2152
2337
|
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
2153
2338
|
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
|
2154
2339
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
@@ -2158,30 +2343,30 @@ in chat mode so that the conversation can end normally.")
|
|
2158
2343
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
2159
2344
|
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
2160
2345
|
self.gguf_writer.add_file_type(self.ftype)
|
2346
|
+
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
2347
|
+
if self.hparams["rope_scaling"].get("type") == "linear":
|
2348
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
2349
|
+
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
2161
2350
|
|
2162
2351
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2163
2352
|
num_heads = self.hparams["num_attention_heads"]
|
2164
2353
|
num_kv_heads = self.hparams["num_key_value_heads"]
|
2165
|
-
|
2354
|
+
n_embd = self.hparams["hidden_size"]
|
2166
2355
|
q_per_kv = num_heads // num_kv_heads
|
2167
|
-
head_dim =
|
2356
|
+
head_dim = n_embd // num_heads
|
2168
2357
|
num_groups = num_heads // q_per_kv
|
2169
2358
|
|
2170
|
-
|
2171
|
-
|
2172
|
-
if re.match(qkv_pattern, name):
|
2173
|
-
bid = re.findall(qkv_pattern, name)[0]
|
2359
|
+
if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
|
2174
2360
|
qkv = data_torch
|
2175
|
-
|
2176
|
-
qkv = qkv.
|
2177
|
-
q, k, v = qkv[
|
2361
|
+
|
2362
|
+
qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
|
2363
|
+
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
|
2364
|
+
|
2178
2365
|
# The model weights of q and k equire additional reshape.
|
2179
|
-
|
2180
|
-
|
2181
|
-
|
2182
|
-
|
2183
|
-
# v = rearrange(v, " o g n i -> o (g n i)").T
|
2184
|
-
v = v.reshape((v.shape[0], -1)).T
|
2366
|
+
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
|
2367
|
+
k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
|
2368
|
+
v = v.reshape((-1, v.shape[-1]))
|
2369
|
+
|
2185
2370
|
return [
|
2186
2371
|
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
|
2187
2372
|
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
|
@@ -2308,13 +2493,55 @@ class GemmaModel(Model):
|
|
2308
2493
|
special_vocab._set_special_token("middle", 68)
|
2309
2494
|
special_vocab._set_special_token("fsep", 70)
|
2310
2495
|
special_vocab._set_special_token("eot", 107)
|
2496
|
+
special_vocab.chat_template = None # do not add it twice
|
2311
2497
|
special_vocab.add_to_gguf(self.gguf_writer)
|
2312
2498
|
|
2499
|
+
self.gguf_writer.add_add_space_prefix(False)
|
2500
|
+
|
2501
|
+
def set_gguf_parameters(self):
|
2502
|
+
hparams = self.hparams
|
2503
|
+
block_count = hparams["num_hidden_layers"]
|
2504
|
+
|
2505
|
+
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
2506
|
+
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
2507
|
+
self.gguf_writer.add_block_count(block_count)
|
2508
|
+
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
2509
|
+
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
2510
|
+
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
|
2511
|
+
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
2512
|
+
self.gguf_writer.add_key_length(hparams["head_dim"])
|
2513
|
+
self.gguf_writer.add_value_length(hparams["head_dim"])
|
2514
|
+
self.gguf_writer.add_file_type(self.ftype)
|
2515
|
+
|
2516
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2517
|
+
del bid # unused
|
2518
|
+
|
2519
|
+
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
2520
|
+
# To prevent errors, skip loading lm_head.weight.
|
2521
|
+
if name == "lm_head.weight":
|
2522
|
+
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
2523
|
+
return []
|
2524
|
+
|
2525
|
+
# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
|
2526
|
+
if name.endswith("norm.weight"):
|
2527
|
+
data_torch = data_torch + 1
|
2528
|
+
|
2529
|
+
return [(self.map_tensor_name(name), data_torch)]
|
2530
|
+
|
2531
|
+
|
2532
|
+
@Model.register("Gemma2ForCausalLM")
|
2533
|
+
class Gemma2Model(Model):
|
2534
|
+
model_arch = gguf.MODEL_ARCH.GEMMA2
|
2535
|
+
|
2536
|
+
def set_vocab(self):
|
2537
|
+
self._set_vocab_sentencepiece()
|
2538
|
+
|
2539
|
+
self.gguf_writer.add_add_space_prefix(False)
|
2540
|
+
|
2313
2541
|
def set_gguf_parameters(self):
|
2314
2542
|
hparams = self.hparams
|
2315
2543
|
block_count = hparams["num_hidden_layers"]
|
2316
2544
|
|
2317
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
2318
2545
|
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
2319
2546
|
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
2320
2547
|
self.gguf_writer.add_block_count(block_count)
|
@@ -2325,6 +2552,13 @@ class GemmaModel(Model):
|
|
2325
2552
|
self.gguf_writer.add_key_length(hparams["head_dim"])
|
2326
2553
|
self.gguf_writer.add_value_length(hparams["head_dim"])
|
2327
2554
|
self.gguf_writer.add_file_type(self.ftype)
|
2555
|
+
self.gguf_writer.add_attn_logit_softcapping(
|
2556
|
+
self.hparams["attn_logit_softcapping"]
|
2557
|
+
)
|
2558
|
+
self.gguf_writer.add_final_logit_softcapping(
|
2559
|
+
self.hparams["final_logit_softcapping"]
|
2560
|
+
)
|
2561
|
+
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
2328
2562
|
|
2329
2563
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2330
2564
|
del bid # unused
|
@@ -2366,39 +2600,7 @@ class MambaModel(Model):
|
|
2366
2600
|
self._set_vocab_sentencepiece()
|
2367
2601
|
else:
|
2368
2602
|
# Use the GPT-NeoX tokenizer when no tokenizer files are present
|
2369
|
-
|
2370
|
-
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
2371
|
-
neox_reader = gguf.GGUFReader(tokenizer_path, "r")
|
2372
|
-
|
2373
|
-
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
|
2374
|
-
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
|
2375
|
-
|
2376
|
-
field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
|
2377
|
-
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
|
2378
|
-
|
2379
|
-
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
|
2380
|
-
assert field
|
2381
|
-
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
2382
|
-
|
2383
|
-
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
|
2384
|
-
assert field
|
2385
|
-
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
2386
|
-
|
2387
|
-
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
2388
|
-
assert field
|
2389
|
-
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
2390
|
-
|
2391
|
-
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
|
2392
|
-
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
|
2393
|
-
|
2394
|
-
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
|
2395
|
-
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
|
2396
|
-
|
2397
|
-
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
|
2398
|
-
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
|
2399
|
-
|
2400
|
-
field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
|
2401
|
-
self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
|
2603
|
+
self._set_vocab_builtin("gpt-neox", vocab_size)
|
2402
2604
|
|
2403
2605
|
def set_gguf_parameters(self):
|
2404
2606
|
d_model = self.find_hparam(["hidden_size", "d_model"])
|
@@ -2414,7 +2616,6 @@ class MambaModel(Model):
|
|
2414
2616
|
# Fail early for models which don't have a block expansion factor of 2
|
2415
2617
|
assert d_inner == 2 * d_model
|
2416
2618
|
|
2417
|
-
self.gguf_writer.add_name(self.dir_model.name)
|
2418
2619
|
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
|
2419
2620
|
self.gguf_writer.add_embedding_length(d_model)
|
2420
2621
|
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
@@ -2521,18 +2722,20 @@ class JinaBertV2Model(BertModel):
|
|
2521
2722
|
|
2522
2723
|
def get_tensors(self):
|
2523
2724
|
for name, data in super().get_tensors():
|
2524
|
-
if '
|
2725
|
+
if 'gated_layer' in name:
|
2525
2726
|
d1 = data[:self.intermediate_size, :]
|
2526
2727
|
name1 = name.replace('gated_layers', 'gated_layers_w')
|
2728
|
+
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
|
2527
2729
|
d2 = data[self.intermediate_size:, :]
|
2528
2730
|
name2 = name.replace('gated_layers', 'gated_layers_v')
|
2731
|
+
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
|
2529
2732
|
yield name1, d1
|
2530
2733
|
yield name2, d2
|
2531
2734
|
continue
|
2532
2735
|
|
2533
2736
|
yield name, data
|
2534
2737
|
|
2535
|
-
def set_vocab(self
|
2738
|
+
def set_vocab(self):
|
2536
2739
|
tokenizer_class = 'BertTokenizer'
|
2537
2740
|
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
2538
2741
|
tokenizer_class = json.load(f)['tokenizer_class']
|
@@ -2548,17 +2751,92 @@ class JinaBertV2Model(BertModel):
|
|
2548
2751
|
self.gguf_writer.add_add_eos_token(True)
|
2549
2752
|
|
2550
2753
|
|
2551
|
-
@Model.register("
|
2552
|
-
class
|
2553
|
-
model_arch = gguf.MODEL_ARCH.
|
2554
|
-
|
2555
|
-
def set_vocab(self):
|
2556
|
-
# The reason for using a custom implementation here is that the
|
2557
|
-
# snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
|
2558
|
-
# tokenizer.model and used them as BOS and EOS instead of adding new tokens.
|
2559
|
-
from sentencepiece import SentencePieceProcessor
|
2754
|
+
@Model.register("OpenELMForCausalLM")
|
2755
|
+
class OpenELMModel(Model):
|
2756
|
+
model_arch = gguf.MODEL_ARCH.OPENELM
|
2560
2757
|
|
2561
|
-
|
2758
|
+
@staticmethod
|
2759
|
+
def _make_divisible(v: float | int, divisor: int) -> int:
|
2760
|
+
# ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
|
2761
|
+
new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
|
2762
|
+
# Make sure that round down does not go down by more than 10%.
|
2763
|
+
if new_v < 0.9 * v:
|
2764
|
+
new_v += divisor
|
2765
|
+
return new_v
|
2766
|
+
|
2767
|
+
def __init__(self, *args, **kwargs):
|
2768
|
+
super().__init__(*args, **kwargs)
|
2769
|
+
|
2770
|
+
ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
|
2771
|
+
ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
|
2772
|
+
self._n_embd: int = self.hparams["model_dim"]
|
2773
|
+
self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
|
2774
|
+
self._num_query_heads: list[int] = self.hparams["num_query_heads"]
|
2775
|
+
self._ffn_dims: list[int] = [
|
2776
|
+
OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
|
2777
|
+
for multiplier in ffn_multipliers
|
2778
|
+
]
|
2779
|
+
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
|
2780
|
+
assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
|
2781
|
+
|
2782
|
+
# Uses the tokenizer from meta-llama/Llama-2-7b-hf
|
2783
|
+
def set_vocab(self):
|
2784
|
+
try:
|
2785
|
+
self._set_vocab_sentencepiece()
|
2786
|
+
except FileNotFoundError:
|
2787
|
+
self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
|
2788
|
+
|
2789
|
+
def set_gguf_parameters(self):
|
2790
|
+
n_embd = self._n_embd
|
2791
|
+
head_dim = self.hparams["head_dim"]
|
2792
|
+
rot_pct = 1.0
|
2793
|
+
assert self.block_count == len(self._num_kv_heads)
|
2794
|
+
assert self.block_count == len(self._num_query_heads)
|
2795
|
+
assert self.block_count == len(self._ffn_dims)
|
2796
|
+
|
2797
|
+
self.gguf_writer.add_block_count(self.block_count)
|
2798
|
+
self.gguf_writer.add_context_length(self.hparams["max_context_length"])
|
2799
|
+
self.gguf_writer.add_embedding_length(n_embd)
|
2800
|
+
self.gguf_writer.add_feed_forward_length(self._ffn_dims)
|
2801
|
+
self.gguf_writer.add_head_count(self._num_query_heads)
|
2802
|
+
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
|
2803
|
+
self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
|
2804
|
+
# https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
|
2805
|
+
self.gguf_writer.add_layer_norm_rms_eps(1e-6)
|
2806
|
+
self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
|
2807
|
+
self.gguf_writer.add_key_length(head_dim)
|
2808
|
+
self.gguf_writer.add_value_length(head_dim)
|
2809
|
+
self.gguf_writer.add_file_type(self.ftype)
|
2810
|
+
|
2811
|
+
def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
|
2812
|
+
if "n_layers" in keys:
|
2813
|
+
return self.hparams["num_transformer_layers"]
|
2814
|
+
|
2815
|
+
return super().find_hparam(keys, optional)
|
2816
|
+
|
2817
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2818
|
+
|
2819
|
+
# split ff
|
2820
|
+
if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
|
2821
|
+
ff_dim = self._ffn_dims[bid]
|
2822
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
|
2823
|
+
yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
|
2824
|
+
return
|
2825
|
+
|
2826
|
+
yield (self.map_tensor_name(name), data_torch)
|
2827
|
+
|
2828
|
+
|
2829
|
+
@Model.register("ArcticForCausalLM")
|
2830
|
+
class ArcticModel(Model):
|
2831
|
+
model_arch = gguf.MODEL_ARCH.ARCTIC
|
2832
|
+
|
2833
|
+
def set_vocab(self):
|
2834
|
+
# The reason for using a custom implementation here is that the
|
2835
|
+
# snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
|
2836
|
+
# tokenizer.model and used them as BOS and EOS instead of adding new tokens.
|
2837
|
+
from sentencepiece import SentencePieceProcessor
|
2838
|
+
|
2839
|
+
tokenizer_path = self.dir_model / 'tokenizer.model'
|
2562
2840
|
|
2563
2841
|
if not tokenizer_path.is_file():
|
2564
2842
|
logger.error(f'Error: Missing {tokenizer_path}')
|
@@ -2572,7 +2850,7 @@ class ArcticModel(Model):
|
|
2572
2850
|
|
2573
2851
|
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
2574
2852
|
scores: list[float] = [-10000.0] * vocab_size
|
2575
|
-
toktypes: list[int] = [SentencePieceTokenTypes.
|
2853
|
+
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
2576
2854
|
|
2577
2855
|
for token_id in range(tokenizer.vocab_size()):
|
2578
2856
|
|
@@ -2605,7 +2883,7 @@ class ArcticModel(Model):
|
|
2605
2883
|
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
|
2606
2884
|
for token_id, token_json in added_tokens_decoder.items():
|
2607
2885
|
token_id = int(token_id)
|
2608
|
-
if
|
2886
|
+
if token_id >= vocab_size:
|
2609
2887
|
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
2610
2888
|
continue
|
2611
2889
|
|
@@ -2689,8 +2967,8 @@ class ArcticModel(Model):
|
|
2689
2967
|
|
2690
2968
|
return [(self.map_tensor_name(name), data_torch)]
|
2691
2969
|
|
2692
|
-
def
|
2693
|
-
super().
|
2970
|
+
def prepare_tensors(self):
|
2971
|
+
super().prepare_tensors()
|
2694
2972
|
|
2695
2973
|
if self._experts is not None:
|
2696
2974
|
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
@@ -2699,6 +2977,499 @@ class ArcticModel(Model):
|
|
2699
2977
|
raise ValueError(f"Unprocessed experts: {experts}")
|
2700
2978
|
|
2701
2979
|
|
2980
|
+
@Model.register("DeepseekV2ForCausalLM")
|
2981
|
+
class DeepseekV2Model(Model):
|
2982
|
+
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
2983
|
+
|
2984
|
+
def set_vocab(self):
|
2985
|
+
self._set_vocab_gpt2()
|
2986
|
+
|
2987
|
+
def set_gguf_parameters(self):
|
2988
|
+
super().set_gguf_parameters()
|
2989
|
+
hparams = self.hparams
|
2990
|
+
|
2991
|
+
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
2992
|
+
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
2993
|
+
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
|
2994
|
+
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
|
2995
|
+
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
|
2996
|
+
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
|
2997
|
+
self.gguf_writer.add_value_length(hparams["v_head_dim"])
|
2998
|
+
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
|
2999
|
+
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
|
3000
|
+
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
|
3001
|
+
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
|
3002
|
+
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
3003
|
+
|
3004
|
+
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
3005
|
+
if self.hparams["rope_scaling"].get("type") == "yarn":
|
3006
|
+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
3007
|
+
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
3008
|
+
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
3009
|
+
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
|
3010
|
+
|
3011
|
+
_experts: list[dict[str, Tensor]] | None = None
|
3012
|
+
|
3013
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3014
|
+
# process the experts separately
|
3015
|
+
if name.find("mlp.experts") != -1:
|
3016
|
+
n_experts = self.hparams["n_routed_experts"]
|
3017
|
+
assert bid is not None
|
3018
|
+
|
3019
|
+
if self._experts is None:
|
3020
|
+
self._experts = [{} for _ in range(self.block_count)]
|
3021
|
+
|
3022
|
+
self._experts[bid][name] = data_torch
|
3023
|
+
|
3024
|
+
if len(self._experts[bid]) >= n_experts * 3:
|
3025
|
+
tensors: list[tuple[str, Tensor]] = []
|
3026
|
+
|
3027
|
+
# merge the experts into a single 3d tensor
|
3028
|
+
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
3029
|
+
datas: list[Tensor] = []
|
3030
|
+
|
3031
|
+
for xid in range(n_experts):
|
3032
|
+
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
3033
|
+
datas.append(self._experts[bid][ename])
|
3034
|
+
del self._experts[bid][ename]
|
3035
|
+
|
3036
|
+
data_torch = torch.stack(datas, dim=0)
|
3037
|
+
|
3038
|
+
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
3039
|
+
|
3040
|
+
new_name = self.map_tensor_name(merged_name)
|
3041
|
+
|
3042
|
+
tensors.append((new_name, data_torch))
|
3043
|
+
return tensors
|
3044
|
+
else:
|
3045
|
+
return []
|
3046
|
+
|
3047
|
+
return [(self.map_tensor_name(name), data_torch)]
|
3048
|
+
|
3049
|
+
def prepare_tensors(self):
|
3050
|
+
super().prepare_tensors()
|
3051
|
+
|
3052
|
+
if self._experts is not None:
|
3053
|
+
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
3054
|
+
experts = [k for d in self._experts for k in d.keys()]
|
3055
|
+
if len(experts) > 0:
|
3056
|
+
raise ValueError(f"Unprocessed experts: {experts}")
|
3057
|
+
|
3058
|
+
|
3059
|
+
@Model.register("T5WithLMHeadModel")
|
3060
|
+
@Model.register("T5ForConditionalGeneration")
|
3061
|
+
@Model.register("MT5ForConditionalGeneration")
|
3062
|
+
@Model.register("UMT5ForConditionalGeneration")
|
3063
|
+
class T5Model(Model):
|
3064
|
+
model_arch = gguf.MODEL_ARCH.T5
|
3065
|
+
|
3066
|
+
def __init__(self, *args, **kwargs):
|
3067
|
+
super().__init__(*args, **kwargs)
|
3068
|
+
self.shared_token_embeddings_found = False
|
3069
|
+
|
3070
|
+
def set_vocab(self):
|
3071
|
+
# to avoid TypeError: Descriptors cannot be created directly
|
3072
|
+
# exception when importing sentencepiece_model_pb2
|
3073
|
+
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
3074
|
+
from sentencepiece import SentencePieceProcessor
|
3075
|
+
from sentencepiece import sentencepiece_model_pb2 as model
|
3076
|
+
|
3077
|
+
tokenizer_path = self.dir_model / 'tokenizer.model'
|
3078
|
+
|
3079
|
+
# many older models use spiece.model tokenizer model filename
|
3080
|
+
if not tokenizer_path.is_file():
|
3081
|
+
tokenizer_path = self.dir_model / 'spiece.model'
|
3082
|
+
|
3083
|
+
if not tokenizer_path.is_file():
|
3084
|
+
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
3085
|
+
|
3086
|
+
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
3087
|
+
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
3088
|
+
|
3089
|
+
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
|
3090
|
+
if sentencepiece_model.trainer_spec.model_type == 2: # BPE
|
3091
|
+
# assure the tokenizer model file name is correct
|
3092
|
+
assert tokenizer_path.name == 'tokenizer.model'
|
3093
|
+
return self._set_vocab_sentencepiece()
|
3094
|
+
else:
|
3095
|
+
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
3096
|
+
|
3097
|
+
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
3098
|
+
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
|
3099
|
+
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
|
3100
|
+
|
3101
|
+
tokenizer = SentencePieceProcessor()
|
3102
|
+
tokenizer.LoadFromFile(str(tokenizer_path))
|
3103
|
+
|
3104
|
+
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
3105
|
+
|
3106
|
+
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
3107
|
+
scores: list[float] = [-10000.0] * vocab_size
|
3108
|
+
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
3109
|
+
|
3110
|
+
for token_id in range(tokenizer.vocab_size()):
|
3111
|
+
piece = tokenizer.IdToPiece(token_id)
|
3112
|
+
text = piece.encode("utf-8")
|
3113
|
+
score = tokenizer.GetScore(token_id)
|
3114
|
+
|
3115
|
+
toktype = SentencePieceTokenTypes.NORMAL
|
3116
|
+
if tokenizer.IsUnknown(token_id):
|
3117
|
+
toktype = SentencePieceTokenTypes.UNKNOWN
|
3118
|
+
elif tokenizer.IsControl(token_id):
|
3119
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
3120
|
+
elif tokenizer.IsUnused(token_id):
|
3121
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
3122
|
+
elif tokenizer.IsByte(token_id):
|
3123
|
+
toktype = SentencePieceTokenTypes.BYTE
|
3124
|
+
|
3125
|
+
tokens[token_id] = text
|
3126
|
+
scores[token_id] = score
|
3127
|
+
toktypes[token_id] = toktype
|
3128
|
+
|
3129
|
+
added_tokens_file = self.dir_model / 'added_tokens.json'
|
3130
|
+
if added_tokens_file.is_file():
|
3131
|
+
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
3132
|
+
added_tokens_json = json.load(f)
|
3133
|
+
for key in added_tokens_json:
|
3134
|
+
token_id = added_tokens_json[key]
|
3135
|
+
if token_id >= vocab_size:
|
3136
|
+
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
3137
|
+
continue
|
3138
|
+
|
3139
|
+
tokens[token_id] = key.encode("utf-8")
|
3140
|
+
scores[token_id] = -1000.0
|
3141
|
+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
3142
|
+
|
3143
|
+
if vocab_size > len(tokens):
|
3144
|
+
pad_count = vocab_size - len(tokens)
|
3145
|
+
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
3146
|
+
for i in range(1, pad_count + 1):
|
3147
|
+
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
3148
|
+
scores.append(-1000.0)
|
3149
|
+
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
3150
|
+
|
3151
|
+
self.gguf_writer.add_tokenizer_model("t5")
|
3152
|
+
self.gguf_writer.add_tokenizer_pre("default")
|
3153
|
+
self.gguf_writer.add_token_list(tokens)
|
3154
|
+
self.gguf_writer.add_token_scores(scores)
|
3155
|
+
self.gguf_writer.add_token_types(toktypes)
|
3156
|
+
self.gguf_writer.add_add_space_prefix(add_prefix)
|
3157
|
+
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
|
3158
|
+
if precompiled_charsmap:
|
3159
|
+
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
|
3160
|
+
|
3161
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
3162
|
+
special_vocab.add_to_gguf(self.gguf_writer)
|
3163
|
+
|
3164
|
+
self.gguf_writer.add_add_bos_token(False)
|
3165
|
+
self.gguf_writer.add_add_eos_token(True)
|
3166
|
+
|
3167
|
+
def set_gguf_parameters(self):
|
3168
|
+
if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
|
3169
|
+
logger.warning("Couldn't find context length in config.json, assuming default value of 512")
|
3170
|
+
n_ctx = 512
|
3171
|
+
self.gguf_writer.add_context_length(n_ctx)
|
3172
|
+
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
3173
|
+
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
|
3174
|
+
self.gguf_writer.add_block_count(self.hparams["num_layers"])
|
3175
|
+
self.gguf_writer.add_head_count(self.hparams["num_heads"])
|
3176
|
+
self.gguf_writer.add_key_length(self.hparams["d_kv"])
|
3177
|
+
self.gguf_writer.add_value_length(self.hparams["d_kv"])
|
3178
|
+
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
3179
|
+
self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
|
3180
|
+
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
3181
|
+
self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
|
3182
|
+
self.gguf_writer.add_file_type(self.ftype)
|
3183
|
+
|
3184
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3185
|
+
del bid # unused
|
3186
|
+
|
3187
|
+
# T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
|
3188
|
+
# "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
|
3189
|
+
# in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
|
3190
|
+
# and decoder and ignore the remaining ones.
|
3191
|
+
if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
|
3192
|
+
if not self.shared_token_embeddings_found:
|
3193
|
+
name = "shared.weight"
|
3194
|
+
self.shared_token_embeddings_found = True
|
3195
|
+
else:
|
3196
|
+
logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
|
3197
|
+
return []
|
3198
|
+
|
3199
|
+
return [(self.map_tensor_name(name), data_torch)]
|
3200
|
+
|
3201
|
+
|
3202
|
+
@Model.register("JAISLMHeadModel")
|
3203
|
+
class JaisModel(Model):
|
3204
|
+
model_arch = gguf.MODEL_ARCH.JAIS
|
3205
|
+
|
3206
|
+
def __init__(self, *args, **kwargs):
|
3207
|
+
super().__init__(*args, **kwargs)
|
3208
|
+
|
3209
|
+
# SwigLU activation
|
3210
|
+
assert self.hparams["activation_function"] == "swiglu"
|
3211
|
+
# ALiBi position embedding
|
3212
|
+
assert self.hparams["position_embedding_type"] == "alibi"
|
3213
|
+
|
3214
|
+
# Embeddings scale
|
3215
|
+
self.embeddings_scale = 1.0
|
3216
|
+
# note: For some JAIS flavors, output is tied to (same as) wte in original model
|
3217
|
+
self.output_is_wte = False
|
3218
|
+
if 'mup_embeddings_scale' in self.hparams:
|
3219
|
+
self.output_is_wte = True # Hack (?)
|
3220
|
+
self.embeddings_scale = self.hparams['mup_embeddings_scale']
|
3221
|
+
elif 'embeddings_scale' in self.hparams:
|
3222
|
+
self.embeddings_scale = self.hparams['embeddings_scale']
|
3223
|
+
else:
|
3224
|
+
assert False
|
3225
|
+
|
3226
|
+
self.width_scale = 1.0
|
3227
|
+
if 'mup_output_alpha' in self.hparams:
|
3228
|
+
assert 'mup_width_scale' in self.hparams
|
3229
|
+
self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
|
3230
|
+
elif 'width_scale' in self.hparams:
|
3231
|
+
self.width_scale = self.hparams['width_scale']
|
3232
|
+
else:
|
3233
|
+
assert False
|
3234
|
+
|
3235
|
+
self.max_alibi_bias = 8.0
|
3236
|
+
|
3237
|
+
def set_vocab(self):
|
3238
|
+
self._set_vocab_gpt2()
|
3239
|
+
|
3240
|
+
def set_gguf_parameters(self):
|
3241
|
+
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
3242
|
+
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
3243
|
+
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
3244
|
+
self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
|
3245
|
+
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
3246
|
+
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
3247
|
+
self.gguf_writer.add_file_type(self.ftype)
|
3248
|
+
|
3249
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3250
|
+
del bid # unused
|
3251
|
+
|
3252
|
+
tensors: list[tuple[str, Tensor]] = []
|
3253
|
+
|
3254
|
+
# we don't need these
|
3255
|
+
if name.endswith((".attn.bias")):
|
3256
|
+
return tensors
|
3257
|
+
|
3258
|
+
if name.endswith(("relative_pe.slopes")):
|
3259
|
+
# Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
|
3260
|
+
# Some other models has max_alibi_bias spelled out explicitly in the hyperparams,
|
3261
|
+
# but Jais's PyTorch model simply precalculates the slope values and places them
|
3262
|
+
# in relative_pes.slopes
|
3263
|
+
n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
|
3264
|
+
first_val = float(data_torch[0].item())
|
3265
|
+
self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
|
3266
|
+
|
3267
|
+
return tensors
|
3268
|
+
|
3269
|
+
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
|
3270
|
+
data_torch = data_torch.transpose(1, 0)
|
3271
|
+
|
3272
|
+
new_name = self.map_tensor_name(name)
|
3273
|
+
|
3274
|
+
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
3275
|
+
tensors.append((new_name, data_torch * self.embeddings_scale))
|
3276
|
+
if self.output_is_wte:
|
3277
|
+
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
|
3278
|
+
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
|
3279
|
+
assert not self.output_is_wte
|
3280
|
+
tensors.append((new_name, data_torch * self.width_scale))
|
3281
|
+
else:
|
3282
|
+
tensors.append((new_name, data_torch))
|
3283
|
+
|
3284
|
+
return tensors
|
3285
|
+
|
3286
|
+
def prepare_tensors(self):
|
3287
|
+
super().prepare_tensors()
|
3288
|
+
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
|
3289
|
+
|
3290
|
+
|
3291
|
+
@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
|
3292
|
+
class ChatGLMModel(Model):
|
3293
|
+
model_arch = gguf.MODEL_ARCH.CHATGLM
|
3294
|
+
|
3295
|
+
def set_vocab_chatglm3(self):
|
3296
|
+
dir_model = self.dir_model
|
3297
|
+
hparams = self.hparams
|
3298
|
+
tokens: list[bytes] = []
|
3299
|
+
toktypes: list[int] = []
|
3300
|
+
scores: list[float] = []
|
3301
|
+
|
3302
|
+
from transformers import AutoTokenizer
|
3303
|
+
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
3304
|
+
vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
|
3305
|
+
assert max(tokenizer.get_vocab().values()) < vocab_size
|
3306
|
+
role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
|
3307
|
+
special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
|
3308
|
+
for token_id in range(vocab_size):
|
3309
|
+
piece = tokenizer._convert_id_to_token(token_id)
|
3310
|
+
if token_id == 0:
|
3311
|
+
piece = "<unk>"
|
3312
|
+
elif token_id == 1:
|
3313
|
+
piece = "<bos>"
|
3314
|
+
elif token_id == 2:
|
3315
|
+
piece = "<eos>"
|
3316
|
+
|
3317
|
+
text = piece.encode("utf-8")
|
3318
|
+
score = 0.0
|
3319
|
+
# Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
|
3320
|
+
# it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
|
3321
|
+
if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
|
3322
|
+
score = tokenizer.tokenizer.sp_model.get_score(token_id)
|
3323
|
+
|
3324
|
+
if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
|
3325
|
+
if piece in special_tokens:
|
3326
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
3327
|
+
elif len(piece) == 0:
|
3328
|
+
text = f"[PAD{token_id}]".encode("utf-8")
|
3329
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
3330
|
+
else:
|
3331
|
+
toktype = SentencePieceTokenTypes.USER_DEFINED
|
3332
|
+
tokens.append(text)
|
3333
|
+
scores.append(score)
|
3334
|
+
toktypes.append(toktype)
|
3335
|
+
continue
|
3336
|
+
|
3337
|
+
toktype = SentencePieceTokenTypes.NORMAL
|
3338
|
+
if tokenizer.tokenizer.sp_model.is_unknown(token_id):
|
3339
|
+
toktype = SentencePieceTokenTypes.UNKNOWN
|
3340
|
+
elif tokenizer.tokenizer.sp_model.is_control(token_id):
|
3341
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
3342
|
+
elif tokenizer.tokenizer.sp_model.is_unused(token_id):
|
3343
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
3344
|
+
elif tokenizer.tokenizer.sp_model.is_byte(token_id):
|
3345
|
+
toktype = SentencePieceTokenTypes.BYTE
|
3346
|
+
|
3347
|
+
tokens.append(text)
|
3348
|
+
scores.append(score)
|
3349
|
+
toktypes.append(toktype)
|
3350
|
+
|
3351
|
+
self.gguf_writer.add_tokenizer_model("llama")
|
3352
|
+
# glm3 needs prefix and suffix formatted as:
|
3353
|
+
# prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
|
3354
|
+
self.gguf_writer.add_tokenizer_pre("chatglm-spm")
|
3355
|
+
self.gguf_writer.add_token_list(tokens)
|
3356
|
+
self.gguf_writer.add_token_scores(scores)
|
3357
|
+
self.gguf_writer.add_token_types(toktypes)
|
3358
|
+
|
3359
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
3360
|
+
special_vocab.add_to_gguf(self.gguf_writer)
|
3361
|
+
|
3362
|
+
@staticmethod
|
3363
|
+
def token_bytes_to_string(b):
|
3364
|
+
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
3365
|
+
byte_encoder = bytes_to_unicode()
|
3366
|
+
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
|
3367
|
+
|
3368
|
+
@staticmethod
|
3369
|
+
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
|
3370
|
+
parts = [bytes([b]) for b in token]
|
3371
|
+
while True:
|
3372
|
+
min_idx = None
|
3373
|
+
min_rank = None
|
3374
|
+
for i, pair in enumerate(zip(parts[:-1], parts[1:])):
|
3375
|
+
rank = mergeable_ranks.get(pair[0] + pair[1])
|
3376
|
+
if rank is not None and (min_rank is None or rank < min_rank):
|
3377
|
+
min_idx = i
|
3378
|
+
min_rank = rank
|
3379
|
+
if min_rank is None or (max_rank is not None and min_rank >= max_rank):
|
3380
|
+
break
|
3381
|
+
assert min_idx is not None
|
3382
|
+
parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
|
3383
|
+
return parts
|
3384
|
+
|
3385
|
+
def set_vocab(self):
    """Write the tokenizer vocabulary, token types, merges and special tokens.

    Checkpoints whose ``_name_or_path`` contains ``THUDM/chatglm3-6b`` are
    delegated to ``set_vocab_chatglm3``. Every other checkpoint is loaded
    through ``AutoTokenizer``, its merge list is rebuilt from the tokenizer's
    ``mergeable_ranks``, and the result is written as a "gpt2" tokenizer model.
    """
    if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
        self.set_vocab_chatglm3()
        return

    dir_model = self.dir_model
    hparams = self.hparams
    tokens: list[str] = []
    toktypes: list[int] = []

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
    vocab_size = hparams["padded_vocab_size"]
    assert max(tokenizer.get_vocab().values()) < vocab_size

    tokpre = self.get_vocab_base_pre(tokenizer)

    merges = []
    vocab = {}
    mergeable_ranks = tokenizer.mergeable_ranks
    for token, rank in mergeable_ranks.items():
        vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
        if len(token) == 1:
            # Single-byte tokens cannot be the product of a merge.
            continue
        # Re-run BPE capped at this token's own rank to recover its pieces.
        merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
        assert 2 <= len(merged) <= 7
        merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))

    # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
    added_vocab = tokenizer.get_added_vocab()
    reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}

    for i in range(vocab_size):
        if i not in reverse_vocab:
            # Ids with no token become placeholders up to the padded size.
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.UNUSED)
            continue

        token_text = reverse_vocab[i]
        tokens.append(token_text)
        if token_text not in added_vocab:
            toktypes.append(gguf.TokenType.NORMAL)
        elif tokenizer.added_tokens_decoder[i].special:
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            toktypes.append(gguf.TokenType.USER_DEFINED)

    self.gguf_writer.add_tokenizer_model("gpt2")
    self.gguf_writer.add_tokenizer_pre(tokpre)
    self.gguf_writer.add_token_list(tokens)
    self.gguf_writer.add_token_types(toktypes)

    special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
    special_vocab.merges = merges
    # only add special tokens when they were not already loaded from config.json
    # The added-vocab mapping fetched above is reused rather than re-queried.
    special_vocab._set_special_token("eos", added_vocab["<|endoftext|>"])
    special_vocab._set_special_token("eot", added_vocab["<|user|>"])
    # this one is usually not in config.json anyway
    special_vocab._set_special_token("unk", added_vocab["<|endoftext|>"])
    special_vocab.add_to_gguf(self.gguf_writer)
|
3444
|
+
|
3445
|
+
def set_gguf_parameters(self):
    """Write the model hyperparameters from ``self.hparams`` to ``self.gguf_writer``.

    Several values fall back to alternative keys or derived defaults when the
    primary key is missing from the hyperparameters.
    """
    hp = self.hparams
    writer = self.gguf_writer

    embed_dim = hp.get("hidden_size", hp.get("n_embed"))
    head_count = hp.get("n_head", hp.get("num_attention_heads"))
    # Without an explicit group count, use as many KV heads as attention heads.
    kv_head_count = hp.get("multi_query_group_num", head_count)

    writer.add_context_length(hp.get("seq_length", embed_dim))
    writer.add_embedding_length(embed_dim)
    writer.add_feed_forward_length(hp.get("ffn_hidden_size", 4 * embed_dim))
    writer.add_block_count(hp["num_layers"])
    writer.add_head_count(head_count)
    writer.add_head_count_kv(kv_head_count)
    writer.add_layer_norm_rms_eps(hp["layernorm_epsilon"])
    writer.add_file_type(self.ftype)
    writer.add_rope_dimension_count(64)
    writer.add_add_bos_token(False)

    # The base frequency of 10000 is scaled by "rope_ratio" when that key is set.
    rope_base = 10000 * hp["rope_ratio"] if "rope_ratio" in hp else 10000
    writer.add_rope_freq_base(rope_base)
|
3463
|
+
|
3464
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Map one checkpoint tensor to its output name.

    Tensors whose name ends with ``.rotary_pos_emb.inv_freq`` are dropped (an
    empty list is returned). For any other tensor, a leading ``transformer.``
    is removed from the name before it is passed to ``self.map_tensor_name``.
    ``bid`` is accepted but not used.
    """
    is_rotary_inv_freq = name.endswith(".rotary_pos_emb.inv_freq")
    if is_rotary_inv_freq:
        return []

    stripped_name = name.removeprefix("transformer.")
    mapped_name = self.map_tensor_name(stripped_name)
    return [(mapped_name, data_torch)]
|
3472
|
+
|
2702
3473
|
###### CONVERSION LOGIC ######
|
2703
3474
|
|
2704
3475
|
|
@@ -2715,19 +3486,46 @@ class LazyTorchTensor(gguf.LazyBase):
|
|
2715
3486
|
torch.float32: np.float32,
|
2716
3487
|
}
|
2717
3488
|
|
3489
|
+
# used for safetensors slices
|
3490
|
+
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
|
3491
|
+
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
|
3492
|
+
_dtype_str_map: dict[str, torch.dtype] = {
|
3493
|
+
"F64": torch.float64,
|
3494
|
+
"F32": torch.float32,
|
3495
|
+
"BF16": torch.bfloat16,
|
3496
|
+
"F16": torch.float16,
|
3497
|
+
# "U64": torch.uint64,
|
3498
|
+
"I64": torch.int64,
|
3499
|
+
# "U32": torch.uint32,
|
3500
|
+
"I32": torch.int32,
|
3501
|
+
# "U16": torch.uint16,
|
3502
|
+
"I16": torch.int16,
|
3503
|
+
"U8": torch.uint8,
|
3504
|
+
"I8": torch.int8,
|
3505
|
+
"BOOL": torch.bool,
|
3506
|
+
"F8_E4M3": torch.float8_e4m3fn,
|
3507
|
+
"F8_E5M2": torch.float8_e5m2,
|
3508
|
+
}
|
3509
|
+
|
2718
3510
|
def numpy(self) -> gguf.LazyNumpyTensor:
|
2719
3511
|
dtype = self._dtype_map[self.dtype]
|
2720
3512
|
return gguf.LazyNumpyTensor(
|
2721
3513
|
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
|
2722
|
-
lazy=self._lazy,
|
2723
3514
|
args=(self,),
|
2724
|
-
func=(lambda s: s
|
3515
|
+
func=(lambda s: s.numpy())
|
2725
3516
|
)
|
2726
3517
|
|
2727
3518
|
@classmethod
|
2728
|
-
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape:
|
3519
|
+
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
|
2729
3520
|
return torch.empty(size=shape, dtype=dtype, device="meta")
|
2730
3521
|
|
3522
|
+
@classmethod
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
    """Wrap a safetensors slice in an instance of this class.

    The dtype string reported by ``st_slice.get_dtype()`` is translated through
    ``cls._dtype_str_map`` and, together with ``st_slice.get_shape()``, used to
    build the ``meta`` placeholder. The slice itself is stored as the single
    argument of a function that reads its full range (``s[:]``); presumably
    that function is invoked later by the lazy machinery -- confirm against
    the base class.
    """
    def _read_whole_slice(s):
        return s[:]

    torch_dtype = cls._dtype_str_map[st_slice.get_dtype()]
    dims: tuple[int, ...] = tuple(st_slice.get_shape())
    placeholder = cls.meta_with_dtype_and_shape(torch_dtype, dims)
    wrapped = cls(meta=placeholder, args=(st_slice,), func=_read_whole_slice)
    # The cast is for type checkers only; the object is still an instance of cls.
    return cast(torch.Tensor, wrapped)
|
3528
|
+
|
2731
3529
|
@classmethod
|
2732
3530
|
def __torch_function__(cls, func, types, args=(), kwargs=None):
|
2733
3531
|
del types # unused
|
@@ -2738,7 +3536,7 @@ class LazyTorchTensor(gguf.LazyBase):
|
|
2738
3536
|
if func is torch.Tensor.numpy:
|
2739
3537
|
return args[0].numpy()
|
2740
3538
|
|
2741
|
-
return
|
3539
|
+
return cls._wrap_fn(func)(*args, **kwargs)
|
2742
3540
|
|
2743
3541
|
|
2744
3542
|
def parse_args() -> argparse.Namespace:
|
@@ -2748,10 +3546,6 @@ def parse_args() -> argparse.Namespace:
|
|
2748
3546
|
"--vocab-only", action="store_true",
|
2749
3547
|
help="extract only the vocab",
|
2750
3548
|
)
|
2751
|
-
parser.add_argument(
|
2752
|
-
"--awq-path", type=Path, default=None,
|
2753
|
-
help="Path to scale awq cache file",
|
2754
|
-
)
|
2755
3549
|
parser.add_argument(
|
2756
3550
|
"--outfile", type=Path,
|
2757
3551
|
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
@@ -2784,30 +3578,58 @@ def parse_args() -> argparse.Namespace:
|
|
2784
3578
|
"--verbose", action="store_true",
|
2785
3579
|
help="increase output verbosity",
|
2786
3580
|
)
|
3581
|
+
parser.add_argument(
|
3582
|
+
"--split-max-tensors", type=int, default=0,
|
3583
|
+
help="max tensors in each split",
|
3584
|
+
)
|
3585
|
+
parser.add_argument(
|
3586
|
+
"--split-max-size", type=str, default="0",
|
3587
|
+
help="max size per split N(M|G)",
|
3588
|
+
)
|
3589
|
+
parser.add_argument(
|
3590
|
+
"--dry-run", action="store_true",
|
3591
|
+
help="only print out a split plan and exit, without writing any new files",
|
3592
|
+
)
|
3593
|
+
parser.add_argument(
|
3594
|
+
"--no-tensor-first-split", action="store_true",
|
3595
|
+
help="do not add tensors to the first split (disabled by default)"
|
3596
|
+
)
|
3597
|
+
parser.add_argument(
|
3598
|
+
"--metadata", type=Path,
|
3599
|
+
help="Specify the path for an authorship metadata override file"
|
3600
|
+
)
|
2787
3601
|
|
2788
3602
|
return parser.parse_args()
|
2789
3603
|
|
2790
3604
|
|
3605
|
+
def split_str_to_n_bytes(split_str: str) -> int:
    """Convert a split-size string such as ``"250"``, ``"5K"``, ``"300M"`` or ``"2G"`` to bytes.

    The suffixes are decimal multipliers: ``K`` = 1000, ``M`` = 1000**2 and
    ``G`` = 1000**3. A string without a suffix is taken as a plain byte count.

    Args:
        split_str: The size specification to parse.

    Returns:
        The size in bytes (``0`` is accepted).

    Raises:
        ValueError: If the string is not a number optionally followed by
            ``K``, ``M`` or ``G``, or if the resulting size is negative.
    """
    multipliers = {"K": 1000, "M": 1000 * 1000, "G": 1000 * 1000 * 1000}
    invalid_msg = f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G"

    suffix = split_str[-1:]
    if suffix in multipliers:
        number_part, scale = split_str[:-1], multipliers[suffix]
    elif split_str.isnumeric():
        number_part, scale = split_str, 1
    else:
        raise ValueError(invalid_msg)

    try:
        n = int(number_part) * scale
    except ValueError as err:
        # Covers a missing or malformed numeric part ("K", "1.5G") and numeric
        # characters int() cannot parse, so the caller always sees the same
        # descriptive message instead of a bare int() error.
        raise ValueError(invalid_msg) from err

    if n < 0:
        raise ValueError(f"Invalid split size: {split_str}, must be positive")

    return n
|
3621
|
+
|
3622
|
+
|
2791
3623
|
def main() -> None:
|
2792
3624
|
args = parse_args()
|
2793
3625
|
|
2794
|
-
|
3626
|
+
if args.verbose:
|
3627
|
+
logging.basicConfig(level=logging.DEBUG)
|
3628
|
+
else:
|
3629
|
+
logging.basicConfig(level=logging.INFO)
|
2795
3630
|
|
2796
3631
|
dir_model = args.model
|
2797
3632
|
|
2798
|
-
if args.awq_path:
|
2799
|
-
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
2800
|
-
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
|
2801
|
-
tmp_model_path = args.model / "weighted_model"
|
2802
|
-
dir_model = tmp_model_path
|
2803
|
-
if tmp_model_path.is_dir():
|
2804
|
-
logger.info(f"{tmp_model_path} exists as a weighted model.")
|
2805
|
-
else:
|
2806
|
-
tmp_model_path.mkdir(parents=True, exist_ok=True)
|
2807
|
-
logger.info("Saving new weighted model ...")
|
2808
|
-
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
2809
|
-
logger.info(f"Saved weighted model at {tmp_model_path}.")
|
2810
|
-
|
2811
3633
|
if not dir_model.is_dir():
|
2812
3634
|
logger.error(f'Error: {args.model} is not a directory')
|
2813
3635
|
sys.exit(1)
|
@@ -2820,37 +3642,48 @@ def main() -> None:
|
|
2820
3642
|
"auto": gguf.LlamaFileType.GUESSED,
|
2821
3643
|
}
|
2822
3644
|
|
3645
|
+
is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
|
3646
|
+
if args.use_temp_file and is_split:
|
3647
|
+
logger.error("Error: Cannot use temp file when splitting")
|
3648
|
+
sys.exit(1)
|
3649
|
+
|
2823
3650
|
if args.outfile is not None:
|
2824
3651
|
fname_out = args.outfile
|
2825
3652
|
else:
|
2826
|
-
|
2827
|
-
fname_out = dir_model / 'ggml-model-{ftype}.gguf'
|
3653
|
+
fname_out = dir_model
|
2828
3654
|
|
2829
3655
|
logger.info(f"Loading model: {dir_model.name}")
|
2830
3656
|
|
2831
3657
|
hparams = Model.load_hparams(dir_model)
|
2832
3658
|
|
2833
3659
|
with torch.inference_mode():
|
2834
|
-
|
2835
|
-
|
3660
|
+
output_type = ftype_map[args.outtype]
|
3661
|
+
model_architecture = hparams["architectures"][0]
|
2836
3662
|
|
2837
|
-
|
2838
|
-
|
2839
|
-
|
2840
|
-
|
2841
|
-
|
3663
|
+
try:
|
3664
|
+
model_class = Model.from_model_architecture(model_architecture)
|
3665
|
+
except NotImplementedError:
|
3666
|
+
logger.error(f"Model {model_architecture} is not supported")
|
3667
|
+
sys.exit(1)
|
2842
3668
|
|
2843
|
-
model_instance
|
3669
|
+
model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
|
3670
|
+
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
|
3671
|
+
eager=args.no_lazy,
|
3672
|
+
metadata_override=args.metadata, model_name=args.model_name,
|
3673
|
+
split_max_tensors=args.split_max_tensors,
|
3674
|
+
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
|
3675
|
+
small_first_shard=args.no_tensor_first_split)
|
2844
3676
|
|
2845
3677
|
if args.vocab_only:
|
2846
|
-
logger.info(
|
3678
|
+
logger.info("Exporting model vocab...")
|
2847
3679
|
model_instance.write_vocab()
|
3680
|
+
logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
|
2848
3681
|
else:
|
2849
|
-
logger.info(
|
3682
|
+
logger.info("Exporting model...")
|
2850
3683
|
model_instance.write()
|
2851
|
-
|
2852
|
-
|
3684
|
+
out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
|
3685
|
+
logger.info(f"Model successfully exported to {out_path}")
|
2853
3686
|
|
2854
3687
|
|
2855
3688
|
if __name__ == '__main__':
|
2856
|
-
main()
|
3689
|
+
main()
|