bigdl-core-cpp 2.5.0b20240725__py3-none-win_amd64.whl → 2.5.0b20240727__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert-hf-to-gguf.py +1106 -320
- bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
- bigdl/cpp/gguf-py/gguf/constants.py +442 -173
- bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +29 -8
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +472 -156
- bigdl/cpp/gguf-py/gguf/lazy.py +24 -49
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +195 -23
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/ggml_shared.dll +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- {bigdl_core_cpp-2.5.0b20240725.dist-info → bigdl_core_cpp-2.5.0b20240727.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.5.0b20240727.dist-info/RECORD +61 -0
- bigdl_core_cpp-2.5.0b20240725.dist-info/RECORD +0 -61
- {bigdl_core_cpp-2.5.0b20240725.data → bigdl_core_cpp-2.5.0b20240727.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240725.data → bigdl_core_cpp-2.5.0b20240727.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0b20240725.data → bigdl_core_cpp-2.5.0b20240727.data}/scripts/init-ollama.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240725.dist-info → bigdl_core_cpp-2.5.0b20240727.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0b20240725.dist-info → bigdl_core_cpp-2.5.0b20240727.dist-info}/top_level.txt +0 -0
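The per-file changes listed above and the diff that follows come from comparing the two wheels member by member; since a wheel is just a zip archive, the same comparison can be reproduced locally. The sketch below is illustrative and not part of the package: it assumes both wheels have already been downloaded next to the script (for example with "pip download bigdl-core-cpp==2.5.0b20240725 --no-deps" and the same for 2.5.0b20240727), and the exact local wheel filenames are assumptions that may need adjusting.

import difflib
import zipfile

# Assumed local filenames of the two downloaded wheels (adjust to what pip actually saved).
OLD_WHL = "bigdl_core_cpp-2.5.0b20240725-py3-none-win_amd64.whl"
NEW_WHL = "bigdl_core_cpp-2.5.0b20240727-py3-none-win_amd64.whl"
# One of the changed members listed above.
MEMBER = "bigdl/cpp/convert-hf-to-gguf.py"

def read_member(whl_path: str, member: str) -> list[str]:
    # A wheel is a zip archive; pull one file out of it as text lines.
    with zipfile.ZipFile(whl_path) as whl:
        return whl.read(member).decode("utf-8").splitlines(keepends=True)

diff = difflib.unified_diff(
    read_member(OLD_WHL, MEMBER),
    read_member(NEW_WHL, MEMBER),
    fromfile=f"{OLD_WHL}/{MEMBER}",
    tofile=f"{NEW_WHL}/{MEMBER}",
)
print("".join(diff), end="")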
bigdl/cpp/convert-hf-to-gguf.py
CHANGED
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-

 from __future__ import annotations

@@ -12,7 +13,7 @@ import sys
 from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast

 import math
 import numpy as np
@@ -25,10 +26,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

-from convert import LlamaHfVocab
-
-logger = logging.getLogger("hf-to-gguf")
-
 logger = logging.getLogger("hf-to-gguf")


@@ -50,7 +47,8 @@ class Model:
     _model_classes: dict[str, type[Model]] = {}

     dir_model: Path
-    ftype:
+    ftype: gguf.LlamaFileType
+    fname_out: Path
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
@@ -61,29 +59,41 @@ class Model:
     block_count: int
     tensor_map: gguf.TensorNameMap
     tensor_names: set[str] | None
-    fname_out: Path
     gguf_writer: gguf.GGUFWriter
+    model_name: str | None
+    metadata_override: Path | None
+    dir_model_card: Path

     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
+                 use_temp_file: bool = False, eager: bool = False,
+                 metadata_override: Path | None = None, model_name: str | None = None,
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
+
         self.dir_model = dir_model
         self.ftype = ftype
+        self.fname_out = fname_out
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
         self.lazy = not eager
-        self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+        self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
-            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
+        self.metadata_override = metadata_override
+        self.model_name = model_name
+        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+
+        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
             # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
             _, first_tensor = next(self.get_tensors())
@@ -93,11 +103,10 @@ class Model:
             else:
                 logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
                 self.ftype = gguf.LlamaFileType.MOSTLY_BF16
-
-
-
-
-        self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+
+        # Configure GGUF Writer
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

     @classmethod
     def __init_subclass__(cls):
@@ -147,9 +156,16 @@ class Model:
                 tensor_names_from_parts.update(model_part.keys())

                 for name in model_part.keys():
-
-
-
+                    if self.is_safetensors:
+                        if self.lazy:
+                            data = model_part.get_slice(name)
+                            data = LazyTorchTensor.from_safetensors_slice(data)
+                        else:
+                            data = model_part.get_tensor(name)
+                    else:
+                        data = model_part[name]
+                        if self.lazy:
+                            data = LazyTorchTensor.from_eager(data)
                     yield name, data

         # only verify tensor name presence; it doesn't matter if they are not in the right files
@@ -185,7 +201,6 @@ class Model:
         return new_name

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_block_count(self.block_count)

         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -224,6 +239,10 @@ class Model:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")

+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
+
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")

@@ -242,7 +261,7 @@ class Model:

         return False

-    def
+    def prepare_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

         for name, data_torch in self.get_tensors():
@@ -264,7 +283,7 @@ class Model:
                     break

             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray
+                data: np.ndarray  # type hint
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
                 data_qtype: gguf.GGMLQuantizationType | None = None
@@ -325,23 +344,80 @@ class Model:

                 self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

+    def set_type(self):
+        self.gguf_writer.add_type(gguf.GGUFType.MODEL)
+
+    def prepare_metadata(self, vocab_only: bool):
+
+        total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
+
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
+
+        # Fallback to model directory name if metadata name is still missing
+        if self.metadata.name is None:
+            self.metadata.name = self.dir_model.name
+
+        # Generate parameter weight class (useful for leader boards) if not yet determined
+        if self.metadata.size_label is None and total_params > 0:
+            self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
+
+        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
+        output_type: str = self.ftype.name.partition("_")[2]
+
+        # Filename Output
+        if self.fname_out.is_dir():
+            # Generate default filename based on model specification and available metadata
+            if not vocab_only:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
+            else:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
+
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            #       file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+
+        self.set_type()
+
+        logger.info("Set meta model")
+        self.metadata.set_gguf_meta_model(self.gguf_writer)
+
+        logger.info("Set model parameters")
+        self.set_gguf_parameters()
+
+        logger.info("Set model tokenizer")
+        self.set_vocab()
+
+        logger.info("Set model quantization version")
+        self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
     def write(self):
-        self.
-        self.
+        self.prepare_tensors()
+        self.prepare_metadata(vocab_only=False)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.write_tensors_to_file(progress=True)
         self.gguf_writer.close()

     def write_vocab(self):
-        self.gguf_writer.
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
+
+        self.prepare_metadata(vocab_only=True)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()

     @staticmethod
-    def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
         part_names: list[str] = []
         for filename in os.listdir(dir_model):
-            if filename.endswith(suffix):
+            if filename.startswith(prefix) and filename.endswith(suffix):
                 part_names.append(filename)

         part_names.sort()
@@ -370,6 +446,29 @@ class Model:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        seems_special = token_text in (
+            "<pad>",  # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+        )
+
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
+
+        return seems_special
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -388,20 +487,22 @@ class Model:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             else:
-
-
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)

         return tokens, toktypes, tokpre

-    # NOTE: this function is generated by
+    # NOTE: this function is generated by convert_hf_to_gguf_update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
@@ -421,7 +522,7 @@ class Model:

         res = None

-        # NOTE: if you get an error here, you need to update the
+        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
         # or pull the latest version of the model from Huggingface
         # don't edit the hashes manually!
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
@@ -478,15 +579,39 @@ class Model:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
+        if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
+            # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
+            res = "poro-chat"
+        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+            res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
+        if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
+            # ref: https://huggingface.co/core42/jais-13b
+            res = "jais"
+        if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+            res = "codeshell"
+        if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+            # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+            res = "tekken"
+        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+            res = "smollm"

         if res is None:
             logger.warning("\n")
             logger.warning("**************************************************************************************")
             logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
             logger.warning("** There are 2 possible reasons for this:")
-            logger.warning("** - the model has not been added to
+            logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("** - the pre-tokenization config has changed upstream")
-            logger.warning("** Check your model files and
+            logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
             logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh: {chkhsh}")
@@ -541,7 +666,7 @@ class Model:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.CONTROL)
@@ -564,15 +689,23 @@ class Model:
         special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_sentencepiece(self):
+    def _set_vocab_sentencepiece(self, add_to_gguf=True):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _create_vocab_sentencepiece(self):
         from sentencepiece import SentencePieceProcessor

         tokenizer_path = self.dir_model / 'tokenizer.model'

-        tokens: list[bytes] = []
-        scores: list[float] = []
-        toktypes: list[int] = []
-
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")

@@ -583,7 +716,7 @@ class Model:

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -610,7 +743,7 @@ class Model:
                 added_tokens_json = json.load(f)
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if
+                    if token_id >= vocab_size:
                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue

@@ -618,6 +751,26 @@ class Model:
                     scores[token_id] = -1000.0
                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token: str = token_data["content"]
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token.encode("utf-8"):
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
+                    if token_data.get("special") or self.does_token_look_special(token):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                    scores[token_id] = -1000.0
+                    tokens[token_id] = token.encode("utf-8")
+
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
@@ -626,17 +779,10 @@ class Model:
             scores.append(-1000.0)
             toktypes.append(SentencePieceTokenTypes.UNUSED)

-
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens, scores, toktypes

     def _set_vocab_llama_hf(self):
-        vocab = LlamaHfVocab(self.dir_model)
+        vocab = gguf.LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
@@ -657,6 +803,51 @@ class Model:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
+        tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
+        logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
+        vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
+
+        default_pre = "mpt" if model_name == "gpt-neox" else "default"
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
+        assert field  # tokenizer model
+        self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
+        self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
+        assert field  # token list
+        self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
+        if model_name == "llama-spm":
+            field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
+            assert field  # token scores
+            self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
+        assert field  # token types
+        self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
+        if model_name != "llama-spm":
+            field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
+            assert field  # token merges
+            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
+            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
+            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
+            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
+            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
+            self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
+            self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
+

 @Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
@@ -665,7 +856,6 @@ class GPTNeoXModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -721,7 +911,6 @@ class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Bloom")
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
@@ -798,7 +987,6 @@ class MPTModel(Model):

     def set_gguf_parameters(self):
         block_count = self.hparams["n_layers"]
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_block_count(block_count)
@@ -837,7 +1025,6 @@ class OrionModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")

         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -850,8 +1037,6 @@ class OrionModel(Model):
             raise ValueError("gguf: can not find ctx length parameter.")

         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -875,7 +1060,6 @@ class BaichuanModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")

         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -887,8 +1071,6 @@ class BaichuanModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")

-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -962,7 +1144,11 @@ class XverseModel(Model):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model)
         vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-
+        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
+        # because vocab_size is the count of items, and indexes start at 0.
+        max_vocab_index = max(tokenizer.get_vocab().values())
+        if max_vocab_index >= vocab_size:
+            raise ValueError("Vocabulary size exceeds expected maximum size.")

         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
@@ -998,7 +1184,6 @@ class XverseModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")

         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -1010,8 +1195,6 @@ class XverseModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")

-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1070,7 +1253,6 @@ class FalconModel(Model):
         if n_head_kv is None:
             n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

-        self.gguf_writer.add_name("Falcon")
         self.gguf_writer.add_context_length(2048)  # not in config.json
         self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1115,7 +1297,6 @@ class StarCoderModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]

-        self.gguf_writer.add_name("StarCoder")
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -1135,11 +1316,11 @@ class RefactModel(Model):

         # TODO: how to determine special FIM tokens automatically?
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types = ['prefix', 'suffix', 'middle', '
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
-        special_vocab.
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
@@ -1151,7 +1332,6 @@ class RefactModel(Model):

         block_count = self.hparams["n_layer"]

-        self.gguf_writer.add_name("Refact")
         # refact uses Alibi. So this is from config.json which might be used by training.
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1199,14 +1379,13 @@ class StableLMModel(Model):
         if (self.dir_model / "tokenizer.json").is_file():
             self._set_vocab_gpt2()
         else:
-            # StableLM 2 1.6B
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
             self._set_vocab_qwen()

     def set_gguf_parameters(self):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1268,8 +1447,8 @@ class StableLMModel(Model):

         return [(new_name, data_torch)]

-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()

         if self._q_norms is not None or self._k_norms is not None:
             # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
@@ -1281,85 +1460,6 @@ class StableLMModel(Model):
             if len(norms) > 0:
                 raise ValueError(f"Unprocessed norms: {norms}")

-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_head = self.hparams.get("num_attention_heads")
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        q_norms = dict()
-        k_norms = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-            n_dims = len(data.shape)
-            if name.find("q_layernorm.norms") != -1:
-                q_norms[name] = data
-                if len(q_norms) >= (block_count * n_head):
-                    self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
-                continue
-            if name.find("k_layernorm.norms") != -1:
-                k_norms[name] = data
-                if len(k_norms) >= (block_count * n_kv_head):
-                    self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
-                continue
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-    def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
-        for bid in range(block_count):
-            datas = []
-            for xid in range(n_head):
-                ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
-                datas.append(norms[ename])
-                del norms[ename]
-            data = np.stack(datas, axis=0)
-            data_dtype = data.dtype
-            merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
-            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-

 @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
@@ -1367,7 +1467,7 @@ class LlamaModel(Model):

     def set_vocab(self):
         try:
-            self.
+            self._set_vocab_sentencepiece()
         except FileNotFoundError:
             try:
                 self._set_vocab_llama_hf()
@@ -1391,13 +1491,29 @@ class LlamaModel(Model):
         super().set_gguf_parameters()
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)

         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1412,9 +1528,9 @@ class LlamaModel(Model):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")

-        if name.endswith("q_proj.weight"):
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

         # process the experts separately
@@ -1453,8 +1569,8 @@ class LlamaModel(Model):

         return [(self.map_tensor_name(name), data_torch)]

-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()

         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1463,6 +1579,48 @@ class LlamaModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("BitnetForCausalLM")
+class BitnetModel(Model):
+    model_arch = gguf.MODEL_ARCH.BITNET
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+        self.gguf_writer.add_rope_scaling_factor(1.0)
+
+    def weight_quant(self, weight):
+        dtype = weight.dtype
+        weight = weight.float()
+        s = 1 / weight.abs().mean().clamp(min=1e-5)
+        weight = (weight * s).round().clamp(-1, 1) / s
+        scale = weight.abs().max().unsqueeze(0)
+        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
+        weight = torch.sign(weight).type(dtype)
+        return weight.type(dtype), scale.type(torch.float32)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
+            gguf.MODEL_TENSOR.ATTN_Q,
+            gguf.MODEL_TENSOR.ATTN_K,
+            gguf.MODEL_TENSOR.ATTN_V,
+            gguf.MODEL_TENSOR.ATTN_OUT,
+            gguf.MODEL_TENSOR.FFN_UP,
+            gguf.MODEL_TENSOR.FFN_DOWN,
+            gguf.MODEL_TENSOR.FFN_GATE,
+        ]):
+            # transform weight into 1/0/-1 (in fp32)
+            weight_torch, scale_torch = self.weight_quant(data_torch)
+            yield (new_name, weight_torch)
+            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+        else:
+            yield (new_name, data_torch)
+
+
 @Model.register("GrokForCausalLM")
 class GrokModel(Model):
     model_arch = gguf.MODEL_ARCH.GROK
@@ -1475,7 +1633,6 @@ class GrokModel(Model):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_name("Grok")

     _experts: list[dict[str, Tensor]] | None = None

@@ -1524,7 +1681,6 @@ class DbrxModel(Model):
     def set_gguf_parameters(self):
         ffn_config = self.hparams["ffn_config"]
         attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_name(self.hparams["model_type"])
         self.gguf_writer.add_block_count(self.hparams["n_layers"])

         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
@@ -1537,7 +1693,6 @@ class DbrxModel(Model):
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])

         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)

         self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
         self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1594,7 +1749,6 @@ class MiniCPMModel(Model):

     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
-        self.gguf_writer.add_name("MiniCPM")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1612,9 +1766,11 @@ class MiniCPMModel(Model):
         if n_kv_head is not None and n_head != n_kv_head:
             n_head = n_kv_head

-        return (
-
-
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -1662,7 +1818,6 @@ class QwenModel(Model):
         self._set_vocab_qwen()

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Qwen")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1693,6 +1848,12 @@ class Qwen2MoeModel(Model):
         super().set_gguf_parameters()
         if (n_experts := self.hparams.get("num_experts")) is not None:
             self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")

     _experts: list[dict[str, Tensor]] | None = None

@@ -1732,8 +1893,8 @@ class Qwen2MoeModel(Model):

         return [(self.map_tensor_name(name), data_torch)]

-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()

         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1747,7 +1908,6 @@ class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
         self.gguf_writer.add_context_length(self.hparams["n_ctx"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1790,7 +1950,6 @@ class Phi2Model(Model):
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
         n_head = self.find_hparam(["num_attention_heads", "n_head"])

-        self.gguf_writer.add_name("Phi2")
         self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))

         self.gguf_writer.add_embedding_length(n_embd)
@@ -1823,7 +1982,7 @@ class Phi3MiniModel(Model):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -1852,7 +2011,7 @@ class Phi3MiniModel(Model):

                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if
+                    if token_id >= vocab_size:
                         logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue

@@ -1868,8 +2027,9 @@ class Phi3MiniModel(Model):
                 for token_id, foken_data in added_tokens_decoder.items():
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.
-
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1884,8 +2044,9 @@ class Phi3MiniModel(Model):
                 for foken_data in added_tokens:
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.
-
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1912,7 +2073,6 @@ class Phi3MiniModel(Model):
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
         rope_dims = n_embd // n_head

-        self.gguf_writer.add_name("Phi3")
         self.gguf_writer.add_context_length(max_pos_embds)
         self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
         self.gguf_writer.add_embedding_length(n_embd)
@@ -1924,10 +2084,11 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))

         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if
+        if rope_scaling is None:
             return

         scale = max_pos_embds / orig_max_pos_embds
@@ -1936,7 +2097,7 @@ class Phi3MiniModel(Model):
         if len(rope_scaling_type) == 0:
             raise KeyError('Missing the required key rope_scaling.type')

-        if rope_scaling_type == 'su':
+        if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
             attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
         elif rope_scaling_type == 'yarn':
             attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@@ -1969,7 +2130,6 @@ class PlamoModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name("PLaMo")
         self.gguf_writer.add_context_length(4096)  # not in config.json
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
@@ -2014,7 +2174,6 @@ class CodeShellModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]

-        self.gguf_writer.add_name("CodeShell")
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -2066,7 +2225,7 @@ class InternLM2Model(Model):
             logger.error(f'Error: Missing {tokenizer_path}')
             sys.exit(1)

-        sentencepiece_model = model.ModelProto()
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
         add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix

@@ -2094,6 +2253,9 @@ class InternLM2Model(Model):
                 toktype = SentencePieceTokenTypes.UNUSED
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
+            # take care of ununsed raw token
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNUSED

             tokens.append(text)
             scores.append(score)
@@ -2109,6 +2271,49 @@ class InternLM2Model(Model):
                     scores.append(-1000.0)
                     toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
@@ -2118,37 +2323,17 @@ class InternLM2Model(Model):

         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         old_eos = special_vocab.special_token_ids["eos"]
-        if
+        if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
             # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] =
-            logger.warning(f"Replace eos:{old_eos} with a special token:{
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")

         special_vocab.add_to_gguf(self.gguf_writer)

-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
-
-    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("InternLM2")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -2158,30 +2343,30 @@ in chat mode so that the conversation can end normally.")
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
-
+        n_embd = self.hparams["hidden_size"]
         q_per_kv = num_heads // num_kv_heads
-        head_dim =
+        head_dim = n_embd // num_heads
         num_groups = num_heads // q_per_kv

-
-
-        if re.match(qkv_pattern, name):
-            bid = re.findall(qkv_pattern, name)[0]
+        if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
            qkv = data_torch
-
-            qkv = qkv.
-            q, k, v = qkv[
+
+            qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
+            q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
+
             # The model weights of q and k equire additional reshape.
-
-
-
-
-            # v = rearrange(v, " o g n i -> o (g n i)").T
-            v = v.reshape((v.shape[0], -1)).T
+            q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
+            k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
+            v = v.reshape((-1, v.shape[-1]))
+
             return [
                 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
                 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
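The fused `wqkv` split above is easy to check in isolation. The following is a standalone sketch, not part of the package diff, using made-up dimensions (`num_heads`, `num_kv_heads`, `n_embd` are illustrative, not from any real config); it only verifies the grouped reshape and slicing, and omits the `LlamaModel.permute` step applied to q and k in the converter.

```python
import torch

num_heads, num_kv_heads, n_embd = 8, 2, 64   # illustrative values only
q_per_kv = num_heads // num_kv_heads         # 4 query heads share one KV head
head_dim = n_embd // num_heads               # 8
num_groups = num_heads // q_per_kv           # equals num_kv_heads

# fused wqkv weight: each group holds q_per_kv query heads plus one K and one V head
wqkv = torch.randn(num_groups * (q_per_kv + 2) * head_dim, n_embd)

qkv = wqkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]

print(q.reshape((-1, q.shape[-1])).shape)  # (num_heads * head_dim, n_embd)
print(k.reshape((-1, k.shape[-1])).shape)  # (num_kv_heads * head_dim, n_embd)
print(v.reshape((-1, v.shape[-1])).shape)  # (num_kv_heads * head_dim, n_embd)
```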
@@ -2308,13 +2493,15 @@ class GemmaModel(Model):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

+        self.gguf_writer.add_add_space_prefix(False)
+
     def set_gguf_parameters(self):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -2347,14 +2534,14 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2

     def set_vocab(self):
-        self.
+        self._set_vocab_sentencepiece()
+
         self.gguf_writer.add_add_space_prefix(False)

     def set_gguf_parameters(self):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -2374,7 +2561,7 @@ class Gemma2Model(Model):
         self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid #
+        del bid # unused

         # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
         # To prevent errors, skip loading lm_head.weight.
@@ -2413,39 +2600,7 @@ class MambaModel(Model):
             self._set_vocab_sentencepiece()
         else:
             # Use the GPT-NeoX tokenizer when no tokenizer files are present
-
-            logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
-            neox_reader = gguf.GGUFReader(tokenizer_path, "r")
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
-            self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
-            self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
-            assert field
-            self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
-            assert field
-            self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
-            assert field
-            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
-            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
-            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
-            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
-
-            field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
-            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
+            self._set_vocab_builtin("gpt-neox", vocab_size)

     def set_gguf_parameters(self):
         d_model = self.find_hparam(["hidden_size", "d_model"])
@@ -2461,7 +2616,6 @@ class MambaModel(Model):
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model

-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2568,18 +2722,20 @@ class JinaBertV2Model(BertModel):

     def get_tensors(self):
         for name, data in super().get_tensors():
-            if '
+            if 'gated_layer' in name:
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
+                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
+                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
                 yield name1, d1
                 yield name2, d2
                 continue

             yield name, data

-    def set_vocab(self
+    def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
             tokenizer_class = json.load(f)['tokenizer_class']
@@ -2595,6 +2751,81 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_eos_token(True)


+@Model.register("OpenELMForCausalLM")
+class OpenELMModel(Model):
+    model_arch = gguf.MODEL_ARCH.OPENELM
+
+    @staticmethod
+    def _make_divisible(v: float | int, divisor: int) -> int:
+        # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
+        new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
+        # Make sure that round down does not go down by more than 10%.
+        if new_v < 0.9 * v:
+            new_v += divisor
+        return new_v
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
+        ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
+        self._n_embd: int = self.hparams["model_dim"]
+        self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
+        self._num_query_heads: list[int] = self.hparams["num_query_heads"]
+        self._ffn_dims: list[int] = [
+            OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
+            for multiplier in ffn_multipliers
+        ]
+        assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+        assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
+
+    # Uses the tokenizer from meta-llama/Llama-2-7b-hf
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
+
+    def set_gguf_parameters(self):
+        n_embd = self._n_embd
+        head_dim = self.hparams["head_dim"]
+        rot_pct = 1.0
+        assert self.block_count == len(self._num_kv_heads)
+        assert self.block_count == len(self._num_query_heads)
+        assert self.block_count == len(self._ffn_dims)
+
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams["max_context_length"])
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+        self.gguf_writer.add_head_count(self._num_query_heads)
+        self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
+        # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
+        self.gguf_writer.add_layer_norm_rms_eps(1e-6)
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
+        self.gguf_writer.add_key_length(head_dim)
+        self.gguf_writer.add_value_length(head_dim)
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        if "n_layers" in keys:
+            return self.hparams["num_transformer_layers"]
+
+        return super().find_hparam(keys, optional)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        # split ff
+        if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
+            ff_dim = self._ffn_dims[bid]
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
+            return
+
+        yield (self.map_tensor_name(name), data_torch)
+
+
 @Model.register("ArcticForCausalLM")
 class ArcticModel(Model):
     model_arch = gguf.MODEL_ARCH.ARCTIC
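The new OpenELM converter derives each layer's FFN width from a multiplier and a divisor via `_make_divisible`. The snippet below is a standalone sketch, not part of the diff, that reproduces that rounding rule with made-up values (`model_dim`, `ffn_multipliers`, and the divisor 256 are illustrative only) so the "never round down by more than 10%" behaviour can be seen directly.

```python
def make_divisible(v, divisor):
    # round to the nearest multiple of divisor, but never below divisor itself
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:  # don't round down by more than 10%
        new_v += divisor
    return new_v

model_dim = 1280
ffn_multipliers = [0.5, 1.0, 2.77, 4.0]   # made-up values for illustration
ffn_dims = [make_divisible(m * model_dim, 256) for m in ffn_multipliers]
print(ffn_dims)  # [768, 1280, 3584, 5120]
```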
@@ -2619,7 +2850,7 @@ class ArcticModel(Model):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -2652,7 +2883,7 @@ class ArcticModel(Model):
                     added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
                     for token_id, token_json in added_tokens_decoder.items():
                         token_id = int(token_id)
-                        if
+                        if token_id >= vocab_size:
                             logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                             continue

@@ -2736,8 +2967,8 @@ class ArcticModel(Model):

         return [(self.map_tensor_name(name), data_torch)]

-    def
-        super().
+    def prepare_tensors(self):
+        super().prepare_tensors()

         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -2746,35 +2977,555 @@ class ArcticModel(Model):
                raise ValueError(f"Unprocessed experts: {experts}")


-
+@Model.register("DeepseekV2ForCausalLM")
+class DeepseekV2Model(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2

+    def set_vocab(self):
+        self._set_vocab_gpt2()

-
-
-
-    # to keep the type-checker happy
-    dtype: torch.dtype
-    shape: torch.Size
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams

-
-
-
-
-
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("T5WithLMHeadModel")
+@Model.register("T5ForConditionalGeneration")
+@Model.register("MT5ForConditionalGeneration")
+@Model.register("UMT5ForConditionalGeneration")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.shared_token_embeddings_found = False
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        # many older models use spiece.model tokenizer model filename
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
+        if sentencepiece_model.trainer_spec.model_type == 2: # BPE
+            # assure the tokenizer model file name is correct
+            assert tokenizer_path.name == 'tokenizer.model'
+            return self._set_vocab_sentencepiece()
+        else:
+            assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+            n_ctx = 512
+        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+        # and decoder and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("JAISLMHeadModel")
+class JaisModel(Model):
+    model_arch = gguf.MODEL_ARCH.JAIS
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # ALiBi position embedding
+        assert self.hparams["position_embedding_type"] == "alibi"
+
+        # Embeddings scale
+        self.embeddings_scale = 1.0
+        # note: For some JAIS flavors, output is tied to (same as) wte in original model
+        self.output_is_wte = False
+        if 'mup_embeddings_scale' in self.hparams:
+            self.output_is_wte = True # Hack (?)
+            self.embeddings_scale = self.hparams['mup_embeddings_scale']
+        elif 'embeddings_scale' in self.hparams:
+            self.embeddings_scale = self.hparams['embeddings_scale']
+        else:
+            assert False
+
+        self.width_scale = 1.0
+        if 'mup_output_alpha' in self.hparams:
+            assert 'mup_width_scale' in self.hparams
+            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
+        elif 'width_scale' in self.hparams:
+            self.width_scale = self.hparams['width_scale']
+        else:
+            assert False
+
+        self.max_alibi_bias = 8.0
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        # we don't need these
+        if name.endswith((".attn.bias")):
+            return tensors
+
+        if name.endswith(("relative_pe.slopes")):
+            # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
+            # Some other models has max_alibi_bias spelled out explicitly in the hyperparams,
+            # but Jais's PyTorch model simply precalculates the slope values and places them
+            # in relative_pes.slopes
+            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
+            first_val = float(data_torch[0].item())
+            self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
+
+            return tensors
+
+        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
+            data_torch = data_torch.transpose(1, 0)
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
+            tensors.append((new_name, data_torch * self.embeddings_scale))
+            if self.output_is_wte:
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
+        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            assert not self.output_is_wte
+            tensors.append((new_name, data_torch * self.width_scale))
+        else:
+            tensors.append((new_name, data_torch))
+
+        return tensors
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+
+
+@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytes] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
+                else:
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+
+###### CONVERSION LOGIC ######
+
+
+# tree of lazy tensors
+class LazyTorchTensor(gguf.LazyBase):
+    _tensor_type = torch.Tensor
+    # to keep the type-checker happy
+    dtype: torch.dtype
+    shape: torch.Size
+
+    # only used when converting a torch.Tensor to a np.ndarray
+    _dtype_map: dict[torch.dtype, type] = {
+        torch.float16: np.float16,
+        torch.float32: np.float32,
+    }
+
+    # used for safetensors slices
+    # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
+    # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
+    _dtype_str_map: dict[str, torch.dtype] = {
+        "F64": torch.float64,
+        "F32": torch.float32,
+        "BF16": torch.bfloat16,
+        "F16": torch.float16,
+        # "U64": torch.uint64,
+        "I64": torch.int64,
+        # "U32": torch.uint32,
+        "I32": torch.int32,
+        # "U16": torch.uint16,
+        "I16": torch.int16,
+        "U8": torch.uint8,
+        "I8": torch.int8,
+        "BOOL": torch.bool,
+        "F8_E4M3": torch.float8_e4m3fn,
+        "F8_E5M2": torch.float8_e5m2,
+    }

     def numpy(self) -> gguf.LazyNumpyTensor:
         dtype = self._dtype_map[self.dtype]
         return gguf.LazyNumpyTensor(
             meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
-            lazy=self._lazy,
             args=(self,),
-            func=(lambda s: s
+            func=(lambda s: s.numpy())
         )

     @classmethod
-    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape:
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
         return torch.empty(size=shape, dtype=dtype, device="meta")

+    @classmethod
+    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
+        dtype = cls._dtype_str_map[st_slice.get_dtype()]
+        shape: tuple[int, ...] = tuple(st_slice.get_shape())
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
+        return cast(torch.Tensor, lazy)
+
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
         del types # unused
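One step added in this hunk, the DeepseekV2 expert merge, is worth seeing in isolation: per-expert 2D weights are collected and stacked into one 3D tensor per projection. The snippet below is a standalone sketch, not part of the diff, with made-up sizes (`n_experts`, `ff_dim`, `n_embd` are illustrative only).

```python
import torch

n_experts, ff_dim, n_embd = 4, 16, 8   # illustrative sizes only
# pretend these came from the per-expert safetensors entries of layer 0
experts = {
    f"model.layers.0.mlp.experts.{xid}.gate_proj.weight": torch.randn(ff_dim, n_embd)
    for xid in range(n_experts)
}

# collect the experts in id order and stack them along a new leading dimension
datas = [experts[f"model.layers.0.mlp.experts.{xid}.gate_proj.weight"] for xid in range(n_experts)]
merged = torch.stack(datas, dim=0)
print(merged.shape)  # torch.Size([4, 16, 8]) -> one tensor holding all experts of the layer
```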
@@ -2785,7 +3536,7 @@ class LazyTorchTensor(gguf.LazyBase):
         if func is torch.Tensor.numpy:
             return args[0].numpy()

-        return
+        return cls._wrap_fn(func)(*args, **kwargs)


 def parse_args() -> argparse.Namespace:
@@ -2795,10 +3546,6 @@ def parse_args() -> argparse.Namespace:
         "--vocab-only", action="store_true",
         help="extract only the vocab",
     )
-    parser.add_argument(
-        "--awq-path", type=Path, default=None,
-        help="Path to scale awq cache file",
-    )
     parser.add_argument(
         "--outfile", type=Path,
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -2831,30 +3578,58 @@ def parse_args() -> argparse.Namespace:
         "--verbose", action="store_true",
         help="increase output verbosity",
     )
+    parser.add_argument(
+        "--split-max-tensors", type=int, default=0,
+        help="max tensors in each split",
+    )
+    parser.add_argument(
+        "--split-max-size", type=str, default="0",
+        help="max size per split N(M|G)",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out a split plan and exit, without writing any new files",
+    )
+    parser.add_argument(
+        "--no-tensor-first-split", action="store_true",
+        help="do not add tensors to the first split (disabled by default)"
+    )
+    parser.add_argument(
+        "--metadata", type=Path,
+        help="Specify the path for an authorship metadata override file"
+    )

     return parser.parse_args()


+def split_str_to_n_bytes(split_str: str) -> int:
+    if split_str.endswith("K"):
+        n = int(split_str[:-1]) * 1000
+    elif split_str.endswith("M"):
+        n = int(split_str[:-1]) * 1000 * 1000
+    elif split_str.endswith("G"):
+        n = int(split_str[:-1]) * 1000 * 1000 * 1000
+    elif split_str.isnumeric():
+        n = int(split_str)
+    else:
+        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+    if n < 0:
+        raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+    return n
+
+
 def main() -> None:
     args = parse_args()

-
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)

     dir_model = args.model

-    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
-        tmp_model_path = args.model / "weighted_model"
-        dir_model = tmp_model_path
-        if tmp_model_path.is_dir():
-            logger.info(f"{tmp_model_path} exists as a weighted model.")
-        else:
-            tmp_model_path.mkdir(parents=True, exist_ok=True)
-            logger.info("Saving new weighted model ...")
-            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
-            logger.info(f"Saved weighted model at {tmp_model_path}.")
-
     if not dir_model.is_dir():
         logger.error(f'Error: {args.model} is not a directory')
         sys.exit(1)
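The split-size string accepted by the new --split-max-size option uses decimal K/M/G suffixes, as the added `split_str_to_n_bytes` helper shows. The snippet below is a standalone sketch, not part of the diff, that mirrors that parsing so the CLI semantics can be checked in isolation; the example inputs are illustrative.

```python
def split_str_to_n_bytes(split_str: str) -> int:
    # mirrors the helper added in this hunk: decimal units, "0" disables size-based splitting
    if split_str.endswith("K"):
        n = int(split_str[:-1]) * 1000
    elif split_str.endswith("M"):
        n = int(split_str[:-1]) * 1000 * 1000
    elif split_str.endswith("G"):
        n = int(split_str[:-1]) * 1000 * 1000 * 1000
    elif split_str.isnumeric():
        n = int(split_str)
    else:
        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
    if n < 0:
        raise ValueError(f"Invalid split size: {split_str}, must be positive")
    return n

print(split_str_to_n_bytes("500M"))  # 500000000 (decimal, not 2**20-based)
print(split_str_to_n_bytes("2G"))    # 2000000000
print(split_str_to_n_bytes("0"))     # 0 -> size-based splitting disabled
```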
@@ -2867,37 +3642,48 @@ def main() -> None:
         "auto": gguf.LlamaFileType.GUESSED,
     }

+    is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
+    if args.use_temp_file and is_split:
+        logger.error("Error: Cannot use temp file when splitting")
+        sys.exit(1)
+
     if args.outfile is not None:
         fname_out = args.outfile
     else:
-
-        fname_out = dir_model / 'ggml-model-{ftype}.gguf'
+        fname_out = dir_model

     logger.info(f"Loading model: {dir_model.name}")

     hparams = Model.load_hparams(dir_model)

     with torch.inference_mode():
-
-
+        output_type = ftype_map[args.outtype]
+        model_architecture = hparams["architectures"][0]

-
-
-
-
-
+        try:
+            model_class = Model.from_model_architecture(model_architecture)
+        except NotImplementedError:
+            logger.error(f"Model {model_architecture} is not supported")
+            sys.exit(1)

-        model_instance
+        model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
+                                     is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
+                                     eager=args.no_lazy,
+                                     metadata_override=args.metadata, model_name=args.model_name,
+                                     split_max_tensors=args.split_max_tensors,
+                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+                                     small_first_shard=args.no_tensor_first_split)

         if args.vocab_only:
-            logger.info(
+            logger.info("Exporting model vocab...")
             model_instance.write_vocab()
+            logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
         else:
-            logger.info(
+            logger.info("Exporting model...")
             model_instance.write()
-
-
+            out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
+            logger.info(f"Model successfully exported to {out_path}")


 if __name__ == '__main__':
-    main()
+    main()