bigdl-core-cpp 2.5.0b20240527__py3-none-win_amd64.whl → 2.5.0b20240529__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert-hf-to-gguf.py +1363 -338
- bigdl/cpp/convert.py +199 -52
- bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
- bigdl/cpp/gguf-py/gguf/constants.py +102 -28
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +9 -5
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +36 -11
- bigdl/cpp/gguf-py/gguf/lazy.py +236 -0
- bigdl/cpp/gguf-py/gguf/quants.py +123 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +28 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +3 -3
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/ggml_shared.dll +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.5.0b20240529.dist-info/RECORD +61 -0
- bigdl_core_cpp-2.5.0b20240527.dist-info/RECORD +0 -59
- {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-ollama.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/top_level.txt +0 -0
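Most of the changes to bigdl/cpp/convert-hf-to-gguf.py below replace the per-model write_tensors loops with a modify_tensors hook on Model subclasses; the base class then handles block-id detection, dtype selection and tensor writing. A minimal sketch of the new subclass pattern, assuming the Model base class, Model.register and map_tensor_name exactly as they appear in the diff below (the architecture name and the trivial passthrough are placeholders, not code from the package):

# Sketch only: the modify_tensors hook introduced in this version's convert-hf-to-gguf.py.
# "MyForCausalLM" is a hypothetical architecture; Model, gguf and Tensor come from the script itself.
from typing import Iterable

@Model.register("MyForCausalLM")
class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA  # every subclass must define model_arch

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # return one or more (gguf_name, tensor) pairs per source tensor;
        # returning [] defers a tensor (e.g. while collecting expert weights to stack later)
        del bid  # unused in this sketch
        return [(self.map_tensor_name(name), data_torch)]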
bigdl/cpp/convert-hf-to-gguf.py
CHANGED
@@ -9,12 +9,16 @@ import json
 import os
 import re
 import sys
-from abc import ABC, abstractmethod
 from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
+<<<<<<< HEAD
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
+=======
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+>>>>>>> uupstream/master

+import math
 import numpy as np
 import torch

@@ -25,7 +29,9 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

-from convert import LlamaHfVocab
+from convert import LlamaHfVocab
+
+logger = logging.getLogger("hf-to-gguf")

 logger = logging.getLogger("hf-to-gguf")

@@ -44,29 +50,79 @@ class SentencePieceTokenTypes(IntEnum):
 AnyModel = TypeVar("AnyModel", bound="type[Model]")


-class Model(ABC):
+class Model:
     _model_classes: dict[str, type[Model]] = {}

+<<<<<<< HEAD
     def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
+=======
+    dir_model: Path
+    ftype: int
+    is_big_endian: bool
+    endianess: gguf.GGUFEndian
+    use_temp_file: bool
+    lazy: bool
+    part_names: list[str]
+    is_safetensors: bool
+    hparams: dict[str, Any]
+    block_count: int
+    tensor_map: gguf.TensorNameMap
+    tensor_names: set[str] | None
+    fname_out: Path
+    gguf_writer: gguf.GGUFWriter
+
+    # subclasses should define this!
+    model_arch: gguf.MODEL_ARCH
+
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
+        if type(self) is Model:
+            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
+>>>>>>> uupstream/master
         self.dir_model = dir_model
         self.ftype = ftype
-        self.fname_out = fname_out
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
+<<<<<<< HEAD
         self.is_safetensors = self._is_model_safetensors()
         self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
         self.part_names = self._get_part_names()
         self.hparams = Model.load_hparams(self.dir_model)
         self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+=======
+        self.lazy = not eager
+        self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+        self.is_safetensors = len(self.part_names) > 0
+        if not self.is_safetensors:
+            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+        self.hparams = Model.load_hparams(self.dir_model)
+>>>>>>> uupstream/master
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        self.tensor_names = None
+        if self.ftype == gguf.LlamaFileType.GUESSED:
+            # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
+            _, first_tensor = next(self.get_tensors())
+            if first_tensor.dtype == torch.float16:
+                logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
+                self.ftype = gguf.LlamaFileType.MOSTLY_F16
+            else:
+                logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
+                self.ftype = gguf.LlamaFileType.MOSTLY_BF16
+        ftype_up: str = self.ftype.name.partition("_")[2].upper()
+        ftype_lw: str = ftype_up.lower()
+        # allow templating the file name with the output ftype, useful with the "auto" ftype
+        self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
+        self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)

-    @
-
-
-
+    @classmethod
+    def __init_subclass__(cls):
+        # can't use an abstract property, because overriding it without type errors
+        # would require using decorated functions instead of simply defining the property
+        if "model_arch" not in cls.__dict__:
+            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")

-    def find_hparam(self, keys:
+    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
         key = next((k for k in keys if k in self.hparams), None)
         if key is not None:
             return self.hparams[key]
@@ -78,6 +134,22 @@ class Model(ABC):
         self._set_vocab_gpt2()

     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        tensor_names_from_parts: set[str] = set()
+
+        if len(self.part_names) > 1:
+            self.tensor_names = set()
+            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+            index_name += ".index.json"
+            logger.info(f"gguf: loading model weight map from '{index_name}'")
+            with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
+                index: dict[str, Any] = json.load(f)
+                weight_map = index.get("weight_map")
+                if weight_map is None or not isinstance(weight_map, dict):
+                    raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
+                self.tensor_names.update(weight_map.keys())
+        else:
+            self.tensor_names = tensor_names_from_parts
+
         for part_name in self.part_names:
             logger.info(f"gguf: loading model part '{part_name}'")
             ctx: ContextManager[Any]
@@ -88,10 +160,46 @@ class Model(ABC):
                 ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

             with ctx as model_part:
+                tensor_names_from_parts.update(model_part.keys())
+
                 for name in model_part.keys():
                     data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
+                    if self.lazy:
+                        data = LazyTorchTensor.from_eager(data)
                     yield name, data

+        # only verify tensor name presence; it doesn't matter if they are not in the right files
+        if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
+
+    def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
+        if key not in gguf.MODEL_TENSORS[self.model_arch]:
+            raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
+        name: str = gguf.TENSOR_NAMES[key]
+        if "{bid}" in name:
+            assert bid is not None
+            name = name.format(bid=bid)
+        return name + suffix
+
+    def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
+        if key not in gguf.MODEL_TENSORS[self.model_arch]:
+            return False
+        key_name: str = gguf.TENSOR_NAMES[key]
+        if "{bid}" in key_name:
+            if bid is None:
+                return False
+            key_name = key_name.format(bid=bid)
+        else:
+            if bid is not None:
+                return False
+        return name == (key_name + suffix)
+
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+        new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
+        if new_name is None:
+            raise ValueError(f"Can not map tensor {name!r}")
+        return new_name
+
     def set_gguf_parameters(self):
         self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_block_count(self.block_count)
@@ -134,13 +242,31 @@ class Model(ABC):

         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
+<<<<<<< HEAD
+=======
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+        del name, new_name, bid, n_dims  # unused
+
+        return False
+
+    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+        del name, new_name, bid, n_dims  # unused
+
+        return False
+>>>>>>> uupstream/master

     def write_tensors(self):
-
-
+        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
+
         for name, data_torch in self.get_tensors():
             # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue

             old_dtype = data_torch.dtype
@@ -149,37 +275,97 @@ class Model(ABC):
             if data_torch.dtype not in (torch.float16, torch.float32):
                 data_torch = data_torch.to(torch.float32)

-
+            # use the first number-like part of the tensor name as the block id
+            bid = None
+            for part in name.split("."):
+                if part.isdecimal():
+                    bid = int(part)
+                    break

+<<<<<<< HEAD
             # map tensor names
             new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
             if new_name is None:
                 raise ValueError(f"Can not map tensor {name!r}")
-
-
-
-
-
-
-
-
+=======
+            for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
+                data: np.ndarray = data  # type hint
+                n_dims = len(data.shape)
+                data_dtype = data.dtype
+                data_qtype: gguf.GGMLQuantizationType | None = None
+>>>>>>> uupstream/master
+
+                # when both are True, f32 should win
+                extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
+                extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+
+                # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
+                extra_f32 = any(cond for cond in (
+                    extra_f32,
+                    n_dims == 1,
+                    new_name.endswith("_norm.weight"),
+                ))
+
+<<<<<<< HEAD
             # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
             if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
                 data = data.astype(np.float32)
+=======
+                # Some tensor types are always in float32
+                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
+                    gguf.MODEL_TENSOR.FFN_GATE_INP,
+                    gguf.MODEL_TENSOR.POS_EMBD,
+                    gguf.MODEL_TENSOR.TOKEN_TYPES,
+                ))
+>>>>>>> uupstream/master

-
-
-
+                # if f16 desired, convert any float32 2-dim weight tensors to float16
+                extra_f16 = any(cond for cond in (
+                    extra_f16,
+                    (name.endswith(".weight") and n_dims >= 2),
+                ))

+<<<<<<< HEAD
             logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+=======
+                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
+                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data = gguf.quantize_bf16(data)
+                        assert data.dtype == np.int16
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+>>>>>>> uupstream/master

-
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
+                        data = gguf.quantize_q8_0(data)
+                        assert data.dtype == np.uint8
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+
+                    else:  # default to float16 for quantized tensors
+                        if data_dtype != np.float16:
+                            data = data.astype(np.float16)
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
+                if data_qtype is None:  # by default, convert to float32
+                    if data_dtype != np.float32:
+                        data = data.astype(np.float32)
+                    data_qtype = gguf.GGMLQuantizationType.F32
+
+                shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
+
+                # reverse shape to make it similar to the internal ggml dimension order
+                shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
+
+                # n_dims is implicit in the shape
+                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+
+                self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

     def write(self):
         self.write_tensors()
         self.gguf_writer.write_header_to_file()
         self.gguf_writer.write_kv_data_to_file()
-        self.gguf_writer.write_tensors_to_file()
+        self.gguf_writer.write_tensors_to_file(progress=True)
         self.gguf_writer.close()

     def write_vocab(self):
@@ -188,16 +374,18 @@ class Model(ABC):
         self.gguf_writer.close()

     @staticmethod
-    def
-
+    def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+        part_names: list[str] = []
         for filename in os.listdir(dir_model):
-            if filename.endswith(
-
+            if filename.endswith(suffix):
+                part_names.append(filename)
+
+        part_names.sort()

-        return
+        return part_names

     @staticmethod
-    def load_hparams(dir_model):
+    def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
             return json.load(f)

@@ -205,19 +393,20 @@ class Model(ABC):
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
         assert names

-        def func(modelcls:
+        def func(modelcls: AnyModel) -> AnyModel:
             for name in names:
                 cls._model_classes[name] = modelcls
             return modelcls
         return func

     @classmethod
-    def from_model_architecture(cls, arch):
+    def from_model_architecture(cls, arch: str) -> type[Model]:
         try:
             return cls._model_classes[arch]
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+<<<<<<< HEAD
     def _is_model_safetensors(self) -> bool:
         return Model.count_model_parts(self.dir_model, ".safetensors") > 0

@@ -231,6 +420,8 @@ class Model(ABC):
             return ("pytorch_model.bin",)
         return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

+=======
+>>>>>>> uupstream/master
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -265,6 +456,10 @@ class Model(ABC):
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+<<<<<<< HEAD
+=======
+    # Marker: Start get_vocab_base_pre
+>>>>>>> uupstream/master
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -308,15 +503,45 @@ class Model(ABC):
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+<<<<<<< HEAD
+=======
+        if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
+            # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
+            res = "stablelm2"
+>>>>>>> uupstream/master
         if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
            res = "refact"
         if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
+<<<<<<< HEAD
+        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
+            res = "olmo"
+=======
+        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
+            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
+            res = "qwen2"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
             res = "olmo"
+        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
+            # ref: https://huggingface.co/databricks/dbrx-base
+            res = "dbrx"
+        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
+            res = "jina-v2-en"
+        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
+            res = "jina-v2-es"
+        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
+            res = "jina-v2-de"
+        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
+            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
+            res = "smaug-bpe"
+>>>>>>> uupstream/master

         if res is None:
             logger.warning("\n")
@@ -337,6 +562,10 @@ class Model(ABC):
         logger.debug(f"chkhsh: {chkhsh}")

         return res
+<<<<<<< HEAD
+=======
+    # Marker: End get_vocab_base_pre
+>>>>>>> uupstream/master

     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
@@ -374,7 +603,7 @@ class Model(ABC):

         # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
         added_vocab = tokenizer.special_tokens
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}

         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -414,49 +643,66 @@ class Model(ABC):
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")

-        tokenizer = SentencePieceProcessor(
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
         for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.
+            piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
-            score = tokenizer.
+            score = tokenizer.GetScore(token_id)

             toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.
+            if tokenizer.IsUnknown(token_id):
                 toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.
+            elif tokenizer.IsControl(token_id):
                 toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.
+            elif tokenizer.IsUnused(token_id):
                 toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.
+            elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE

-            tokens
-            scores
-            toktypes
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype

         added_tokens_file = self.dir_model / 'added_tokens.json'
         if added_tokens_file.is_file():
             with open(added_tokens_file, "r", encoding="utf-8") as f:
                 added_tokens_json = json.load(f)
-
                 for key in added_tokens_json:
-
-                    if
-
-
-                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+                token_id = added_tokens_json[key]
+                if (token_id >= vocab_size):
+                    logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                    continue

+<<<<<<< HEAD
+=======
+                tokens[token_id] = key.encode("utf-8")
+                scores[token_id] = -1000.0
+                toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+>>>>>>> uupstream/master
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
             for i in range(1, pad_count + 1):
+<<<<<<< HEAD
                 tokens.append(f"[PAD{i}]")
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)

         assert len(tokens) == vocab_size
+=======
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+>>>>>>> uupstream/master

         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
@@ -509,6 +755,44 @@ class GPTNeoXModel(Model):
         self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
+            # Map bloom-style qkv_linear to gpt-style qkv_linear
+            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
+            data_torch = torch.cat(
+                (
+                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.weight")
+        elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
+            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
+            data_torch = torch.cat(
+                (
+                    qkv_bias[:, 0, :].reshape((n_embed,)),
+                    qkv_bias[:, 1, :].reshape((n_embed,)),
+                    qkv_bias[:, 2, :].reshape((n_embed,)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.bias")
+
+        tensors.append((self.map_tensor_name(name), data_torch))
+
+        return tensors
+

 @Model.register("BloomForCausalLM")
 class BloomModel(Model):
@@ -527,28 +811,48 @@ class BloomModel(Model):
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
         self.gguf_writer.add_file_type(self.ftype)

-    def
-
-
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        has_lm_head = True
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))

-
-
-
-
-
-
-
+        name = re.sub(r'transformer\.', '', name)
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+            # Map bloom-style qkv_linear to gpt-style qkv_linear
+            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
+            data_torch = torch.cat(
+                (
+                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.weight")
+        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
+            data_torch = torch.cat(
+                (
+                    qkv_bias[:, 0, :].reshape((n_embed,)),
+                    qkv_bias[:, 1, :].reshape((n_embed,)),
+                    qkv_bias[:, 2, :].reshape((n_embed,)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.bias")

-
-        if data_torch.dtype not in (torch.float16, torch.float32):
-            data_torch = data_torch.to(torch.float32)
+        tensors.append((self.map_tensor_name(name), data_torch))

-
+        if name == "word_embeddings.weight":
+            assert self.tensor_names is not None

+<<<<<<< HEAD
             if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
                 # Map bloom-style qkv_linear to gpt-style qkv_linear
                 # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
@@ -602,6 +906,13 @@ class BloomModel(Model):
             if not has_lm_head and name == "word_embeddings.weight":
                 self.gguf_writer.add_tensor("output.weight", data)
                 logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+=======
+            # TODO: tie them at runtime, don't duplicate in the model file
+            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+
+        return tensors
+>>>>>>> uupstream/master


 @Model.register("MPTForCausalLM")
@@ -637,16 +948,16 @@ class MPTModel(Model):
         else:
             self.gguf_writer.add_max_alibi_bias(0.0)

-    def
-
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused

-
+        if "scales" in name:
+            new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales"))
+            new_name = new_name.replace("scales", "act.scales")
+        else:
+            new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias"))

+<<<<<<< HEAD
             # convert any unsupported data types to float32
             if data_torch.dtype not in (torch.float16, torch.float32):
                 data_torch = data_torch.to(torch.float32)
@@ -681,6 +992,9 @@ class MPTModel(Model):
             logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

             self.gguf_writer.add_tensor(new_name, data)
+=======
+        return [(new_name, data_torch)]
+>>>>>>> uupstream/master


 @Model.register("OrionForCausalLM")
@@ -720,6 +1034,7 @@ class OrionModel(Model):
         # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
         self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

+<<<<<<< HEAD
     def write_tensors(self):
         # Collect tensors from generator object
         model_kv = dict(self.get_tensors())
@@ -762,6 +1077,8 @@ class OrionModel(Model):
             logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
             self.gguf_writer.add_tensor(new_name, data)

+=======
+>>>>>>> uupstream/master

 @Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
 class BaichuanModel(Model):
@@ -797,20 +1114,18 @@ class BaichuanModel(Model):
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)

         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

-    def
-        # Collect tensors from generator object
-        model_kv = dict(self.get_tensors())
-        block_count = self.hparams["num_hidden_layers"]
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         head_count = self.hparams["num_attention_heads"]
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)

+<<<<<<< HEAD
         for i in range(block_count):
             if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
                 logger.info(f"Unpacking and permuting layer {i}")
@@ -821,12 +1136,24 @@ class BaichuanModel(Model):
                 model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
                     self._reverse_hf_part(w, 2)
                 del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
+=======
+        tensors: list[tuple[str, Tensor]] = []
+>>>>>>> uupstream/master
+
+        if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight":
+            logger.info(f"Unpacking and permuting layer {bid}")
+            tensors = [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid),
+                 self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid),
+                 self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid),
+                 self._reverse_hf_part(data_torch, 2)),
+            ]
+        else:
+            tensors = [(self.map_tensor_name(name), data_torch)]

-
-            # we don't need these
-            if name.endswith(".rotary_emb.inv_freq"):
-                continue
-
+<<<<<<< HEAD
             old_dtype = data_torch.dtype

             # convert any unsupported data types to float32
@@ -857,6 +1184,9 @@ class BaichuanModel(Model):

             logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
             self.gguf_writer.add_tensor(new_name, data)
+=======
+        return tensors
+>>>>>>> uupstream/master

     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -888,7 +1218,7 @@ class XverseModel(Model):
         dir_model = self.dir_model
         hparams = self.hparams

-        tokens: list[
+        tokens: list[bytes] = []
         toktypes: list[int] = []

         from transformers import AutoTokenizer
@@ -896,7 +1226,7 @@ class XverseModel(Model):
         vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size

-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

         for token_id in range(vocab_size):
@@ -953,25 +1283,26 @@ class XverseModel(Model):
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)

         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

-    def
-        #
-
-        block_count = self.hparams["num_hidden_layers"]
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
         head_count = self.hparams["num_attention_heads"]
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)

-
-
-
-
+        # HF models permute some of the tensors, so we need to undo that
+        if name.endswith("q_proj.weight"):
+            data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
+        if name.endswith("k_proj.weight"):
+            data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)

+<<<<<<< HEAD
             old_dtype = data_torch.dtype

             # convert any unsupported data types to float32
@@ -1008,6 +1339,9 @@ class XverseModel(Model):

             logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
             self.gguf_writer.add_tensor(new_name, data)
+=======
+        return [(self.map_tensor_name(name), data_torch)]
+>>>>>>> uupstream/master

     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -1048,22 +1382,31 @@ class FalconModel(Model):
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
         self.gguf_writer.add_file_type(self.ftype)

-    def
-
-
-
-
-        n_head
-
-
-
-        n_head_kv
-
-
-
-
-
-
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # QKV tensor transform
+        # The original query_key_value tensor contains n_head_kv "kv groups",
+        # each consisting of n_head/n_head_kv query weights followed by one key
+        # and one value weight (shared by all query heads in the kv group).
+        # This layout makes it a big pain to work with in GGML.
+        # So we rearrange them here,, so that we have n_head query weights
+        # followed by n_head_kv key weights followed by n_head_kv value weights,
+        # in contiguous fashion.
+        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+        if "query_key_value" in name:
+            n_head = self.find_hparam(["num_attention_heads", "n_head"])
+            n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1
+            head_dim = self.hparams["hidden_size"] // n_head
+
+            qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+            q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
+            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
+
+<<<<<<< HEAD
         for name, data_torch in self.get_tensors():
             old_dtype = data_torch.dtype

@@ -1113,6 +1456,9 @@ class FalconModel(Model):
             logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

             self.gguf_writer.add_tensor(new_name, data)
+=======
+        return [(self.map_tensor_name(name), data_torch)]
+>>>>>>> uupstream/master


 @Model.register("GPTBigCodeForCausalLM")
@@ -1137,6 +1483,18 @@ class StarCoderModel(Model):
 class RefactModel(Model):
     model_arch = gguf.MODEL_ARCH.REFACT

+    def set_vocab(self):
+        super().set_vocab()
+
+        # TODO: how to determine special FIM tokens automatically?
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+        special_vocab._set_special_token("prefix", 1)
+        special_vocab._set_special_token("suffix", 3)
+        special_vocab._set_special_token("middle", 2)
+        special_vocab._set_special_token("fsep", 4)  # is this correct?
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def set_gguf_parameters(self):
         hidden_dim = self.hparams["n_embd"]
         inner_dim = 4 * hidden_dim
@@ -1158,7 +1516,7 @@ class RefactModel(Model):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
         self.gguf_writer.add_file_type(self.ftype)

-    def
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         hidden_dim = self.hparams["n_embd"]
         inner_dim = 4 * hidden_dim
         hidden_dim = int(2 * inner_dim / 3)
@@ -1167,27 +1525,23 @@ class RefactModel(Model):
         n_head = self.hparams["n_head"]
         n_head_kv = 1
         head_dim = self.hparams["n_embd"] // n_head
-        block_count = self.hparams["n_layer"]

-
+        tensors: list[tuple[str, Tensor]] = []

-
-
-
-        tensors
-
-
-
-        tensors
-
-            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
-                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
-                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
-                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
-
-        for name, data_torch in tensors.items():
-            old_dtype = data_torch.dtype
+        if bid is not None:
+            if name == f"transformer.h.{bid}.attn.kv.weight":
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim]))
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:]))
+            elif name == f"transformer.h.{bid}.attn.q.weight":
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch))
+            elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight":
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]))
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]))

+        if len(tensors) == 0:
+            tensors.append((self.map_tensor_name(name), data_torch))
+
+<<<<<<< HEAD
             # convert any unsupported data types to float32
             if data_torch.dtype not in (torch.float16, torch.float32):
                 data_torch = data_torch.to(torch.float32)
@@ -1267,6 +1621,9 @@ class PersimmonModel(Model):
             n_dims = len(data.shape)
             logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
             self.gguf_writer.add_tensor(new_name, data)
+=======
+        return tensors
+>>>>>>> uupstream/master


 @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
@@ -1295,6 +1652,69 @@ class StableLMModel(Model):
         self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+        self.gguf_writer.add_file_type(self.ftype)
+
+    _q_norms: list[dict[str, Tensor]] | None = None
+    _k_norms: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams["num_key_value_heads"]
+
+        if name.find("q_layernorm.norms") != -1:
+            assert bid is not None
+
+            if self._q_norms is None:
+                self._q_norms = [{} for _ in range(self.block_count)]
+
+            self._q_norms[bid][name] = data_torch
+
+            if len(self._q_norms[bid]) >= n_head:
+                return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm")
+            else:
+                return []
+
+        if name.find("k_layernorm.norms") != -1:
+            assert bid is not None
+
+            if self._k_norms is None:
+                self._k_norms = [{} for _ in range(self.block_count)]
+
+            self._k_norms[bid][name] = data_torch
+
+            if len(self._k_norms[bid]) >= n_kv_head:
+                return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm")
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"):
+        datas: list[Tensor] = []
+        # extract the norms in order
+        for xid in range(n_head):
+            ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
+            datas.append(norms[ename])
+            del norms[ename]
+        data_torch = torch.stack(datas, dim=0)
+
+        merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
+        new_name = self.map_tensor_name(merged_name)
+
+        return [(new_name, data_torch)]
+
+    def write_tensors(self):
+        super().write_tensors()
+
+        if self._q_norms is not None or self._k_norms is not None:
+            # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
+            norms = (
+                [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else []
+            ) + (
+                [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else []
+            )
+            if len(norms) > 0:
+                raise ValueError(f"Unprocessed norms: {norms}")

     def write_tensors(self):
         block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -1413,6 +1833,7 @@ class LlamaModel(Model):
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
             self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

+<<<<<<< HEAD
     # Same as super class, but permuting q_proj, k_proj
     def write_tensors(self):
         block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -1425,64 +1846,75 @@ class LlamaModel(Model):
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue
+=======
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))

-
+    _experts: list[dict[str, Tensor]] | None = None

-
-
-
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+>>>>>>> uupstream/master

-
+        if name.endswith("q_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith("k_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

-
-
-
-                data = permute(data, n_head, n_kv_head)
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]

-
+            assert bid is not None

-
-
-                experts[name] = data
-                if len(experts) >= n_experts:
-                    # merge the experts into a single 3d tensor
-                    for bid in range(block_count):
-                        for wid in range(1, 4):
-                            full = True
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
-                                if ename not in experts:
-                                    full = False
-                                    break
-                            if not full:
-                                continue
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]

-
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
-                                datas.append(experts[ename])
-                                del experts[ename]
+            self._experts[bid][name] = data_torch

-
-
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []

-
-
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []

-
-
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)

-
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"

+                    new_name = self.map_tensor_name(merged_name)
+
+<<<<<<< HEAD
                             new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
                             if new_name is None:
                                 raise ValueError(f"Can not map tensor {name!r}")

                            logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+=======
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []

-
-
+        return [(self.map_tensor_name(name), data_torch)]
+>>>>>>> uupstream/master

+    def write_tensors(self):
+        super().write_tensors()
+
+<<<<<<< HEAD
             # map tensor names
             new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
             if new_name is None:
@@ -1509,6 +1941,13 @@ class LlamaModel(Model):

         if len(experts) > 0:
             raise ValueError(f"Unprocessed experts: {experts.keys()}")
+=======
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+>>>>>>> uupstream/master


 @Model.register("GrokForCausalLM")
@@ -1525,86 +1964,79 @@ class GrokModel(Model):
         super().set_gguf_parameters()
         self.gguf_writer.add_name("Grok")

-
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_experts = self.hparams.get("num_local_experts")
-        experts = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
+    _experts: list[dict[str, Tensor]] | None = None

-
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find(".moe.") != -1:
+            n_experts = self.hparams["num_local_experts"]

-
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
+            assert bid is not None

-
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]

-
-            if name.find(".moe.") != -1:
-                experts[name] = data
-                if len(experts) >= n_experts:
-                    # merge the experts into a single 3d tensor
-                    for bid in range(block_count):
-                        for wid in ["linear", "linear_1", "linear_v"]:
-                            full = True
-                            for xid in range(n_experts):
-                                ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
-                                if ename not in experts:
-                                    full = False
-                                    break
-                            if not full:
-                                continue
+            self._experts[bid][name] = data_torch

-
-
-                                ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
-                                datas.append(experts[ename])
-                                del experts[ename]
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []

-
-
+                # merge the experts into a single 3d tensor
+                for wid in ["linear", "linear_1", "linear_v"]:
+                    datas: list[Tensor] = []

-
-
+                    for xid in range(n_experts):
+                        ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]

-
-                                data = data.astype(np.float16)
+                    data_torch = torch.stack(datas, dim=0)

-
+                    merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"

+<<<<<<< HEAD
                             new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
                             if new_name is None:
                                 raise ValueError(f"Can not map tensor {name!r}")

                            logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+=======
+                    new_name = self.map_tensor_name(merged_name)

-
-
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+>>>>>>> uupstream/master

+        return [(self.map_tensor_name(name), data_torch)]
+
+<<<<<<< HEAD
             # map tensor names
             new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
             if new_name is None:
                 raise ValueError(f"Can not map tensor {name!r}")
+=======
+>>>>>>> uupstream/master

-
-
+@Model.register("DbrxForCausalLM")
+class DbrxModel(Model):
+    model_arch = gguf.MODEL_ARCH.DBRX

-
-
-
+    def set_gguf_parameters(self):
+        ffn_config = self.hparams["ffn_config"]
+        attn_config = self.hparams["attn_config"]
2029
|
+
self.gguf_writer.add_name(self.hparams["model_type"])
|
2030
|
+
self.gguf_writer.add_block_count(self.hparams["n_layers"])
|
1599
2031
|
|
1600
|
-
|
1601
|
-
|
1602
|
-
|
2032
|
+
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
|
2033
|
+
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
2034
|
+
self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
|
1603
2035
|
|
1604
|
-
|
1605
|
-
|
1606
|
-
data = data.astype(np.float16)
|
2036
|
+
self.gguf_writer.add_head_count(self.hparams["n_heads"])
|
2037
|
+
self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
|
1607
2038
|
|
2039
|
+
<<<<<<< HEAD
|
1608
2040
|
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
1609
2041
|
|
1610
2042
|
self.gguf_writer.add_tensor(new_name, data)
|
@@ -1796,8 +2228,60 @@ class DbrxModel(Model):
|
|
1796
2228
|
data = data.astype(np.float16)
|
1797
2229
|
|
1798
2230
|
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
2231
|
+
=======
|
2232
|
+
self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
|
2233
|
+
>>>>>>> uupstream/master
|
1799
2234
|
|
1800
|
-
|
2235
|
+
self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
|
2236
|
+
self.gguf_writer.add_file_type(self.ftype)
|
2237
|
+
|
2238
|
+
self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
|
2239
|
+
self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
|
2240
|
+
|
2241
|
+
self.gguf_writer.add_layer_norm_eps(1e-5)
|
2242
|
+
|
2243
|
+
self.gguf_writer.add_file_type(self.ftype)
|
2244
|
+
logger.info(f"gguf: file type = {self.ftype}")
|
2245
|
+
|
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ n_expert = self.hparams["ffn_config"]["moe_num_experts"]
+ n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
+ n_embd = self.hparams["d_model"]
+
+ # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
+ # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
+ # But llama.cpp moe graph works differently
+ # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
+ # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
+ exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
+ "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
+ "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
+ experts = False
+
+ for exp_tensor_name in exp_tensor_names.keys():
+ if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
+ experts = True
+ data_torch = data_torch.view(n_expert, n_ff, n_embd)
+ if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
+ data_torch = data_torch.permute(*permute_tensor)
+ break
+
+ # map tensor names
+ # In MoE models the ffn tensors are typically most of the model weights,
+ # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
+ # Every other model has the weight names ending in .weight,
+ # let's assume that is the convention which is not the case for dbrx:
+ # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
+ new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
+
+ return [(new_name, data_torch)]
+
+ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+ del name, new_name, bid # unused
+
+ return n_dims > 1
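For DBRX the experts arrive as one flattened 2-D tensor per projection rather than one tensor per expert, so the converter views it as (n_expert, n_ff, n_embd) and permutes only the down projection, then appends the missing `.weight` suffix so quantization treats it like any other weight. A rough illustration of the reshape this hunk performs; the sizes are invented for the example:

```python
import torch

n_expert, n_ff, n_embd = 2, 8, 4
flat = torch.randn(n_expert * n_ff, n_embd)   # e.g. an ffn.experts.mlp.* tensor as stored

w1 = flat.view(n_expert, n_ff, n_embd)                     # gate/up projections: view only
w2 = flat.view(n_expert, n_ff, n_embd).permute(0, 2, 1)    # down projection: swap the last two dims

assert w1.shape == (n_expert, n_ff, n_embd)
assert w2.shape == (n_expert, n_embd, n_ff)
```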
1801
2285
|
|
1802
2286
|
|
1803
2287
|
@Model.register("MiniCPMForCausalLM")
|
@@ -1830,18 +2314,19 @@ class MiniCPMModel(Model):
|
|
1830
2314
|
.reshape(weights.shape)
|
1831
2315
|
)
|
1832
2316
|
|
1833
|
-
def
|
1834
|
-
|
1835
|
-
|
1836
|
-
n_head = self.hparams
|
2317
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2318
|
+
del bid # unused
|
2319
|
+
|
2320
|
+
n_head = self.hparams["num_attention_heads"]
|
1837
2321
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
1838
|
-
for name, data_torch in self.get_tensors():
|
1839
|
-
# we don't need these
|
1840
|
-
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
|
1841
|
-
continue
|
1842
2322
|
|
1843
|
-
|
2323
|
+
# HF models permute some of the tensors, so we need to undo that
|
2324
|
+
if name.endswith(("q_proj.weight")):
|
2325
|
+
data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
|
2326
|
+
if name.endswith(("k_proj.weight")):
|
2327
|
+
data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
|
1844
2328
|
|
2329
|
+
<<<<<<< HEAD
|
1845
2330
|
# convert any unsupported data types to float32
|
1846
2331
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
1847
2332
|
data_torch = data_torch.to(torch.float32)
|
@@ -1877,6 +2362,9 @@ class MiniCPMModel(Model):
|
|
1877
2362
|
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
1878
2363
|
|
1879
2364
|
self.gguf_writer.add_tensor(new_name, data)
|
2365
|
+
=======
|
2366
|
+
return [(self.map_tensor_name(name), data_torch)]
|
2367
|
+
>>>>>>> uupstream/master
|
1880
2368
|
|
1881
2369
|
|
1882
2370
|
@Model.register("QWenLMHeadModel")
|
@@ -1919,6 +2407,7 @@ class QwenModel(Model):
|
|
1919
2407
|
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
1920
2408
|
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
1921
2409
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
2410
|
+
<<<<<<< HEAD
|
1922
2411
|
|
1923
2412
|
def write_tensors(self):
|
1924
2413
|
block_count = self.hparams["num_hidden_layers"]
|
@@ -1959,6 +2448,9 @@ class QwenModel(Model):
|
|
1959
2448
|
|
1960
2449
|
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
1961
2450
|
self.gguf_writer.add_tensor(new_name, data)
|
2451
|
+
=======
|
2452
|
+
self.gguf_writer.add_file_type(self.ftype)
|
2453
|
+
>>>>>>> uupstream/master
|
1962
2454
|
|
1963
2455
|
|
1964
2456
|
@Model.register("Qwen2ForCausalLM")
|
@@ -1981,6 +2473,7 @@ class Qwen2MoeModel(Model):
|
|
1981
2473
|
if (n_experts := self.hparams.get("num_experts")) is not None:
|
1982
2474
|
self.gguf_writer.add_expert_count(n_experts)
|
1983
2475
|
|
2476
|
+
<<<<<<< HEAD
|
1984
2477
|
def write_tensors(self):
|
1985
2478
|
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
1986
2479
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
@@ -2166,6 +2659,54 @@ class Qwen2MoeModel(Model):
|
|
2166
2659
|
|
2167
2660
|
if len(experts) > 0:
|
2168
2661
|
raise ValueError(f"Unprocessed experts: {experts.keys()}")
|
2662
|
+
=======
|
2663
|
+
_experts: list[dict[str, Tensor]] | None = None
|
2664
|
+
|
2665
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2666
|
+
# process the experts separately
|
2667
|
+
if name.find("experts") != -1:
|
2668
|
+
n_experts = self.hparams["num_experts"]
|
2669
|
+
assert bid is not None
|
2670
|
+
|
2671
|
+
if self._experts is None:
|
2672
|
+
self._experts = [{} for _ in range(self.block_count)]
|
2673
|
+
|
2674
|
+
self._experts[bid][name] = data_torch
|
2675
|
+
|
2676
|
+
if len(self._experts[bid]) >= n_experts * 3:
|
2677
|
+
tensors: list[tuple[str, Tensor]] = []
|
2678
|
+
|
2679
|
+
# merge the experts into a single 3d tensor
|
2680
|
+
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
2681
|
+
datas: list[Tensor] = []
|
2682
|
+
|
2683
|
+
for xid in range(n_experts):
|
2684
|
+
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
2685
|
+
datas.append(self._experts[bid][ename])
|
2686
|
+
del self._experts[bid][ename]
|
2687
|
+
|
2688
|
+
data_torch = torch.stack(datas, dim=0)
|
2689
|
+
|
2690
|
+
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
2691
|
+
|
2692
|
+
new_name = self.map_tensor_name(merged_name)
|
2693
|
+
|
2694
|
+
tensors.append((new_name, data_torch))
|
2695
|
+
return tensors
|
2696
|
+
else:
|
2697
|
+
return []
|
2698
|
+
|
2699
|
+
return [(self.map_tensor_name(name), data_torch)]
|
2700
|
+
|
2701
|
+
def write_tensors(self):
|
2702
|
+
super().write_tensors()
|
2703
|
+
|
2704
|
+
if self._experts is not None:
|
2705
|
+
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
2706
|
+
experts = [k for d in self._experts for k in d.keys()]
|
2707
|
+
if len(experts) > 0:
|
2708
|
+
raise ValueError(f"Unprocessed experts: {experts}")
|
2709
|
+
>>>>>>> uupstream/master
|
2169
2710
|
|
2170
2711
|
|
2171
2712
|
@Model.register("GPT2LMHeadModel")
|
@@ -2182,26 +2723,23 @@ class GPT2Model(Model):
|
|
2182
2723
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
2183
2724
|
self.gguf_writer.add_file_type(self.ftype)
|
2184
2725
|
|
2185
|
-
def
|
2186
|
-
|
2187
|
-
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
2726
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
2727
|
+
del bid # unused
|
2188
2728
|
|
2189
|
-
|
2190
|
-
# we don't need these
|
2191
|
-
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
|
2192
|
-
continue
|
2729
|
+
tensors: list[tuple[str, Tensor]] = []
|
2193
2730
|
|
2194
|
-
|
2195
|
-
|
2731
|
+
# we don't need these
|
2732
|
+
if name.endswith((".attn.bias", ".attn.masked_bias")):
|
2733
|
+
return tensors
|
2196
2734
|
|
2197
|
-
|
2735
|
+
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
|
2736
|
+
data_torch = data_torch.transpose(1, 0)
|
2198
2737
|
|
2199
|
-
|
2200
|
-
if data_torch.dtype not in (torch.float16, torch.float32):
|
2201
|
-
data_torch = data_torch.to(torch.float32)
|
2738
|
+
new_name = self.map_tensor_name(name)
|
2202
2739
|
|
2203
|
-
|
2740
|
+
tensors.append((new_name, data_torch))
|
2204
2741
|
|
2742
|
+
<<<<<<< HEAD
|
2205
2743
|
# map tensor names
|
2206
2744
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
2207
2745
|
if new_name is None:
|
@@ -2230,6 +2768,13 @@ class GPT2Model(Model):
|
|
2230
2768
|
if new_name == "token_embd.weight":
|
2231
2769
|
logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
2232
2770
|
self.gguf_writer.add_tensor("output.weight", data)
|
2771
|
+
=======
|
2772
|
+
# note: GPT2 output is tied to (same as) wte in original model
|
2773
|
+
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
2774
|
+
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
|
2775
|
+
|
2776
|
+
return tensors
|
2777
|
+
>>>>>>> uupstream/master
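Two details drive this GPT-2 hunk: Hugging Face's GPT-2 uses `Conv1D` modules whose weights are stored transposed relative to `nn.Linear`, so the `c_attn`/`c_proj`/`c_fc` weights are transposed on conversion, and the output projection is tied to the token embedding, so the same tensor is emitted a second time under the output name. A small sketch of the transpose; the shapes are invented for illustration:

```python
import torch

n_embd = 8
# HF Conv1D stores its weight as (in_features, out_features) ...
conv1d_weight = torch.randn(n_embd, 3 * n_embd)   # e.g. attn.c_attn.weight

# ... while the GGUF/llama.cpp convention expects the nn.Linear layout
# (out_features, in_features), hence the transpose(1, 0) in the diff
linear_weight = conv1d_weight.transpose(1, 0)

assert linear_weight.shape == (3 * n_embd, n_embd)
```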
|
2233
2778
|
|
2234
2779
|
|
2235
2780
|
@Model.register("PhiForCausalLM")
|
@@ -2269,7 +2814,12 @@ class Phi3MiniModel(Model):
|
|
2269
2814
|
if not tokenizer_path.is_file():
|
2270
2815
|
raise ValueError(f'Error: Missing {tokenizer_path}')
|
2271
2816
|
|
2817
|
+
<<<<<<< HEAD
|
2272
2818
|
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
2819
|
+
=======
|
2820
|
+
tokenizer = SentencePieceProcessor()
|
2821
|
+
tokenizer.LoadFromFile(str(tokenizer_path))
|
2822
|
+
>>>>>>> uupstream/master
|
2273
2823
|
|
2274
2824
|
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
2275
2825
|
|
@@ -2279,6 +2829,7 @@ class Phi3MiniModel(Model):
|
|
2279
2829
|
|
2280
2830
|
for token_id in range(tokenizer.vocab_size()):
|
2281
2831
|
|
2832
|
+
<<<<<<< HEAD
|
2282
2833
|
piece = tokenizer.id_to_piece(token_id)
|
2283
2834
|
text = piece.encode("utf-8")
|
2284
2835
|
score = tokenizer.get_score(token_id)
|
@@ -2291,6 +2842,20 @@ class Phi3MiniModel(Model):
|
|
2291
2842
|
elif tokenizer.is_unused(token_id):
|
2292
2843
|
toktype = SentencePieceTokenTypes.UNUSED
|
2293
2844
|
elif tokenizer.is_byte(token_id):
|
2845
|
+
=======
|
2846
|
+
piece = tokenizer.IdToPiece(token_id)
|
2847
|
+
text = piece.encode("utf-8")
|
2848
|
+
score = tokenizer.GetScore(token_id)
|
2849
|
+
|
2850
|
+
toktype = SentencePieceTokenTypes.NORMAL
|
2851
|
+
if tokenizer.IsUnknown(token_id):
|
2852
|
+
toktype = SentencePieceTokenTypes.UNKNOWN
|
2853
|
+
elif tokenizer.IsControl(token_id):
|
2854
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
2855
|
+
elif tokenizer.IsUnused(token_id):
|
2856
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
2857
|
+
elif tokenizer.IsByte(token_id):
|
2858
|
+
>>>>>>> uupstream/master
|
2294
2859
|
toktype = SentencePieceTokenTypes.BYTE
|
2295
2860
|
|
2296
2861
|
tokens[token_id] = text
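The replacement branch in this hunk switches from the old snake_case SentencePiece helpers (`id_to_piece`, `get_score`, `is_unknown`, ...) to the CamelCase methods of the current `sentencepiece` package and loads the model file explicitly. A small usage sketch, assuming a local `tokenizer.model` exists; the path is illustrative:

```python
from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.LoadFromFile("tokenizer.model")  # hypothetical path

for token_id in range(sp.vocab_size()):
    piece = sp.IdToPiece(token_id)   # text of the piece
    score = sp.GetScore(token_id)    # log-probability style score
    if sp.IsByte(token_id):          # byte-fallback tokens map to the BYTE type
        kind = "byte"
    elif sp.IsControl(token_id):
        kind = "control"
    else:
        kind = "normal"
```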
|
@@ -2312,6 +2877,41 @@ class Phi3MiniModel(Model):
|
|
2312
2877
|
scores[token_id] = -1000.0
|
2313
2878
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
2314
2879
|
|
2880
|
+
<<<<<<< HEAD
|
2881
|
+
=======
|
2882
|
+
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
2883
|
+
if tokenizer_config_file.is_file():
|
2884
|
+
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
2885
|
+
tokenizer_config_json = json.load(f)
|
2886
|
+
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
|
2887
|
+
for token_id, foken_data in added_tokens_decoder.items():
|
2888
|
+
token_id = int(token_id)
|
2889
|
+
token = foken_data["content"].encode("utf-8")
|
2890
|
+
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
2891
|
+
assert tokens[token_id] == token
|
2892
|
+
tokens[token_id] = token
|
2893
|
+
scores[token_id] = -1000.0
|
2894
|
+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
2895
|
+
if foken_data.get("special"):
|
2896
|
+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
2897
|
+
|
2898
|
+
tokenizer_file = self.dir_model / 'tokenizer.json'
|
2899
|
+
if tokenizer_file.is_file():
|
2900
|
+
with open(tokenizer_file, "r", encoding="utf-8") as f:
|
2901
|
+
tokenizer_json = json.load(f)
|
2902
|
+
added_tokens = tokenizer_json.get("added_tokens", [])
|
2903
|
+
for foken_data in added_tokens:
|
2904
|
+
token_id = int(foken_data["id"])
|
2905
|
+
token = foken_data["content"].encode("utf-8")
|
2906
|
+
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
2907
|
+
assert tokens[token_id] == token
|
2908
|
+
tokens[token_id] = token
|
2909
|
+
scores[token_id] = -1000.0
|
2910
|
+
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
2911
|
+
if foken_data.get("special"):
|
2912
|
+
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
2913
|
+
|
2914
|
+
>>>>>>> uupstream/master
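The second added block re-reads `tokenizer_config.json` and `tokenizer.json` so that tokens added on top of the SentencePiece model (chat control tokens and the like) get USER_DEFINED or CONTROL types instead of remaining UNKNOWN placeholders. The `added_tokens_decoder` entries it consumes look roughly like the trimmed, hypothetical excerpt below, which is not copied from any real model:

```python
import json

# hypothetical excerpt of a tokenizer_config.json "added_tokens_decoder" section
config = json.loads("""
{
  "added_tokens_decoder": {
    "32000": {"content": "<|endoftext|>", "special": true},
    "32010": {"content": "<|user|>", "special": true}
  }
}
""")

for token_id, entry in config["added_tokens_decoder"].items():
    token_id = int(token_id)
    # special entries become CONTROL tokens, the rest USER_DEFINED,
    # mirroring the toktypes[...] updates in the hunk above
    kind = "CONTROL" if entry.get("special") else "USER_DEFINED"
    print(token_id, entry["content"], kind)
```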
|
2315
2915
|
self.gguf_writer.add_tokenizer_model("llama")
|
2316
2916
|
self.gguf_writer.add_tokenizer_pre("default")
|
2317
2917
|
self.gguf_writer.add_token_list(tokens)
|
@@ -2324,6 +2924,7 @@ class Phi3MiniModel(Model):
|
|
2324
2924
|
def set_gguf_parameters(self):
|
2325
2925
|
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
|
2326
2926
|
|
2927
|
+
<<<<<<< HEAD
|
2327
2928
|
rot_pct = 1.0
|
2328
2929
|
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
2329
2930
|
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
@@ -2341,6 +2942,61 @@ class Phi3MiniModel(Model):
|
|
2341
2942
|
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
2342
2943
|
self.gguf_writer.add_file_type(self.ftype)
|
2343
2944
|
|
2945
|
+
=======
|
2946
|
+
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
2947
|
+
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
2948
|
+
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
|
2949
|
+
rms_eps = self.find_hparam(["rms_norm_eps"])
|
2950
|
+
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
2951
|
+
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
2952
|
+
rope_dims = n_embd // n_head
|
2953
|
+
|
2954
|
+
self.gguf_writer.add_name("Phi3")
|
2955
|
+
self.gguf_writer.add_context_length(max_pos_embds)
|
2956
|
+
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
|
2957
|
+
self.gguf_writer.add_embedding_length(n_embd)
|
2958
|
+
self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
|
2959
|
+
self.gguf_writer.add_block_count(block_count)
|
2960
|
+
self.gguf_writer.add_head_count(n_head)
|
2961
|
+
self.gguf_writer.add_head_count_kv(n_head_kv)
|
2962
|
+
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
2963
|
+
self.gguf_writer.add_rope_dimension_count(rope_dims)
|
2964
|
+
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
|
2965
|
+
self.gguf_writer.add_file_type(self.ftype)
|
2966
|
+
|
2967
|
+
# write rope scaling for long context (128k) model
|
2968
|
+
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
2969
|
+
if (rope_scaling is None):
|
2970
|
+
return
|
2971
|
+
|
2972
|
+
scale = max_pos_embds / orig_max_pos_embds
|
2973
|
+
|
2974
|
+
rope_scaling_type = rope_scaling.get('type', '').lower()
|
2975
|
+
if len(rope_scaling_type) == 0:
|
2976
|
+
raise KeyError('Missing the required key rope_scaling.type')
|
2977
|
+
|
2978
|
+
if rope_scaling_type == 'su':
|
2979
|
+
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
|
2980
|
+
elif rope_scaling_type == 'yarn':
|
2981
|
+
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
|
2982
|
+
else:
|
2983
|
+
raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
|
2984
|
+
|
2985
|
+
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
|
2986
|
+
|
2987
|
+
long_factors = rope_scaling.get('long_factor', None)
|
2988
|
+
short_factors = rope_scaling.get('short_factor', None)
|
2989
|
+
|
2990
|
+
if long_factors is None or short_factors is None:
|
2991
|
+
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
2992
|
+
|
2993
|
+
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
2994
|
+
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
2995
|
+
|
2996
|
+
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
|
2997
|
+
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
|
2998
|
+
|
2999
|
+
>>>>>>> uupstream/master
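The added Phi-3 branch above writes long-context RoPE scaling metadata: the context ratio `scale = max_pos_embds / orig_max_pos_embds` is turned into an attention factor using either the 'su' formula `sqrt(1 + ln(scale) / ln(orig_max_pos_embds))` or the YaRN formula `0.1 * ln(scale) + 1.0`, and the long/short rope factor arrays are emitted as extra tensors. A standalone recap of just the factor arithmetic, with made-up sizes:

```python
import math

orig_max_pos_embds = 4096      # original training context (illustrative)
max_pos_embds = 131072         # extended context (illustrative)
scale = max_pos_embds / orig_max_pos_embds

# 'su' variant, as written in the diff
attn_factor_su = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0

# 'yarn' variant
attn_factor_yarn = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0

print(round(attn_factor_su, 4), round(attn_factor_yarn, 4))
```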
|
2344
3000
|
|
2345
3001
|
@Model.register("PlamoForCausalLM")
|
2346
3002
|
class PlamoModel(Model):
|
@@ -2361,6 +3017,7 @@ class PlamoModel(Model):
|
|
2361
3017
|
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
2362
3018
|
self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
|
2363
3019
|
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
3020
|
+
self.gguf_writer.add_file_type(self.ftype)
|
2364
3021
|
|
2365
3022
|
def shuffle_attn_q_weight(self, data_torch):
|
2366
3023
|
assert data_torch.size() == (5120, 5120)
|
@@ -2376,14 +3033,12 @@ class PlamoModel(Model):
|
|
2376
3033
|
data_torch = torch.reshape(data_torch, (5120, 5120))
|
2377
3034
|
return data_torch
|
2378
3035
|
|
2379
|
-
def
|
2380
|
-
|
2381
|
-
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
3036
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3037
|
+
del bid # unused
|
2382
3038
|
|
2383
|
-
|
2384
|
-
if "self_attn.rotary_emb.inv_freq" in name:
|
2385
|
-
continue
|
3039
|
+
new_name = self.map_tensor_name(name)
|
2386
3040
|
|
3041
|
+
<<<<<<< HEAD
|
2387
3042
|
# map tensor names
|
2388
3043
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
2389
3044
|
if new_name is None:
|
@@ -2421,6 +3076,15 @@ class PlamoModel(Model):
|
|
2421
3076
|
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
2422
3077
|
|
2423
3078
|
self.gguf_writer.add_tensor(new_name, data)
|
3079
|
+
=======
|
3080
|
+
# shuffle for broadcasting of gqa in ggml_mul_mat
|
3081
|
+
if new_name.endswith("attn_q.weight"):
|
3082
|
+
data_torch = self.shuffle_attn_q_weight(data_torch)
|
3083
|
+
elif new_name.endswith("attn_output.weight"):
|
3084
|
+
data_torch = self.shuffle_attn_output_weight(data_torch)
|
3085
|
+
|
3086
|
+
return [(new_name, data_torch)]
|
3087
|
+
>>>>>>> uupstream/master
|
2424
3088
|
|
2425
3089
|
|
2426
3090
|
@Model.register("CodeShellForCausalLM")
|
@@ -2443,24 +3107,17 @@ class CodeShellModel(Model):
|
|
2443
3107
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
2444
3108
|
self.gguf_writer.add_rope_scaling_factor(1.0)
|
2445
3109
|
|
2446
|
-
def
|
2447
|
-
|
2448
|
-
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
2449
|
-
tensors = dict(self.get_tensors())
|
2450
|
-
has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
|
2451
|
-
for name, data_torch in tensors.items():
|
2452
|
-
# we don't need these
|
2453
|
-
if name.endswith((".attn.rotary_emb.inv_freq")):
|
2454
|
-
continue
|
3110
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3111
|
+
del bid # unused
|
2455
3112
|
|
2456
|
-
|
3113
|
+
new_name = self.map_tensor_name(name)
|
2457
3114
|
|
2458
|
-
|
2459
|
-
if data_torch.dtype not in (torch.float16, torch.float32):
|
2460
|
-
data_torch = data_torch.to(torch.float32)
|
3115
|
+
tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
|
2461
3116
|
|
2462
|
-
|
3117
|
+
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
3118
|
+
assert self.tensor_names is not None
|
2463
3119
|
|
3120
|
+
<<<<<<< HEAD
|
2464
3121
|
# map tensor names
|
2465
3122
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
2466
3123
|
if new_name is None:
|
@@ -2488,6 +3145,13 @@ class CodeShellModel(Model):
|
|
2488
3145
|
if not has_lm_head and name == "transformer.wte.weight":
|
2489
3146
|
self.gguf_writer.add_tensor("output.weight", data)
|
2490
3147
|
logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
3148
|
+
=======
|
3149
|
+
if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
|
3150
|
+
# copy tok_embd.weight to output.weight
|
3151
|
+
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
|
3152
|
+
|
3153
|
+
return tensors
|
3154
|
+
>>>>>>> uupstream/master
|
2491
3155
|
|
2492
3156
|
|
2493
3157
|
@Model.register("InternLM2ForCausalLM")
|
@@ -2516,27 +3180,34 @@ class InternLM2Model(Model):
|
|
2516
3180
|
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
2517
3181
|
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
2518
3182
|
|
2519
|
-
tokenizer = SentencePieceProcessor(
|
3183
|
+
tokenizer = SentencePieceProcessor()
|
3184
|
+
tokenizer.LoadFromFile(str(tokenizer_path))
|
3185
|
+
|
2520
3186
|
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
2521
3187
|
|
2522
3188
|
for token_id in range(vocab_size):
|
2523
|
-
piece = tokenizer.
|
3189
|
+
piece = tokenizer.IdToPiece(token_id)
|
2524
3190
|
text = piece.encode("utf-8")
|
2525
|
-
score = tokenizer.
|
3191
|
+
score = tokenizer.GetScore(token_id)
|
2526
3192
|
if text == b"\x00":
|
2527
3193
|
# (TODO): fixme
|
2528
3194
|
# Hack here and replace the \x00 characters.
|
3195
|
+
<<<<<<< HEAD
|
2529
3196
|
logger.debug(f"InternLM2 convert token '{text}' to '🐉'!")
|
2530
3197
|
text = "🐉"
|
3198
|
+
=======
|
3199
|
+
logger.warning(f"InternLM2 convert token '{text}' to '🐉'!")
|
3200
|
+
text = "🐉".encode("utf-8")
|
3201
|
+
>>>>>>> uupstream/master
|
2531
3202
|
|
2532
3203
|
toktype = SentencePieceTokenTypes.NORMAL
|
2533
|
-
if tokenizer.
|
3204
|
+
if tokenizer.IsUnknown(token_id):
|
2534
3205
|
toktype = SentencePieceTokenTypes.UNKNOWN
|
2535
|
-
elif tokenizer.
|
3206
|
+
elif tokenizer.IsControl(token_id):
|
2536
3207
|
toktype = SentencePieceTokenTypes.CONTROL
|
2537
|
-
elif tokenizer.
|
3208
|
+
elif tokenizer.IsUnused(token_id):
|
2538
3209
|
toktype = SentencePieceTokenTypes.UNUSED
|
2539
|
-
elif tokenizer.
|
3210
|
+
elif tokenizer.IsByte(token_id):
|
2540
3211
|
toktype = SentencePieceTokenTypes.BYTE
|
2541
3212
|
|
2542
3213
|
tokens.append(text)
|
@@ -2573,13 +3244,15 @@ in chat mode so that the conversation can end normally.")
|
|
2573
3244
|
special_vocab.add_to_gguf(self.gguf_writer)
|
2574
3245
|
|
2575
3246
|
def _try_get_sft_eos(self, tokenizer):
|
2576
|
-
unused_145_list = tokenizer.
|
2577
|
-
im_end_list = tokenizer.
|
3247
|
+
unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
|
3248
|
+
im_end_list = tokenizer.Encode('<|im_end|>')
|
3249
|
+
eos_token = None
|
2578
3250
|
assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
|
2579
3251
|
if len(unused_145_list) == 1:
|
2580
3252
|
eos_token = unused_145_list[0]
|
2581
3253
|
if len(im_end_list) == 1:
|
2582
3254
|
eos_token = im_end_list[0]
|
3255
|
+
assert eos_token
|
2583
3256
|
return eos_token
|
2584
3257
|
|
2585
3258
|
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
|
@@ -2599,7 +3272,9 @@ in chat mode so that the conversation can end normally.")
|
|
2599
3272
|
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
2600
3273
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
2601
3274
|
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
3275
|
+
self.gguf_writer.add_file_type(self.ftype)
|
2602
3276
|
|
3277
|
+
<<<<<<< HEAD
|
2603
3278
|
def post_write_tensors(self, tensor_map, name, data_torch):
|
2604
3279
|
old_dtype = data_torch.dtype
|
2605
3280
|
|
@@ -2638,33 +3313,38 @@ in chat mode so that the conversation can end normally.")
|
|
2638
3313
|
num_heads = self.hparams.get("num_attention_heads")
|
2639
3314
|
num_kv_heads = self.hparams.get("num_key_value_heads")
|
2640
3315
|
hidden_size = self.hparams.get("hidden_size")
|
3316
|
+
=======
|
3317
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3318
|
+
num_heads = self.hparams["num_attention_heads"]
|
3319
|
+
num_kv_heads = self.hparams["num_key_value_heads"]
|
3320
|
+
hidden_size = self.hparams["hidden_size"]
|
3321
|
+
>>>>>>> uupstream/master
|
2641
3322
|
q_per_kv = num_heads // num_kv_heads
|
2642
3323
|
head_dim = hidden_size // num_heads
|
2643
3324
|
num_groups = num_heads // q_per_kv
|
2644
3325
|
|
2645
|
-
block_count = self.hparams["num_hidden_layers"]
|
2646
|
-
model_kv = dict(self.get_tensors())
|
2647
|
-
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
2648
3326
|
qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
|
2649
|
-
for name, data_torch in model_kv.items():
|
2650
|
-
# we don't need these
|
2651
|
-
if name.endswith(".rotary_emb.inv_freq"):
|
2652
|
-
continue
|
2653
3327
|
|
2654
|
-
|
2655
|
-
|
2656
|
-
|
2657
|
-
|
2658
|
-
|
2659
|
-
|
2660
|
-
|
2661
|
-
|
2662
|
-
|
2663
|
-
|
2664
|
-
|
2665
|
-
|
2666
|
-
|
2667
|
-
|
+ if re.match(qkv_pattern, name):
+ bid = re.findall(qkv_pattern, name)[0]
+ qkv = data_torch
+ # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
+ qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
+ q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
+ # The model weights of q and k equire additional reshape.
+ # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
+ q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
+ # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
+ k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
+ # v = rearrange(v, " o g n i -> o (g n i)").T
+ v = v.reshape((v.shape[0], -1)).T
+ return [
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v),
+ ]
+ else:
+ return [(self.map_tensor_name(name), data_torch)]
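InternLM2 stores all attention projections in one packed `wqkv` tensor; the hunk above reshapes it into (groups, q_per_kv + 2, head_dim) so the query heads, the shared key head and the shared value head of each GQA group can be sliced apart. A toy-sized sketch of that slicing; the dimensions are invented:

```python
import torch

num_heads, num_kv_heads, head_dim = 8, 2, 4
q_per_kv = num_heads // num_kv_heads          # query heads sharing one kv head
num_groups = num_heads // q_per_kv            # equals num_kv_heads
hidden = num_heads * head_dim

wqkv = torch.randn((q_per_kv + 2) * num_groups * head_dim, hidden)

qkv = wqkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
q = qkv[..., :q_per_kv, :]                    # per-group query heads
k = qkv[..., q_per_kv:q_per_kv + 1, :]        # shared key head
v = qkv[..., q_per_kv + 1:q_per_kv + 2, :]    # shared value head

assert q.shape[-2:] == (q_per_kv, head_dim)
```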
|
2668
3348
|
|
2669
3349
|
|
2670
3350
|
@Model.register("BertModel", "CamembertModel")
|
@@ -2729,14 +3409,10 @@ class BertModel(Model):
|
|
2729
3409
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
2730
3410
|
special_vocab.add_to_gguf(self.gguf_writer)
|
2731
3411
|
|
2732
|
-
def
|
2733
|
-
|
2734
|
-
tensors = dict(self.get_tensors())
|
2735
|
-
for name, data_torch in tensors.items():
|
2736
|
-
# we are only using BERT for embeddings so we don't need the pooling layer
|
2737
|
-
if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
|
2738
|
-
continue # we don't need these
|
3412
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3413
|
+
del bid # unused
|
2739
3414
|
|
3415
|
+
<<<<<<< HEAD
|
2740
3416
|
# map tensor names
|
2741
3417
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
2742
3418
|
if new_name is None:
|
@@ -2766,6 +3442,13 @@ class BertModel(Model):
|
|
2766
3442
|
data = data.astype(new_dtype)
|
2767
3443
|
|
2768
3444
|
self.gguf_writer.add_tensor(new_name, data)
|
3445
|
+
=======
|
3446
|
+
# we are only using BERT for embeddings so we don't need the pooling layer
|
3447
|
+
if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
|
3448
|
+
return [] # we don't need these
|
3449
|
+
|
3450
|
+
return [(self.map_tensor_name(name), data_torch)]
|
3451
|
+
>>>>>>> uupstream/master
|
2769
3452
|
|
2770
3453
|
|
2771
3454
|
@Model.register("NomicBertModel")
|
@@ -2831,10 +3514,10 @@ class GemmaModel(Model):
|
|
2831
3514
|
self.gguf_writer.add_value_length(hparams["head_dim"])
|
2832
3515
|
self.gguf_writer.add_file_type(self.ftype)
|
2833
3516
|
|
2834
|
-
def
|
2835
|
-
|
2836
|
-
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
3517
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3518
|
+
del bid # unused
|
2837
3519
|
|
3520
|
+
<<<<<<< HEAD
|
2838
3521
|
for name, data_torch in self.get_tensors():
|
2839
3522
|
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
2840
3523
|
# To prevent errors, skip loading lm_head.weight.
|
@@ -2843,11 +3526,19 @@ class GemmaModel(Model):
|
|
2843
3526
|
continue
|
2844
3527
|
|
2845
3528
|
old_dtype = data_torch.dtype
|
2846
|
-
|
2847
|
-
|
2848
|
-
|
2849
|
-
|
2850
|
-
|
3529
|
+
=======
|
3530
|
+
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
3531
|
+
# To prevent errors, skip loading lm_head.weight.
|
3532
|
+
if name == "lm_head.weight":
|
3533
|
+
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
3534
|
+
return []
|
3535
|
+
>>>>>>> uupstream/master
|
3536
|
+
|
3537
|
+
# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
|
3538
|
+
if name.endswith("norm.weight"):
|
3539
|
+
data_torch = data_torch + 1
|
3540
|
+
|
3541
|
+
<<<<<<< HEAD
|
2851
3542
|
# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
|
2852
3543
|
if name.endswith("norm.weight"):
|
2853
3544
|
data_torch = data_torch + 1
|
@@ -2870,6 +3561,9 @@ class GemmaModel(Model):
|
|
2870
3561
|
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
2871
3562
|
|
2872
3563
|
self.gguf_writer.add_tensor(new_name, data)
|
3564
|
+
=======
|
3565
|
+
return [(self.map_tensor_name(name), data_torch)]
|
3566
|
+
>>>>>>> uupstream/master
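The Gemma branch keeps two quirks from the HF implementation: `lm_head.weight` is skipped because llama.cpp ties the output to the embedding, and every `*norm.weight` is stored shifted by minus one in the checkpoint (the model applies `x * (1 + w)` at runtime), so the converter adds 1 back before writing. A one-liner illustrates the adjustment; the values are illustrative:

```python
import torch

# Gemma checkpoints store RMSNorm weights as (w - 1); conversion restores the
# plain multiplier that the runtime kernel expects, as in `data_torch + 1` above
stored_norm_weight = torch.zeros(16)     # illustrative values
exported = stored_norm_weight + 1        # what ends up in the GGUF file
assert torch.all(exported == 1)
```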
|
2873
3567
|
|
2874
3568
|
|
2875
3569
|
@Model.register("Starcoder2ForCausalLM")
|
@@ -2892,6 +3586,8 @@ class MambaModel(Model):
|
|
2892
3586
|
|
2893
3587
|
if (self.dir_model / "tokenizer.json").is_file():
|
2894
3588
|
self._set_vocab_gpt2()
|
3589
|
+
elif (self.dir_model / "tokenizer.model").is_file():
|
3590
|
+
self._set_vocab_sentencepiece()
|
2895
3591
|
else:
|
2896
3592
|
# Use the GPT-NeoX tokenizer when no tokenizer files are present
|
2897
3593
|
tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
|
@@ -2899,28 +3595,48 @@ class MambaModel(Model):
|
|
2899
3595
|
neox_reader = gguf.GGUFReader(tokenizer_path, "r")
|
2900
3596
|
|
2901
3597
|
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
|
3598
|
+
<<<<<<< HEAD
|
2902
3599
|
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
|
2903
3600
|
|
2904
3601
|
field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
|
2905
3602
|
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
|
3603
|
+
=======
|
3604
|
+
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
|
3605
|
+
|
3606
|
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
|
3607
|
+
self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
|
3608
|
+
>>>>>>> uupstream/master
|
2906
3609
|
|
2907
3610
|
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
|
3611
|
+
assert field
|
2908
3612
|
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
2909
3613
|
|
2910
3614
|
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
|
3615
|
+
assert field
|
2911
3616
|
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
2912
3617
|
|
2913
3618
|
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
3619
|
+
assert field
|
2914
3620
|
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
2915
3621
|
|
2916
3622
|
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
|
3623
|
+
<<<<<<< HEAD
|
2917
3624
|
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
|
2918
3625
|
|
2919
3626
|
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
|
2920
3627
|
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
|
3628
|
+
=======
|
3629
|
+
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
|
3630
|
+
|
3631
|
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
|
3632
|
+
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
|
3633
|
+
>>>>>>> uupstream/master
|
2921
3634
|
|
2922
3635
|
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
|
2923
|
-
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
|
3636
|
+
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
|
3637
|
+
|
3638
|
+
field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
|
3639
|
+
self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
|
2924
3640
|
|
2925
3641
|
def set_gguf_parameters(self):
|
2926
3642
|
d_model = self.find_hparam(["hidden_size", "d_model"])
|
@@ -2949,21 +3665,17 @@ class MambaModel(Model):
|
|
2949
3665
|
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
2950
3666
|
self.gguf_writer.add_file_type(self.ftype)
|
2951
3667
|
|
2952
|
-
|
2953
|
-
block_count = self.hparams["n_layer"]
|
2954
|
-
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
3668
|
+
_tok_embd = None
|
2955
3669
|
|
2956
|
-
|
2957
|
-
|
2958
|
-
output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
|
3670
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3671
|
+
del bid # unused
|
2959
3672
|
|
2960
|
-
|
2961
|
-
|
3673
|
+
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
|
3674
|
+
tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
|
2962
3675
|
|
2963
|
-
|
2964
|
-
if data_torch.dtype not in (torch.float16, torch.float32):
|
2965
|
-
data_torch = data_torch.to(torch.float32)
|
3676
|
+
new_name = self.map_tensor_name(name)
|
2966
3677
|
|
3678
|
+
<<<<<<< HEAD
|
2967
3679
|
# map tensor names
|
2968
3680
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
2969
3681
|
if new_name is None:
|
@@ -2980,9 +3692,26 @@ class MambaModel(Model):
|
|
2980
3692
|
continue
|
2981
3693
|
if new_name == tok_embd_name:
|
2982
3694
|
tok_embd = data_torch
|
2983
|
-
|
2984
|
-
|
2985
|
-
|
3695
|
+
=======
|
3696
|
+
if name.endswith(".A_log"):
|
3697
|
+
logger.debug("A_log --> A ==> " + new_name)
|
3698
|
+
data_torch = -torch.exp(data_torch)
|
3699
|
+
|
3700
|
+
# assuming token_embd.weight is seen before output.weight
|
3701
|
+
if self._tok_embd is not None and new_name == output_name:
|
3702
|
+
if torch.equal(self._tok_embd, data_torch):
|
3703
|
+
logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
|
3704
|
+
return []
|
3705
|
+
elif new_name == tok_embd_name:
|
3706
|
+
self._tok_embd = data_torch
|
3707
|
+
|
3708
|
+
return [(new_name, data_torch)]
|
3709
|
+
>>>>>>> uupstream/master
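Two Mamba-specific rewrites happen in the block that ends here: the state-transition matrix is stored in the checkpoint as `A_log` and converted back with `A = -exp(A_log)`, and a duplicated `output.weight` that is bit-identical to `token_embd.weight` is dropped. The `A_log` conversion in isolation, with toy values:

```python
import torch

# checkpoints keep A in log space; the runtime kernel expects the negative
# exponential, which is exactly the `-torch.exp(data_torch)` in the diff
a_log = torch.tensor([0.0, 1.0, 2.0])
a = -torch.exp(a_log)
assert torch.allclose(a, torch.tensor([-1.0, -2.7183, -7.3891]), atol=1e-4)
```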
|
3710
|
+
|
3711
|
+
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
|
3712
|
+
del n_dims # unused
|
3713
|
+
|
3714
|
+
<<<<<<< HEAD
|
2986
3715
|
n_dims = len(data.shape)
|
2987
3716
|
data_dtype = data.dtype
|
2988
3717
|
|
@@ -3002,6 +3731,17 @@ class MambaModel(Model):
|
|
3002
3731
|
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
3003
3732
|
|
3004
3733
|
self.gguf_writer.add_tensor(new_name, data)
|
3734
|
+
=======
|
3735
|
+
return bid is not None and new_name in (
|
3736
|
+
self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
|
3737
|
+
gguf.MODEL_TENSOR.SSM_CONV1D,
|
3738
|
+
gguf.MODEL_TENSOR.SSM_X,
|
3739
|
+
gguf.MODEL_TENSOR.SSM_DT,
|
3740
|
+
gguf.MODEL_TENSOR.SSM_A,
|
3741
|
+
gguf.MODEL_TENSOR.SSM_D,
|
3742
|
+
]
|
3743
|
+
)
|
3744
|
+
>>>>>>> uupstream/master
|
3005
3745
|
|
3006
3746
|
|
3007
3747
|
@Model.register("CohereForCausalLM")
|
@@ -3013,7 +3753,8 @@ class CommandR2Model(Model):
|
|
3013
3753
|
|
3014
3754
|
# max_position_embeddings = 8192 in config.json but model was actually
|
3015
3755
|
# trained on 128k context length
|
3016
|
-
|
3756
|
+
# aya-23 models don't have model_max_length specified
|
3757
|
+
self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
|
3017
3758
|
|
3018
3759
|
def set_gguf_parameters(self):
|
3019
3760
|
super().set_gguf_parameters()
|
@@ -3035,6 +3776,7 @@ class OlmoModel(Model):
|
|
3035
3776
|
|
3036
3777
|
# Same as super class, but permuting q_proj, k_proj
|
3037
3778
|
# Copied from: LlamaModel
|
3779
|
+
<<<<<<< HEAD
|
3038
3780
|
def write_tensors(self):
|
3039
3781
|
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
3040
3782
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
@@ -3079,11 +3821,252 @@ class OlmoModel(Model):
|
|
3079
3821
|
logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
3080
3822
|
|
3081
3823
|
self.gguf_writer.add_tensor(new_name, data)
|
3824
|
+
=======
|
3825
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3826
|
+
del bid # unused
|
3827
|
+
|
3828
|
+
n_head = self.hparams["num_attention_heads"]
|
3829
|
+
n_kv_head = self.hparams.get("num_key_value_heads")
|
3830
|
+
|
3831
|
+
if name.endswith("q_proj.weight"):
|
3832
|
+
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
3833
|
+
if name.endswith("k_proj.weight"):
|
3834
|
+
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
3835
|
+
|
3836
|
+
return [(self.map_tensor_name(name), data_torch)]
|
3837
|
+
|
3838
|
+
|
3839
|
+
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
|
3840
|
+
class JinaBertV2Model(BertModel):
|
3841
|
+
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
3842
|
+
|
3843
|
+
def __init__(self, *args, **kwargs):
|
3844
|
+
super().__init__(*args, **kwargs)
|
3845
|
+
self.intermediate_size = self.hparams["intermediate_size"]
|
3846
|
+
|
3847
|
+
def get_tensors(self):
|
3848
|
+
for name, data in super().get_tensors():
|
3849
|
+
if 'gated_layers' in name:
|
3850
|
+
d1 = data[:self.intermediate_size, :]
|
3851
|
+
name1 = name.replace('gated_layers', 'gated_layers_w')
|
3852
|
+
d2 = data[self.intermediate_size:, :]
|
3853
|
+
name2 = name.replace('gated_layers', 'gated_layers_v')
|
3854
|
+
yield name1, d1
|
3855
|
+
yield name2, d2
|
3856
|
+
continue
|
3857
|
+
|
3858
|
+
yield name, data
|
3859
|
+
|
3860
|
+
def set_vocab(self, *args, **kwargs):
|
3861
|
+
tokenizer_class = 'BertTokenizer'
|
3862
|
+
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
3863
|
+
tokenizer_class = json.load(f)['tokenizer_class']
|
3864
|
+
|
3865
|
+
if tokenizer_class == 'BertTokenizer':
|
3866
|
+
super().set_vocab()
|
3867
|
+
elif tokenizer_class == 'RobertaTokenizer':
|
3868
|
+
self._set_vocab_gpt2()
|
3869
|
+
self.gguf_writer.add_token_type_count(2)
|
3870
|
+
else:
|
3871
|
+
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
|
3872
|
+
self.gguf_writer.add_add_bos_token(True)
|
3873
|
+
self.gguf_writer.add_add_eos_token(True)
|
3874
|
+
|
3875
|
+
|
3876
|
+
@Model.register("ArcticForCausalLM")
|
3877
|
+
class ArcticModel(Model):
|
3878
|
+
model_arch = gguf.MODEL_ARCH.ARCTIC
|
3879
|
+
|
3880
|
+
def set_vocab(self):
|
3881
|
+
# The reason for using a custom implementation here is that the
|
3882
|
+
# snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
|
3883
|
+
# tokenizer.model and used them as BOS and EOS instead of adding new tokens.
|
3884
|
+
from sentencepiece import SentencePieceProcessor
|
3885
|
+
|
3886
|
+
tokenizer_path = self.dir_model / 'tokenizer.model'
|
3887
|
+
|
3888
|
+
if not tokenizer_path.is_file():
|
3889
|
+
logger.error(f'Error: Missing {tokenizer_path}')
|
3890
|
+
sys.exit(1)
|
3891
|
+
|
3892
|
+
# Read the whole vocabulary from the tokenizer.model file
|
3893
|
+
tokenizer = SentencePieceProcessor()
|
3894
|
+
tokenizer.LoadFromFile(str(tokenizer_path))
|
3895
|
+
|
3896
|
+
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
3897
|
+
|
3898
|
+
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
3899
|
+
scores: list[float] = [-10000.0] * vocab_size
|
3900
|
+
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
3901
|
+
|
3902
|
+
for token_id in range(tokenizer.vocab_size()):
|
3903
|
+
|
3904
|
+
piece = tokenizer.IdToPiece(token_id)
|
3905
|
+
text = piece.encode("utf-8")
|
3906
|
+
score = tokenizer.GetScore(token_id)
|
3907
|
+
|
3908
|
+
toktype = SentencePieceTokenTypes.NORMAL
|
3909
|
+
if tokenizer.IsUnknown(token_id):
|
3910
|
+
toktype = SentencePieceTokenTypes.UNKNOWN
|
3911
|
+
elif tokenizer.IsControl(token_id):
|
3912
|
+
toktype = SentencePieceTokenTypes.CONTROL
|
3913
|
+
elif tokenizer.IsUnused(token_id):
|
3914
|
+
toktype = SentencePieceTokenTypes.UNUSED
|
3915
|
+
elif tokenizer.IsByte(token_id):
|
3916
|
+
toktype = SentencePieceTokenTypes.BYTE
|
3917
|
+
|
3918
|
+
tokens[token_id] = text
|
3919
|
+
scores[token_id] = score
|
3920
|
+
toktypes[token_id] = toktype
|
3921
|
+
|
3922
|
+
# Use the added_tokens_decoder field from tokeniser_config.json as the source
|
3923
|
+
# of information about added/redefined tokens and modify them accordingly.
|
3924
|
+
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
3925
|
+
if tokenizer_config_file.is_file():
|
3926
|
+
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
3927
|
+
tokenizer_config_json = json.load(f)
|
3928
|
+
|
3929
|
+
if "added_tokens_decoder" in tokenizer_config_json:
|
3930
|
+
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
|
3931
|
+
for token_id, token_json in added_tokens_decoder.items():
|
3932
|
+
token_id = int(token_id)
|
3933
|
+
if (token_id >= vocab_size):
|
3934
|
+
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
3935
|
+
continue
|
3936
|
+
|
3937
|
+
token_content = token_json["content"]
|
3938
|
+
token_type = SentencePieceTokenTypes.USER_DEFINED
|
3939
|
+
token_score = -10000.0
|
3940
|
+
|
3941
|
+
# Map unk_token to UNKNOWN, other special tokens to CONTROL
|
3942
|
+
# Set the score to 0.0 as in the original tokenizer.model
|
3943
|
+
if ("special" in token_json) and token_json["special"]:
|
3944
|
+
if token_content == tokenizer_config_json["unk_token"]:
|
3945
|
+
token_type = SentencePieceTokenTypes.UNKNOWN
|
3946
|
+
else:
|
3947
|
+
token_type = SentencePieceTokenTypes.CONTROL
|
3948
|
+
token_score = 0.0
|
3949
|
+
|
3950
|
+
logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
|
3951
|
+
tokens[token_id] = token_content.encode("utf-8")
|
3952
|
+
toktypes[token_id] = token_type
|
3953
|
+
scores[token_id] = token_score
|
3954
|
+
|
3955
|
+
self.gguf_writer.add_tokenizer_model("llama")
|
3956
|
+
self.gguf_writer.add_tokenizer_pre("default")
|
3957
|
+
self.gguf_writer.add_token_list(tokens)
|
3958
|
+
self.gguf_writer.add_token_scores(scores)
|
3959
|
+
self.gguf_writer.add_token_types(toktypes)
|
3960
|
+
|
3961
|
+
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
3962
|
+
special_vocab.add_to_gguf(self.gguf_writer)
|
3963
|
+
|
3964
|
+
def set_gguf_parameters(self):
|
3965
|
+
super().set_gguf_parameters()
|
3966
|
+
hparams = self.hparams
|
3967
|
+
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
3968
|
+
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
3969
|
+
|
3970
|
+
_experts: list[dict[str, Tensor]] | None = None
|
3971
|
+
|
3972
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
3973
|
+
n_head = self.hparams["num_attention_heads"]
|
3974
|
+
n_kv_head = self.hparams.get("num_key_value_heads")
|
3975
|
+
|
3976
|
+
if name.endswith("q_proj.weight"):
|
3977
|
+
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
3978
|
+
if name.endswith("k_proj.weight"):
|
3979
|
+
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
3980
|
+
|
3981
|
+
# process the experts separately
|
3982
|
+
if name.find("block_sparse_moe.experts") != -1:
|
3983
|
+
n_experts = self.hparams["num_local_experts"]
|
3984
|
+
|
3985
|
+
assert bid is not None
|
3986
|
+
|
3987
|
+
if self._experts is None:
|
3988
|
+
self._experts = [{} for _ in range(self.block_count)]
|
3989
|
+
|
3990
|
+
self._experts[bid][name] = data_torch
|
3991
|
+
|
3992
|
+
if len(self._experts[bid]) >= n_experts * 3:
|
3993
|
+
tensors: list[tuple[str, Tensor]] = []
|
3994
|
+
|
3995
|
+
# merge the experts into a single 3d tensor
|
3996
|
+
for wid in ["w1", "w2", "w3"]:
|
3997
|
+
datas: list[Tensor] = []
|
3998
|
+
|
3999
|
+
for xid in range(n_experts):
|
4000
|
+
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
|
4001
|
+
datas.append(self._experts[bid][ename])
|
4002
|
+
del self._experts[bid][ename]
|
4003
|
+
|
4004
|
+
data_torch = torch.stack(datas, dim=0)
|
4005
|
+
|
4006
|
+
merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
|
4007
|
+
|
4008
|
+
new_name = self.map_tensor_name(merged_name)
|
4009
|
+
|
4010
|
+
tensors.append((new_name, data_torch))
|
4011
|
+
return tensors
|
4012
|
+
else:
|
4013
|
+
return []
|
4014
|
+
|
4015
|
+
return [(self.map_tensor_name(name), data_torch)]
|
4016
|
+
|
4017
|
+
def write_tensors(self):
|
4018
|
+
super().write_tensors()
|
4019
|
+
|
4020
|
+
if self._experts is not None:
|
4021
|
+
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
4022
|
+
experts = [k for d in self._experts for k in d.keys()]
|
4023
|
+
if len(experts) > 0:
|
4024
|
+
raise ValueError(f"Unprocessed experts: {experts}")
|
4025
|
+
>>>>>>> uupstream/master
|
3082
4026
|
|
3083
4027
|
|
3084
4028
|
###### CONVERSION LOGIC ######
|
3085
4029
|
|
3086
4030
|
|
4031
|
+
+ # tree of lazy tensors
+ class LazyTorchTensor(gguf.LazyBase):
+ _tensor_type = torch.Tensor
+ # to keep the type-checker happy
+ dtype: torch.dtype
+ shape: torch.Size
+
+ # only used when converting a torch.Tensor to a np.ndarray
+ _dtype_map: dict[torch.dtype, type] = {
+ torch.float16: np.float16,
+ torch.float32: np.float32,
+ }
+
+ def numpy(self) -> gguf.LazyNumpyTensor:
+ dtype = self._dtype_map[self.dtype]
+ return gguf.LazyNumpyTensor(
+ meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
+ lazy=self._lazy,
+ args=(self,),
+ func=(lambda s: s[0].numpy())
+ )
+
+ @classmethod
+ def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
+ return torch.empty(size=shape, dtype=dtype, device="meta")
+
+ @classmethod
+ def __torch_function__(cls, func, types, args=(), kwargs=None):
+ del types # unused
+
+ if kwargs is None:
+ kwargs = {}
+
+ if func is torch.Tensor.numpy:
+ return args[0].numpy()
+
+ return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
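`LazyTorchTensor` defers every torch operation by running it on shape-only tensors and recording the call, so the converter can plan its outputs without materialising the weights; the `device="meta"` trick is the core of it. A minimal sketch of that idea on its own, independent of the `gguf.LazyBase` machinery that the class builds on (assumed here, not shown in this diff):

```python
import torch

# on the "meta" device tensors carry dtype and shape but no storage, so
# shape/dtype propagation through ordinary ops is essentially free
a = torch.empty((4096, 4096), dtype=torch.float16, device="meta")
b = a.T.reshape(-1, 1024)          # no data is ever allocated or copied

print(b.shape, b.dtype, b.device)  # torch.Size([16384, 1024]) torch.float16 meta
```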
|
4068
|
+
|
4069
|
+
|
3087
4070
|
def parse_args() -> argparse.Namespace:
|
3088
4071
|
parser = argparse.ArgumentParser(
|
3089
4072
|
description="Convert a huggingface model to a GGML compatible file")
|
@@ -3093,23 +4076,46 @@ def parse_args() -> argparse.Namespace:
|
|
3093
4076
|
)
|
3094
4077
|
parser.add_argument(
|
3095
4078
|
"--awq-path", type=Path, default=None,
|
3096
|
-
help="Path to scale awq cache file"
|
4079
|
+
help="Path to scale awq cache file",
|
4080
|
+
)
|
3097
4081
|
parser.add_argument(
|
3098
4082
|
"--outfile", type=Path,
|
3099
|
-
help="path to write to; default: based on input",
|
4083
|
+
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
4084
|
+
)
|
4085
|
+
parser.add_argument(
|
4086
|
+
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
|
4087
|
+
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
3100
4088
|
)
|
3101
4089
|
parser.add_argument(
|
3102
|
-
"--
|
3103
|
-
help="
|
4090
|
+
"--bigendian", action="store_true",
|
4091
|
+
help="model is executed on big endian machine",
|
3104
4092
|
)
|
3105
|
-
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
3106
4093
|
parser.add_argument(
|
3107
4094
|
"model", type=Path,
|
3108
4095
|
help="directory containing model file",
|
3109
4096
|
)
|
4097
|
+
<<<<<<< HEAD
|
3110
4098
|
parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
|
3111
4099
|
parser.add_argument("--model-name", type=str, default=None, help="name of the model")
|
3112
4100
|
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
4101
|
+
=======
|
4102
|
+
parser.add_argument(
|
4103
|
+
"--use-temp-file", action="store_true",
|
4104
|
+
help="use the tempfile library while processing (helpful when running out of memory, process killed)",
|
4105
|
+
)
|
4106
|
+
parser.add_argument(
|
4107
|
+
"--no-lazy", action="store_true",
|
4108
|
+
help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
|
4109
|
+
)
|
4110
|
+
parser.add_argument(
|
4111
|
+
"--model-name", type=str, default=None,
|
4112
|
+
help="name of the model",
|
4113
|
+
)
|
4114
|
+
parser.add_argument(
|
4115
|
+
"--verbose", action="store_true",
|
4116
|
+
help="increase output verbosity",
|
4117
|
+
)
|
4118
|
+
>>>>>>> uupstream/master
|
3113
4119
|
|
3114
4120
|
return parser.parse_args()
|
3115
4121
|
|
@@ -3138,16 +4144,19 @@ def main() -> None:
|
|
3138
4144
|
logger.error(f'Error: {args.model} is not a directory')
|
3139
4145
|
sys.exit(1)
|
3140
4146
|
|
3141
|
-
ftype_map = {
|
3142
|
-
"f32": gguf.
|
3143
|
-
"f16": gguf.
|
4147
|
+
ftype_map: dict[str, gguf.LlamaFileType] = {
|
4148
|
+
"f32": gguf.LlamaFileType.ALL_F32,
|
4149
|
+
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
4150
|
+
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
4151
|
+
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
4152
|
+
"auto": gguf.LlamaFileType.GUESSED,
|
3144
4153
|
}
|
3145
4154
|
|
3146
4155
|
if args.outfile is not None:
|
3147
4156
|
fname_out = args.outfile
|
3148
4157
|
else:
|
3149
4158
|
# output in the same directory as the model by default
|
3150
|
-
fname_out = dir_model /
|
4159
|
+
fname_out = dir_model / 'ggml-model-{ftype}.gguf'
|
3151
4160
|
|
3152
4161
|
logger.info(f"Loading model: {dir_model.name}")
|
3153
4162
|
|
@@ -3155,7 +4164,11 @@ def main() -> None:
|
|
3155
4164
|
|
3156
4165
|
with torch.inference_mode():
|
3157
4166
|
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
4167
|
+
<<<<<<< HEAD
|
3158
4168
|
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
|
4169
|
+
=======
|
4170
|
+
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
|
4171
|
+
>>>>>>> uupstream/master
|
3159
4172
|
|
3160
4173
|
logger.info("Set model parameters")
|
3161
4174
|
model_instance.set_gguf_parameters()
|
@@ -3163,7 +4176,10 @@ def main() -> None:
|
|
3163
4176
|
logger.info("Set model tokenizer")
|
3164
4177
|
model_instance.set_vocab()
|
3165
4178
|
|
4179
|
+
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
|
4180
|
+
|
3166
4181
|
if args.vocab_only:
|
4182
|
+
<<<<<<< HEAD
|
3167
4183
|
logger.info(f"Exporting model vocab to '{fname_out}'")
|
3168
4184
|
model_instance.write_vocab()
|
3169
4185
|
else:
|
@@ -3171,6 +4187,15 @@ def main() -> None:
|
|
3171
4187
|
model_instance.write()
|
3172
4188
|
|
3173
4189
|
logger.info(f"Model successfully exported to '{fname_out}'")
|
4190
|
+
=======
|
4191
|
+
logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
|
4192
|
+
model_instance.write_vocab()
|
4193
|
+
else:
|
4194
|
+
logger.info(f"Exporting model to '{model_instance.fname_out}'")
|
4195
|
+
model_instance.write()
|
4196
|
+
|
4197
|
+
logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
|
4198
|
+
>>>>>>> uupstream/master
|
3174
4199
|
|
3175
4200
|
|
3176
4201
|
if __name__ == '__main__':
|