bigdl-core-cpp 2.5.0b20240507__py3-none-win_amd64.whl → 2.5.0b20240509__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. bigdl/cpp/convert-hf-to-gguf.py +3177 -0
  2. bigdl/cpp/convert.py +36 -24
  3. bigdl/cpp/gguf-py/gguf/constants.py +19 -3
  4. bigdl/cpp/gguf-py/gguf/gguf_reader.py +16 -3
  5. bigdl/cpp/gguf-py/gguf/gguf_writer.py +12 -1
  6. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +2 -0
  7. bigdl/cpp/gguf-py/gguf/vocab.py +13 -29
  8. bigdl/cpp/libs/baby-llama.exe +0 -0
  9. bigdl/cpp/libs/batched-bench.exe +0 -0
  10. bigdl/cpp/libs/batched.exe +0 -0
  11. bigdl/cpp/libs/beam-search.exe +0 -0
  12. bigdl/cpp/libs/benchmark.exe +0 -0
  13. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  14. bigdl/cpp/libs/embedding.exe +0 -0
  15. bigdl/cpp/libs/export-lora.exe +0 -0
  16. bigdl/cpp/libs/finetune.exe +0 -0
  17. bigdl/cpp/libs/ggml_shared.dll +0 -0
  18. bigdl/cpp/libs/gguf.exe +0 -0
  19. bigdl/cpp/libs/gritlm.exe +0 -0
  20. bigdl/cpp/libs/imatrix.exe +0 -0
  21. bigdl/cpp/libs/infill.exe +0 -0
  22. bigdl/cpp/libs/llama-bench.exe +0 -0
  23. bigdl/cpp/libs/llama.dll +0 -0
  24. bigdl/cpp/libs/llava-cli.exe +0 -0
  25. bigdl/cpp/libs/llava_shared.dll +0 -0
  26. bigdl/cpp/libs/lookahead.exe +0 -0
  27. bigdl/cpp/libs/lookup.exe +0 -0
  28. bigdl/cpp/libs/ls-sycl-device.exe +0 -0
  29. bigdl/cpp/libs/main.exe +0 -0
  30. bigdl/cpp/libs/ollama.exe +0 -0
  31. bigdl/cpp/libs/parallel.exe +0 -0
  32. bigdl/cpp/libs/passkey.exe +0 -0
  33. bigdl/cpp/libs/perplexity.exe +0 -0
  34. bigdl/cpp/libs/q8dot.exe +0 -0
  35. bigdl/cpp/libs/quantize-stats.exe +0 -0
  36. bigdl/cpp/libs/quantize.exe +0 -0
  37. bigdl/cpp/libs/save-load-state.exe +0 -0
  38. bigdl/cpp/libs/server.exe +0 -0
  39. bigdl/cpp/libs/simple.exe +0 -0
  40. bigdl/cpp/libs/speculative.exe +0 -0
  41. bigdl/cpp/libs/tokenize.exe +0 -0
  42. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  43. bigdl/cpp/libs/vdot.exe +0 -0
  44. {bigdl_core_cpp-2.5.0b20240507.data → bigdl_core_cpp-2.5.0b20240509.data}/scripts/init-llama-cpp.bat +1 -0
  45. {bigdl_core_cpp-2.5.0b20240507.dist-info → bigdl_core_cpp-2.5.0b20240509.dist-info}/METADATA +1 -1
  46. bigdl_core_cpp-2.5.0b20240509.dist-info/RECORD +55 -0
  47. bigdl_core_cpp-2.5.0b20240507.dist-info/RECORD +0 -54
  48. {bigdl_core_cpp-2.5.0b20240507.data → bigdl_core_cpp-2.5.0b20240509.data}/scripts/init-llama-cpp.ps1 +0 -0
  49. {bigdl_core_cpp-2.5.0b20240507.data → bigdl_core_cpp-2.5.0b20240509.data}/scripts/init-ollama.bat +0 -0
  50. {bigdl_core_cpp-2.5.0b20240507.dist-info → bigdl_core_cpp-2.5.0b20240509.dist-info}/WHEEL +0 -0
  51. {bigdl_core_cpp-2.5.0b20240507.dist-info → bigdl_core_cpp-2.5.0b20240509.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert-hf-to-gguf.py
@@ -0,0 +1,3177 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import argparse
7
+ import contextlib
8
+ import json
9
+ import os
10
+ import re
11
+ import sys
12
+ from abc import ABC, abstractmethod
13
+ from enum import IntEnum
14
+ from pathlib import Path
15
+ from hashlib import sha256
16
+ from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
17
+
18
+ import numpy as np
19
+ import torch
20
+
21
+ if TYPE_CHECKING:
22
+ from torch import Tensor
23
+
24
+ if 'NO_LOCAL_GGUF' not in os.environ:
25
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
26
+ import gguf
27
+
28
+ from convert import LlamaHfVocab, permute
29
+
30
+ logger = logging.getLogger("hf-to-gguf")
31
+
32
+
33
+ ###### MODEL DEFINITIONS ######
34
+
35
+ class SentencePieceTokenTypes(IntEnum):
36
+ NORMAL = 1
37
+ UNKNOWN = 2
38
+ CONTROL = 3
39
+ USER_DEFINED = 4
40
+ UNUSED = 5
41
+ BYTE = 6
42
+
43
+
44
+ AnyModel = TypeVar("AnyModel", bound="type[Model]")
45
+
46
+
47
+ class Model(ABC):
48
+ _model_classes: dict[str, type[Model]] = {}
49
+
50
+ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
51
+ self.dir_model = dir_model
52
+ self.ftype = ftype
53
+ self.fname_out = fname_out
54
+ self.is_big_endian = is_big_endian
55
+ self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
56
+ self.use_temp_file = use_temp_file
57
+ self.is_safetensors = self._is_model_safetensors()
58
+ self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
59
+ self.part_names = self._get_part_names()
60
+ self.hparams = Model.load_hparams(self.dir_model)
61
+ self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
62
+ self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
63
+
64
+ @property
65
+ @abstractmethod
66
+ def model_arch(self) -> gguf.MODEL_ARCH:
67
+ pass
68
+
69
+ def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
70
+ key = next((k for k in keys if k in self.hparams), None)
71
+ if key is not None:
72
+ return self.hparams[key]
73
+ if optional:
74
+ return None
75
+ raise KeyError(f"could not find any of: {keys}")
76
+
77
+ def set_vocab(self):
78
+ self._set_vocab_gpt2()
79
+
80
+ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
81
+ for part_name in self.part_names:
82
+ logger.info(f"gguf: loading model part '{part_name}'")
83
+ ctx: ContextManager[Any]
84
+ if self.is_safetensors:
85
+ from safetensors import safe_open
86
+ ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
87
+ else:
88
+ ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
89
+
90
+ with ctx as model_part:
91
+ for name in model_part.keys():
92
+ data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
93
+ yield name, data
94
+
95
+ def set_gguf_parameters(self):
96
+ self.gguf_writer.add_name(self.dir_model.name)
97
+ self.gguf_writer.add_block_count(self.block_count)
98
+
99
+ if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
100
+ self.gguf_writer.add_context_length(n_ctx)
101
+ logger.info(f"gguf: context length = {n_ctx}")
102
+
103
+ n_embd = self.find_hparam(["hidden_size", "n_embd"])
104
+ self.gguf_writer.add_embedding_length(n_embd)
105
+ logger.info(f"gguf: embedding length = {n_embd}")
106
+
107
+ if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
108
+ self.gguf_writer.add_feed_forward_length(n_ff)
109
+ logger.info(f"gguf: feed forward length = {n_ff}")
110
+
111
+ n_head = self.find_hparam(["num_attention_heads", "n_head"])
112
+ self.gguf_writer.add_head_count(n_head)
113
+ logger.info(f"gguf: head count = {n_head}")
114
+
115
+ if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
116
+ self.gguf_writer.add_head_count_kv(n_head_kv)
117
+ logger.info(f"gguf: key-value head count = {n_head_kv}")
118
+
119
+ if (rope_theta := self.hparams.get("rope_theta")) is not None:
120
+ self.gguf_writer.add_rope_freq_base(rope_theta)
121
+ logger.info(f"gguf: rope theta = {rope_theta}")
122
+ if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
123
+ self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
124
+ logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
125
+ if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
126
+ self.gguf_writer.add_layer_norm_eps(f_norm_eps)
127
+ logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
128
+ if (n_experts := self.hparams.get("num_local_experts")) is not None:
129
+ self.gguf_writer.add_expert_count(n_experts)
130
+ logger.info(f"gguf: expert count = {n_experts}")
131
+ if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
132
+ self.gguf_writer.add_expert_used_count(n_experts_used)
133
+ logger.info(f"gguf: experts used count = {n_experts_used}")
134
+
135
+ self.gguf_writer.add_file_type(self.ftype)
136
+ logger.info(f"gguf: file type = {self.ftype}")
137
+
138
+ def write_tensors(self):
139
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
140
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
141
+ for name, data_torch in self.get_tensors():
142
+ # we don't need these
143
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
144
+ continue
145
+
146
+ old_dtype = data_torch.dtype
147
+
148
+ # convert any unsupported data types to float32
149
+ if data_torch.dtype not in (torch.float16, torch.float32):
150
+ data_torch = data_torch.to(torch.float32)
151
+
152
+ data = data_torch.squeeze().numpy()
153
+
154
+ # map tensor names
155
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
156
+ if new_name is None:
157
+ raise ValueError(f"Can not map tensor {name!r}")
158
+
159
+ n_dims = len(data.shape)
160
+ data_dtype = data.dtype
161
+
162
+ # if f32 desired, convert any float16 to float32
163
+ if self.ftype == 0 and data_dtype == np.float16:
164
+ data = data.astype(np.float32)
165
+
166
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
167
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
168
+ data = data.astype(np.float32)
169
+
170
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
171
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
172
+ data = data.astype(np.float16)
173
+
174
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
175
+
176
+ self.gguf_writer.add_tensor(new_name, data)
177
+
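The dtype policy applied in write_tensors() above (ftype 0 keeps everything in f32; ftype 1 stores 2-D weight matrices as f16 but keeps 1-D tensors and *_norm.weight tensors in f32) can be summarised as a small pure function. This is only an illustrative sketch, not part of the package:

# Illustrative sketch of the dtype policy in Model.write_tensors (not part of the package).
import numpy as np

def target_dtype(ftype: int, data_dtype, n_dims: int, name: str, new_name: str):
    if ftype == 0 and data_dtype == np.float16:
        return np.float32   # f32 output: upcast any f16 tensor
    if ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
        return np.float32   # norms and 1-D tensors stay f32 even for f16 output
    if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
        return np.float16   # 2-D weight matrices are downcast to f16
    return data_dtype       # everything else keeps its dtype

assert target_dtype(1, np.dtype(np.float32), 2, "blk.weight", "blk.weight") == np.float16
assert target_dtype(1, np.dtype(np.float16), 1, "norm.bias", "norm.bias") == np.float32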
178
+ def write(self):
179
+ self.write_tensors()
180
+ self.gguf_writer.write_header_to_file()
181
+ self.gguf_writer.write_kv_data_to_file()
182
+ self.gguf_writer.write_tensors_to_file()
183
+ self.gguf_writer.close()
184
+
185
+ def write_vocab(self):
186
+ self.gguf_writer.write_header_to_file()
187
+ self.gguf_writer.write_kv_data_to_file()
188
+ self.gguf_writer.close()
189
+
190
+ @staticmethod
191
+ def count_model_parts(dir_model: Path, prefix: str) -> int:
192
+ num_parts = 0
193
+ for filename in os.listdir(dir_model):
194
+ if filename.endswith(prefix):
195
+ num_parts += 1
196
+
197
+ return num_parts
198
+
199
+ @staticmethod
200
+ def load_hparams(dir_model):
201
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
202
+ return json.load(f)
203
+
204
+ @classmethod
205
+ def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
206
+ assert names
207
+
208
+ def func(modelcls: type[Model]):
209
+ for name in names:
210
+ cls._model_classes[name] = modelcls
211
+ return modelcls
212
+ return func
213
+
214
+ @classmethod
215
+ def from_model_architecture(cls, arch):
216
+ try:
217
+ return cls._model_classes[arch]
218
+ except KeyError:
219
+ raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
220
+
221
+ def _is_model_safetensors(self) -> bool:
222
+ return Model.count_model_parts(self.dir_model, ".safetensors") > 0
223
+
224
+ def _get_part_names(self):
225
+ if self.is_safetensors:
226
+ if self.num_parts == 1: # there's only one .safetensors file
227
+ return ("model.safetensors",)
228
+ return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
229
+
230
+ if self.num_parts == 1: # there's only one .bin file
231
+ return ("pytorch_model.bin",)
232
+ return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
233
+
234
+ # used for GPT-2 BPE and WordPiece vocabs
235
+ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
236
+ tokens: list[str] = []
237
+ toktypes: list[int] = []
238
+
239
+ from transformers import AutoTokenizer
240
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
241
+ vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
242
+ assert max(tokenizer.vocab.values()) < vocab_size
243
+
244
+ tokpre = self.get_vocab_base_pre(tokenizer)
245
+
246
+ reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
247
+ added_vocab = tokenizer.get_added_vocab()
248
+
249
+ for i in range(vocab_size):
250
+ if i not in reverse_vocab:
251
+ tokens.append(f"[PAD{i}]")
252
+ toktypes.append(gguf.TokenType.USER_DEFINED)
253
+ elif reverse_vocab[i] in added_vocab:
254
+ tokens.append(reverse_vocab[i])
255
+ if tokenizer.added_tokens_decoder[i].special:
256
+ toktypes.append(gguf.TokenType.CONTROL)
257
+ else:
258
+ toktypes.append(gguf.TokenType.USER_DEFINED)
259
+ else:
260
+ tokens.append(reverse_vocab[i])
261
+ toktypes.append(gguf.TokenType.NORMAL)
262
+
263
+ return tokens, toktypes, tokpre
264
+
265
+ # NOTE: this function is generated by convert-hf-to-gguf-update.py
266
+ # do not modify it manually!
267
+ # ref: https://github.com/ggerganov/llama.cpp/pull/6920
268
+ def get_vocab_base_pre(self, tokenizer) -> str:
269
+ # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
270
+ # is specific for the BPE pre-tokenizer used by the model
271
+ # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
272
+ # use in llama.cpp to implement the same pre-tokenizer
273
+
274
+ chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
275
+
276
+ chktok = tokenizer.encode(chktxt)
277
+ chkhsh = sha256(str(chktok).encode()).hexdigest()
278
+
279
+ logger.debug(f"chktok: {chktok}")
280
+ logger.debug(f"chkhsh: {chkhsh}")
281
+
282
+ res = None
283
+
284
+ # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
285
+ # or pull the latest version of the model from Huggingface
286
+ # don't edit the hashes manually!
287
+ if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
288
+ # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
289
+ res = "llama-bpe"
290
+ if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
291
+ # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
292
+ res = "deepseek-llm"
293
+ if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
294
+ # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
295
+ res = "deepseek-coder"
296
+ if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
297
+ # ref: https://huggingface.co/tiiuae/falcon-7b
298
+ res = "falcon"
299
+ if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
300
+ # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
301
+ res = "bert-bge"
302
+ if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
303
+ # ref: https://huggingface.co/mosaicml/mpt-7b
304
+ res = "mpt"
305
+ if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
306
+ # ref: https://huggingface.co/bigcode/starcoder2-3b
307
+ res = "starcoder"
308
+ if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
309
+ # ref: https://huggingface.co/openai-community/gpt2
310
+ res = "gpt-2"
311
+ if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
312
+ # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
313
+ res = "refact"
314
+ if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
315
+ # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
316
+ res = "command-r"
317
+ if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
318
+ # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
319
+ res = "olmo"
320
+
321
+ if res is None:
322
+ logger.warning("\n")
323
+ logger.warning("**************************************************************************************")
324
+ logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
325
+ logger.warning("** There are 2 possible reasons for this:")
326
+ logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
327
+ logger.warning("** - the pre-tokenization config has changed upstream")
328
+ logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
329
+ logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
330
+ logger.warning("**")
331
+ logger.warning(f"** chkhsh: {chkhsh}")
332
+ logger.warning("**************************************************************************************")
333
+ logger.warning("\n")
334
+ raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
335
+
336
+ logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
337
+ logger.debug(f"chkhsh: {chkhsh}")
338
+
339
+ return res
340
+
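The fingerprinting approach described in the comments above can be reproduced on its own. A minimal sketch, assuming a hypothetical local model directory ./my-model containing a Hugging Face tokenizer, and using a shortened probe string in place of the script's long fixed chktxt:

# Minimal sketch of the pre-tokenizer fingerprinting idea used above (not part of the package).
from hashlib import sha256
from transformers import AutoTokenizer

chktxt = "Hello world 123 \n\t ..."                  # the real script uses a much longer fixed probe string
tokenizer = AutoTokenizer.from_pretrained("./my-model")  # hypothetical local model directory
chktok = tokenizer.encode(chktxt)                    # token ids depend on the BPE pre-tokenizer
chkhsh = sha256(str(chktok).encode()).hexdigest()    # stable fingerprint of that behaviour
print(chkhsh)                                        # compare against the known hashes listed above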
341
+ def _set_vocab_gpt2(self) -> None:
342
+ tokens, toktypes, tokpre = self.get_vocab_base()
343
+ self.gguf_writer.add_tokenizer_model("gpt2")
344
+ self.gguf_writer.add_tokenizer_pre(tokpre)
345
+ self.gguf_writer.add_token_list(tokens)
346
+ self.gguf_writer.add_token_types(toktypes)
347
+
348
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
349
+ special_vocab.add_to_gguf(self.gguf_writer)
350
+
351
+ def _set_vocab_qwen(self):
352
+ dir_model = self.dir_model
353
+ hparams = self.hparams
354
+ tokens: list[str] = []
355
+ toktypes: list[int] = []
356
+
357
+ from transformers import AutoTokenizer
358
+ tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
359
+ vocab_size = hparams["vocab_size"]
360
+ assert max(tokenizer.get_vocab().values()) < vocab_size
361
+
362
+ tokpre = self.get_vocab_base_pre(tokenizer)
363
+
364
+ merges = []
365
+ vocab = {}
366
+ mergeable_ranks = tokenizer.mergeable_ranks
367
+ for token, rank in mergeable_ranks.items():
368
+ vocab[QwenModel.token_bytes_to_string(token)] = rank
369
+ if len(token) == 1:
370
+ continue
371
+ merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
372
+ assert len(merged) == 2
373
+ merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
374
+
375
+ # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
376
+ added_vocab = tokenizer.special_tokens
377
+ reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
378
+
379
+ for i in range(vocab_size):
380
+ if i not in reverse_vocab:
381
+ tokens.append(f"[PAD{i}]")
382
+ toktypes.append(gguf.TokenType.USER_DEFINED)
383
+ elif reverse_vocab[i] in added_vocab:
384
+ tokens.append(reverse_vocab[i])
385
+ toktypes.append(gguf.TokenType.CONTROL)
386
+ else:
387
+ tokens.append(reverse_vocab[i])
388
+ toktypes.append(gguf.TokenType.NORMAL)
389
+
390
+ self.gguf_writer.add_tokenizer_model("gpt2")
391
+ self.gguf_writer.add_tokenizer_pre(tokpre)
392
+ self.gguf_writer.add_token_list(tokens)
393
+ self.gguf_writer.add_token_types(toktypes)
394
+
395
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
396
+ special_vocab.merges = merges
397
+ # only add special tokens when they were not already loaded from config.json
398
+ if len(special_vocab.special_token_ids) == 0:
399
+ special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
400
+ special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
401
+ # this one is usually not in config.json anyway
402
+ special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
403
+ special_vocab.add_to_gguf(self.gguf_writer)
404
+
405
+ def _set_vocab_sentencepiece(self):
406
+ from sentencepiece import SentencePieceProcessor
407
+
408
+ tokenizer_path = self.dir_model / 'tokenizer.model'
409
+
410
+ tokens: list[bytes] = []
411
+ scores: list[float] = []
412
+ toktypes: list[int] = []
413
+
414
+ if not tokenizer_path.is_file():
415
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
416
+
417
+ tokenizer = SentencePieceProcessor(str(tokenizer_path))
418
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
419
+
420
+ for token_id in range(tokenizer.vocab_size()):
421
+ piece = tokenizer.id_to_piece(token_id)
422
+ text = piece.encode("utf-8")
423
+ score = tokenizer.get_score(token_id)
424
+
425
+ toktype = SentencePieceTokenTypes.NORMAL
426
+ if tokenizer.is_unknown(token_id):
427
+ toktype = SentencePieceTokenTypes.UNKNOWN
428
+ elif tokenizer.is_control(token_id):
429
+ toktype = SentencePieceTokenTypes.CONTROL
430
+ elif tokenizer.is_unused(token_id):
431
+ toktype = SentencePieceTokenTypes.UNUSED
432
+ elif tokenizer.is_byte(token_id):
433
+ toktype = SentencePieceTokenTypes.BYTE
434
+
435
+ tokens.append(text)
436
+ scores.append(score)
437
+ toktypes.append(toktype)
438
+
439
+ added_tokens_file = self.dir_model / 'added_tokens.json'
440
+ if added_tokens_file.is_file():
441
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
442
+ added_tokens_json = json.load(f)
443
+
444
+ for key in added_tokens_json:
445
+ key = key.encode("utf-8")
446
+ if key not in tokens:
447
+ tokens.append(key)
448
+ scores.append(-1000.0)
449
+ toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
450
+
451
+ if vocab_size > len(tokens):
452
+ pad_count = vocab_size - len(tokens)
453
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
454
+ for i in range(1, pad_count + 1):
455
+ tokens.append(f"[PAD{i}]")
456
+ scores.append(-1000.0)
457
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
458
+
459
+ assert len(tokens) == vocab_size
460
+
461
+ self.gguf_writer.add_tokenizer_model("llama")
462
+ self.gguf_writer.add_tokenizer_pre("default")
463
+ self.gguf_writer.add_token_list(tokens)
464
+ self.gguf_writer.add_token_scores(scores)
465
+ self.gguf_writer.add_token_types(toktypes)
466
+
467
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
468
+ special_vocab.add_to_gguf(self.gguf_writer)
469
+
470
+ def _set_vocab_llama_hf(self):
471
+ vocab = LlamaHfVocab(self.dir_model)
472
+ tokens = []
473
+ scores = []
474
+ toktypes = []
475
+
476
+ for text, score, toktype in vocab.all_tokens():
477
+ tokens.append(text)
478
+ scores.append(score)
479
+ toktypes.append(toktype)
480
+
481
+ assert len(tokens) == vocab.vocab_size
482
+
483
+ self.gguf_writer.add_tokenizer_model("llama")
484
+ self.gguf_writer.add_tokenizer_pre("default")
485
+ self.gguf_writer.add_token_list(tokens)
486
+ self.gguf_writer.add_token_scores(scores)
487
+ self.gguf_writer.add_token_types(toktypes)
488
+
489
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
490
+ special_vocab.add_to_gguf(self.gguf_writer)
491
+
492
+
493
+ @Model.register("GPTNeoXForCausalLM")
494
+ class GPTNeoXModel(Model):
495
+ model_arch = gguf.MODEL_ARCH.GPTNEOX
496
+
497
+ def set_gguf_parameters(self):
498
+ block_count = self.hparams["num_hidden_layers"]
499
+
500
+ self.gguf_writer.add_name(self.dir_model.name)
501
+ self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
502
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
503
+ self.gguf_writer.add_block_count(block_count)
504
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
505
+ self.gguf_writer.add_rope_dimension_count(
506
+ int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
507
+ )
508
+ self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
509
+ self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
510
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
511
+
512
+
513
+ @Model.register("BloomForCausalLM")
514
+ class BloomModel(Model):
515
+ model_arch = gguf.MODEL_ARCH.BLOOM
516
+
517
+ def set_gguf_parameters(self):
518
+ self.gguf_writer.add_name("Bloom")
519
+ n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
520
+ n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
521
+ self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
522
+ self.gguf_writer.add_embedding_length(n_embed)
523
+ self.gguf_writer.add_feed_forward_length(4 * n_embed)
524
+ self.gguf_writer.add_block_count(self.hparams["n_layer"])
525
+ self.gguf_writer.add_head_count(n_head)
526
+ self.gguf_writer.add_head_count_kv(n_head)
527
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
528
+ self.gguf_writer.add_file_type(self.ftype)
529
+
530
+ def write_tensors(self):
531
+ block_count = self.hparams["n_layer"]
532
+ tensors = dict(self.get_tensors())
533
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
534
+ has_lm_head = True
535
+ n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
536
+ n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
537
+
538
+ for name, data_torch in tensors.items():
539
+ if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
540
+ has_lm_head = False
541
+
542
+ name = re.sub(r'transformer\.', '', name)
543
+
544
+ old_dtype = data_torch.dtype
545
+
546
+ # convert any unsupported data types to float32
547
+ if data_torch.dtype not in (torch.float16, torch.float32):
548
+ data_torch = data_torch.to(torch.float32)
549
+
550
+ data = data_torch.squeeze().numpy()
551
+
552
+ if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
553
+ # Map bloom-style qkv_linear to gpt-style qkv_linear
554
+ # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
555
+ # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
556
+ qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
557
+ data = np.concatenate(
558
+ (
559
+ qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
560
+ qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
561
+ qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
562
+ ),
563
+ axis=0,
564
+ )
565
+ logger.info("re-format attention.linear_qkv.weight")
566
+ elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
567
+ qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
568
+ data = np.concatenate(
569
+ (
570
+ qkv_bias[:, 0, :].reshape((n_embed,)),
571
+ qkv_bias[:, 1, :].reshape((n_embed,)),
572
+ qkv_bias[:, 2, :].reshape((n_embed,)),
573
+ ),
574
+ axis=0,
575
+ )
576
+ logger.info("re-format attention.linear_qkv.bias")
577
+
578
+ # map tensor names
579
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
580
+ if new_name is None:
581
+ raise ValueError(f"Can not map tensor {name!r}")
582
+
583
+ n_dims = len(data.shape)
584
+ data_dtype = data.dtype
585
+
586
+ # if f32 desired, convert any float16 to float32
587
+ if self.ftype == 0 and data_dtype == np.float16:
588
+ data = data.astype(np.float32)
589
+
590
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
591
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
592
+ data = data.astype(np.float32)
593
+
594
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
595
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
596
+ data = data.astype(np.float16)
597
+
598
+ logger.info(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
599
+
600
+ self.gguf_writer.add_tensor(new_name, data)
601
+
602
+ if not has_lm_head and name == "word_embeddings.weight":
603
+ self.gguf_writer.add_tensor("output.weight", data)
604
+ logger.info(f"{name} => output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
605
+
606
+
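To make the BLOOM query_key_value re-layout above concrete, here is a toy numpy sketch (illustrative only, not part of the package) showing how the per-head interleaved [q, k, v] blocks are regrouped into contiguous Q, K and V sections:

# Toy illustration of the BLOOM query_key_value re-layout done above (not part of the package).
import numpy as np

n_head, n_embed = 2, 4                                      # tiny toy sizes
head_dim = n_embed // n_head
fused = np.arange(3 * n_embed * n_embed).reshape(3 * n_embed, n_embed)  # stand-in qkv weight

qkv = fused.reshape(n_head, 3, head_dim, n_embed)           # bloom layout: per head, [q, k, v]
regrouped = np.concatenate(
    (
        qkv[:, 0, :, :].reshape(-1, n_embed),               # all query rows first
        qkv[:, 1, :, :].reshape(-1, n_embed),               # then all key rows
        qkv[:, 2, :, :].reshape(-1, n_embed),               # then all value rows
    ),
    axis=0,
)
assert regrouped.shape == fused.shape                       # same tensor, gpt-2 style grouping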
607
+ @Model.register("MPTForCausalLM")
608
+ class MPTModel(Model):
609
+ model_arch = gguf.MODEL_ARCH.MPT
610
+
611
+ def set_vocab(self):
612
+ try:
613
+ self._set_vocab_gpt2()
614
+ except Exception:
615
+ # Fallback for SEA-LION model
616
+ self._set_vocab_sentencepiece()
617
+ self.gguf_writer.add_add_bos_token(False)
618
+ self.gguf_writer.add_pad_token_id(3)
619
+ self.gguf_writer.add_eos_token_id(1)
620
+ self.gguf_writer.add_unk_token_id(0)
621
+
622
+ def set_gguf_parameters(self):
623
+ block_count = self.hparams["n_layers"]
624
+ self.gguf_writer.add_name(self.dir_model.name)
625
+ self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
626
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
627
+ self.gguf_writer.add_block_count(block_count)
628
+ self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
629
+ self.gguf_writer.add_head_count(self.hparams["n_heads"])
630
+ if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
631
+ self.gguf_writer.add_head_count_kv(kv_n_heads)
632
+ self.gguf_writer.add_layer_norm_eps(1e-5)
633
+ if self.hparams["attn_config"]["clip_qkv"] is not None:
634
+ self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
635
+ if self.hparams["attn_config"]["alibi"]:
636
+ self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
637
+ else:
638
+ self.gguf_writer.add_max_alibi_bias(0.0)
639
+
640
+ def write_tensors(self):
641
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
642
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
643
+ for name, data_torch in self.get_tensors():
644
+ # we don't need these
645
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
646
+ continue
647
+
648
+ old_dtype = data_torch.dtype
649
+
650
+ # convert any unsupported data types to float32
651
+ if data_torch.dtype not in (torch.float16, torch.float32):
652
+ data_torch = data_torch.to(torch.float32)
653
+
654
+ data = data_torch.squeeze().numpy()
655
+
656
+ # map tensor names
657
+ if "scales" in name:
658
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
659
+ if new_name is not None:
660
+ new_name = new_name.replace("scales", "act.scales")
661
+ else:
662
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
663
+ if new_name is None:
664
+ raise ValueError(f"Can not map tensor {name!r}")
665
+
666
+ n_dims = len(data.shape)
667
+ data_dtype = data.dtype
668
+
669
+ # if f32 desired, convert any float16 to float32
670
+ if self.ftype == 0 and data_dtype == np.float16:
671
+ data = data.astype(np.float32)
672
+
673
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
674
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
675
+ data = data.astype(np.float32)
676
+
677
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
678
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
679
+ data = data.astype(np.float16)
680
+
681
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
682
+
683
+ self.gguf_writer.add_tensor(new_name, data)
684
+
685
+
686
+ @Model.register("OrionForCausalLM")
687
+ class OrionModel(Model):
688
+ model_arch = gguf.MODEL_ARCH.ORION
689
+
690
+ def set_vocab(self):
691
+ self._set_vocab_sentencepiece()
692
+
693
+ def set_gguf_parameters(self):
694
+ block_count = self.hparams["num_hidden_layers"]
695
+ head_count = self.hparams["num_attention_heads"]
696
+ head_count_kv = self.hparams.get("num_key_value_heads", head_count)
697
+ hf_repo = self.hparams.get("_name_or_path", "")
698
+
699
+ ctx_length = 0
700
+ if "max_sequence_length" in self.hparams:
701
+ ctx_length = self.hparams["max_sequence_length"]
702
+ elif "max_position_embeddings" in self.hparams:
703
+ ctx_length = self.hparams["max_position_embeddings"]
704
+ elif "model_max_length" in self.hparams:
705
+ ctx_length = self.hparams["model_max_length"]
706
+ else:
707
+ raise ValueError("gguf: can not find ctx length parameter.")
708
+
709
+ self.gguf_writer.add_file_type(self.ftype)
710
+ self.gguf_writer.add_name(self.dir_model.name)
711
+ self.gguf_writer.add_source_hf_repo(hf_repo)
712
+ self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
713
+ self.gguf_writer.add_context_length(ctx_length)
714
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
715
+ self.gguf_writer.add_block_count(block_count)
716
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
717
+ self.gguf_writer.add_head_count(head_count)
718
+ self.gguf_writer.add_head_count_kv(head_count_kv)
719
+ # note: config provides rms norm but it is actually layer norm
720
+ # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
721
+ self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
722
+
723
+ def write_tensors(self):
724
+ # Collect tensors from generator object
725
+ model_kv = dict(self.get_tensors())
726
+ block_count = self.hparams["num_hidden_layers"]
727
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
728
+
729
+ for name, data_torch in model_kv.items():
730
+ # we don't need these
731
+ if name.endswith(".rotary_emb.inv_freq"):
732
+ continue
733
+
734
+ old_dtype = data_torch.dtype
735
+
736
+ # convert any unsupported data types to float32
737
+ if data_torch.dtype not in (torch.float16, torch.float32):
738
+ data_torch = data_torch.to(torch.float32)
739
+
740
+ data = data_torch.squeeze().numpy()
741
+
742
+ # map tensor names
743
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
744
+ if new_name is None:
745
+ raise ValueError(f"Can not map tensor {name!r}")
746
+
747
+ n_dims = len(data.shape)
748
+ data_dtype = data.dtype
749
+
750
+ # if f32 desired, convert any float16 to float32
751
+ if self.ftype == 0 and data_dtype == np.float16:
752
+ data = data.astype(np.float32)
753
+
754
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
755
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
756
+ data = data.astype(np.float32)
757
+
758
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
759
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
760
+ data = data.astype(np.float16)
761
+
762
+ logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
763
+ self.gguf_writer.add_tensor(new_name, data)
764
+
765
+
766
+ @Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
767
+ class BaichuanModel(Model):
768
+ model_arch = gguf.MODEL_ARCH.BAICHUAN
769
+
770
+ def set_vocab(self):
771
+ self._set_vocab_sentencepiece()
772
+
773
+ def set_gguf_parameters(self):
774
+ block_count = self.hparams["num_hidden_layers"]
775
+ head_count = self.hparams["num_attention_heads"]
776
+ head_count_kv = self.hparams.get("num_key_value_heads", head_count)
777
+ hf_repo = self.hparams.get("_name_or_path", "")
778
+
779
+ ctx_length = 0
780
+ if "max_sequence_length" in self.hparams:
781
+ ctx_length = self.hparams["max_sequence_length"]
782
+ elif "max_position_embeddings" in self.hparams:
783
+ ctx_length = self.hparams["max_position_embeddings"]
784
+ elif "model_max_length" in self.hparams:
785
+ ctx_length = self.hparams["model_max_length"]
786
+ else:
787
+ raise ValueError("gguf: can not find ctx length parameter.")
788
+
789
+ self.gguf_writer.add_name(self.dir_model.name)
790
+ self.gguf_writer.add_source_hf_repo(hf_repo)
791
+ self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
792
+ self.gguf_writer.add_context_length(ctx_length)
793
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
794
+ self.gguf_writer.add_block_count(block_count)
795
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
796
+ self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
797
+ self.gguf_writer.add_head_count(head_count)
798
+ self.gguf_writer.add_head_count_kv(head_count_kv)
799
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
800
+
801
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
802
+ if self.hparams["rope_scaling"].get("type") == "linear":
803
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
804
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
805
+
806
+ def write_tensors(self):
807
+ # Collect tensors from generator object
808
+ model_kv = dict(self.get_tensors())
809
+ block_count = self.hparams["num_hidden_layers"]
810
+ head_count = self.hparams["num_attention_heads"]
811
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
812
+ head_count_kv = self.hparams.get("num_key_value_heads", head_count)
813
+
814
+ for i in range(block_count):
815
+ if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
816
+ logger.info(f"Unpacking and permuting layer {i}")
817
+ model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
818
+ self._reverse_hf_permute_part(w, 0, head_count, head_count)
819
+ model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
820
+ self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
821
+ model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
822
+ self._reverse_hf_part(w, 2)
823
+ del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
824
+
825
+ for name, data_torch in model_kv.items():
826
+ # we don't need these
827
+ if name.endswith(".rotary_emb.inv_freq"):
828
+ continue
829
+
830
+ old_dtype = data_torch.dtype
831
+
832
+ # convert any unsupported data types to float32
833
+ if data_torch.dtype not in (torch.float16, torch.float32):
834
+ data_torch = data_torch.to(torch.float32)
835
+
836
+ data = data_torch.squeeze().numpy()
837
+
838
+ # map tensor names
839
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
840
+ if new_name is None:
841
+ raise ValueError(f"Can not map tensor {name!r}")
842
+
843
+ n_dims = len(data.shape)
844
+ data_dtype = data.dtype
845
+
846
+ # if f32 desired, convert any float16 to float32
847
+ if self.ftype == 0 and data_dtype == np.float16:
848
+ data = data.astype(np.float32)
849
+
850
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
851
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
852
+ data = data.astype(np.float32)
853
+
854
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
855
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
856
+ data = data.astype(np.float16)
857
+
858
+ logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
859
+ self.gguf_writer.add_tensor(new_name, data)
860
+
861
+ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
862
+ if n_kv_head is not None and n_head != n_kv_head:
863
+ n_head //= n_kv_head
864
+
865
+ return (
866
+ weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
867
+ .swapaxes(1, 2)
868
+ .reshape(weights.shape)
869
+ )
870
+
871
+ def _reverse_hf_permute_part(
872
+ self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
873
+ ) -> Tensor:
874
+ r = weights.shape[0] // 3
875
+ return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
876
+
877
+ def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
878
+ r = weights.shape[0] // 3
879
+ return weights[r * n_part:r * n_part + r, ...]
880
+
881
+
882
+ @Model.register("XverseForCausalLM")
883
+ class XverseModel(Model):
884
+ model_arch = gguf.MODEL_ARCH.XVERSE
885
+
886
+ def set_vocab(self):
887
+ assert (self.dir_model / "tokenizer.json").is_file()
888
+ dir_model = self.dir_model
889
+ hparams = self.hparams
890
+
891
+ tokens: list[bytearray] = []
892
+ toktypes: list[int] = []
893
+
894
+ from transformers import AutoTokenizer
895
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
896
+ vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
897
+ assert max(tokenizer.vocab.values()) < vocab_size
898
+
899
+ reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
900
+ added_vocab = tokenizer.get_added_vocab()
901
+
902
+ for token_id in range(vocab_size):
903
+ token_text = reverse_vocab[token_id].encode('utf-8')
904
+ # replace "\x00" to string with length > 0
905
+ if token_text == b"\x00":
906
+ toktype = gguf.TokenType.BYTE # special
907
+ token_text = f"<{token_text}>".encode('utf-8')
908
+ elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
909
+ toktype = gguf.TokenType.BYTE # special
910
+ elif reverse_vocab[token_id] in added_vocab:
911
+ if tokenizer.added_tokens_decoder[token_id].special:
912
+ toktype = gguf.TokenType.CONTROL
913
+ else:
914
+ toktype = gguf.TokenType.USER_DEFINED
915
+ else:
916
+ toktype = gguf.TokenType.NORMAL
917
+
918
+ tokens.append(token_text)
919
+ toktypes.append(toktype)
920
+
921
+ self.gguf_writer.add_tokenizer_model("llama")
922
+ self.gguf_writer.add_tokenizer_pre("default")
923
+ self.gguf_writer.add_token_list(tokens)
924
+ self.gguf_writer.add_token_types(toktypes)
925
+
926
+ special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
927
+ special_vocab.add_to_gguf(self.gguf_writer)
928
+
929
+ def set_gguf_parameters(self):
930
+ block_count = self.hparams["num_hidden_layers"]
931
+ head_count = self.hparams["num_attention_heads"]
932
+ head_count_kv = self.hparams.get("num_key_value_heads", head_count)
933
+ hf_repo = self.hparams.get("_name_or_path", "")
934
+
935
+ ctx_length = 0
936
+ if "max_sequence_length" in self.hparams:
937
+ ctx_length = self.hparams["max_sequence_length"]
938
+ elif "max_position_embeddings" in self.hparams:
939
+ ctx_length = self.hparams["max_position_embeddings"]
940
+ elif "model_max_length" in self.hparams:
941
+ ctx_length = self.hparams["model_max_length"]
942
+ else:
943
+ raise ValueError("gguf: can not find ctx length parameter.")
944
+
945
+ self.gguf_writer.add_name(self.dir_model.name)
946
+ self.gguf_writer.add_source_hf_repo(hf_repo)
947
+ self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
948
+ self.gguf_writer.add_context_length(ctx_length)
949
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
950
+ self.gguf_writer.add_block_count(block_count)
951
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
952
+ self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
953
+ self.gguf_writer.add_head_count(head_count)
954
+ self.gguf_writer.add_head_count_kv(head_count_kv)
955
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
956
+
957
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
958
+ if self.hparams["rope_scaling"].get("type") == "linear":
959
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
960
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
961
+
962
+ def write_tensors(self):
963
+ # Collect tensors from generator object
964
+ model_kv = dict(self.get_tensors())
965
+ block_count = self.hparams["num_hidden_layers"]
966
+ head_count = self.hparams["num_attention_heads"]
967
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
968
+ head_count_kv = self.hparams.get("num_key_value_heads", head_count)
969
+
970
+ for name, data_torch in model_kv.items():
971
+ # we don't need these
972
+ if name.endswith(".rotary_emb.inv_freq"):
973
+ continue
974
+
975
+ old_dtype = data_torch.dtype
976
+
977
+ # convert any unsupported data types to float32
978
+ if data_torch.dtype not in (torch.float16, torch.float32):
979
+ data_torch = data_torch.to(torch.float32)
980
+
981
+ # HF models permute some of the tensors, so we need to undo that
982
+ if name.endswith(("q_proj.weight")):
983
+ data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
984
+ if name.endswith(("k_proj.weight")):
985
+ data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
986
+
987
+ data = data_torch.squeeze().numpy()
988
+
989
+ # map tensor names
990
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
991
+ if new_name is None:
992
+ raise ValueError(f"Can not map tensor {name!r}")
993
+
994
+ n_dims = len(data.shape)
995
+ data_dtype = data.dtype
996
+
997
+ # if f32 desired, convert any float16 to float32
998
+ if self.ftype == 0 and data_dtype == np.float16:
999
+ data = data.astype(np.float32)
1000
+
1001
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
1002
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1003
+ data = data.astype(np.float32)
1004
+
1005
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1006
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1007
+ data = data.astype(np.float16)
1008
+
1009
+ logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1010
+ self.gguf_writer.add_tensor(new_name, data)
1011
+
1012
+ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
1013
+ if n_kv_head is not None and n_head != n_kv_head:
1014
+ n_head //= n_kv_head
1015
+
1016
+ return (
1017
+ weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
1018
+ .swapaxes(1, 2)
1019
+ .reshape(weights.shape)
1020
+ )
1021
+
1022
+
1023
+ @Model.register("FalconForCausalLM", "RWForCausalLM")
1024
+ class FalconModel(Model):
1025
+ model_arch = gguf.MODEL_ARCH.FALCON
1026
+
1027
+ def set_gguf_parameters(self):
1028
+ block_count = self.hparams.get("num_hidden_layers")
1029
+ if block_count is None:
1030
+ block_count = self.hparams["n_layer"] # old name
1031
+
1032
+ n_head = self.hparams.get("num_attention_heads")
1033
+ if n_head is None:
1034
+ n_head = self.hparams["n_head"] # old name
1035
+
1036
+ n_head_kv = self.hparams.get("num_kv_heads")
1037
+ if n_head_kv is None:
1038
+ n_head_kv = self.hparams.get("n_head_kv", 1) # old name
1039
+
1040
+ self.gguf_writer.add_name("Falcon")
1041
+ self.gguf_writer.add_context_length(2048) # not in config.json
1042
+ self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
1043
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
1044
+ self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
1045
+ self.gguf_writer.add_block_count(block_count)
1046
+ self.gguf_writer.add_head_count(n_head)
1047
+ self.gguf_writer.add_head_count_kv(n_head_kv)
1048
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
1049
+ self.gguf_writer.add_file_type(self.ftype)
1050
+
1051
+ def write_tensors(self):
1052
+ block_count = self.hparams.get("num_hidden_layers")
1053
+ if block_count is None:
1054
+ block_count = self.hparams["n_layer"] # old name
1055
+
1056
+ n_head = self.hparams.get("num_attention_heads")
1057
+ if n_head is None:
1058
+ n_head = self.hparams["n_head"] # old name
1059
+
1060
+ n_head_kv = self.hparams.get("num_kv_heads")
1061
+ if n_head_kv is None:
1062
+ n_head_kv = self.hparams.get("n_head_kv", 1) # old name
1063
+
1064
+ head_dim = self.hparams["hidden_size"] // n_head
1065
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1066
+
1067
+ for name, data_torch in self.get_tensors():
1068
+ old_dtype = data_torch.dtype
1069
+
1070
+ # convert any unsupported data types to float32
1071
+ if data_torch.dtype not in (torch.float16, torch.float32):
1072
+ data_torch = data_torch.to(torch.float32)
1073
+
1074
+ # QKV tensor transform
1075
+ # The original query_key_value tensor contains n_head_kv "kv groups",
1076
+ # each consisting of n_head/n_head_kv query weights followed by one key
1077
+ # and one value weight (shared by all query heads in the kv group).
1078
+ # This layout makes it a big pain to work with in GGML.
1079
+ # So we rearrange them here, so that we have n_head query weights
1080
+ # followed by n_head_kv key weights followed by n_head_kv value weights,
1081
+ # in contiguous fashion.
1082
+ # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
1083
+
1084
+ if "query_key_value" in name:
1085
+ qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
1086
+ q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
1087
+ k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
1088
+ v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
1089
+ data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
1090
+
1091
+ data = data_torch.squeeze().numpy()
1092
+
1093
+ # map tensor names
1094
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1095
+ if new_name is None:
1096
+ raise ValueError(f"Can not map tensor {name!r}")
1097
+
1098
+ n_dims = len(data.shape)
1099
+ data_dtype = data.dtype
1100
+
1101
+ # if f32 desired, convert any float16 to float32
1102
+ if self.ftype == 0 and data_dtype == np.float16:
1103
+ data = data.astype(np.float32)
1104
+
1105
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
1106
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1107
+ data = data.astype(np.float32)
1108
+
1109
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1110
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1111
+ data = data.astype(np.float16)
1112
+
1113
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1114
+
1115
+ self.gguf_writer.add_tensor(new_name, data)
1116
+
1117
+
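The Falcon QKV rearrangement described in the comments above can be sanity-checked on toy dimensions; the following is an illustrative sketch only, not part of the package:

# Toy check of the Falcon query_key_value rearrangement performed above (not part of the package).
import torch

n_head, n_head_kv, head_dim = 4, 2, 3
hidden = head_dim * n_head
fused = torch.arange((n_head + 2 * n_head_kv) * head_dim * hidden, dtype=torch.float32)
fused = fused.reshape((n_head + 2 * n_head_kv) * head_dim, hidden)   # original HF layout

qkv = fused.view(n_head_kv, n_head // n_head_kv + 2, head_dim, hidden)
q = qkv[:, :-2].reshape(n_head * head_dim, hidden)        # all query heads first
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, hidden)    # then the shared keys
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, hidden)    # then the shared values
rearranged = torch.cat((q, k, v)).reshape_as(fused)       # contiguous Q | K | V
assert rearranged.shape == fused.shape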
1118
+ @Model.register("GPTBigCodeForCausalLM")
1119
+ class StarCoderModel(Model):
1120
+ model_arch = gguf.MODEL_ARCH.STARCODER
1121
+
1122
+ def set_gguf_parameters(self):
1123
+ block_count = self.hparams["n_layer"]
1124
+
1125
+ self.gguf_writer.add_name("StarCoder")
1126
+ self.gguf_writer.add_context_length(self.hparams["n_positions"])
1127
+ self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
1128
+ self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
1129
+ self.gguf_writer.add_block_count(block_count)
1130
+ self.gguf_writer.add_head_count(self.hparams["n_head"])
1131
+ self.gguf_writer.add_head_count_kv(1)
1132
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
1133
+ self.gguf_writer.add_file_type(self.ftype)
1134
+
1135
+
1136
+ @Model.register("GPTRefactForCausalLM")
1137
+ class RefactModel(Model):
1138
+ model_arch = gguf.MODEL_ARCH.REFACT
1139
+
1140
+ def set_gguf_parameters(self):
1141
+ hidden_dim = self.hparams["n_embd"]
1142
+ inner_dim = 4 * hidden_dim
1143
+ hidden_dim = int(2 * inner_dim / 3)
1144
+ multiple_of = 256
1145
+ ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
1146
+
1147
+ block_count = self.hparams["n_layer"]
1148
+
1149
+ self.gguf_writer.add_name("Refact")
1150
+ # refact uses Alibi. So this is from config.json which might be used by training.
1151
+ self.gguf_writer.add_context_length(self.hparams["n_positions"])
1152
+ self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
1153
+
1154
+ self.gguf_writer.add_feed_forward_length(ff_dim)
1155
+ self.gguf_writer.add_block_count(block_count)
1156
+ self.gguf_writer.add_head_count(self.hparams["n_head"])
1157
+ self.gguf_writer.add_head_count_kv(1)
1158
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
1159
+ self.gguf_writer.add_file_type(self.ftype)
1160
+
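As a worked example of the ff_dim calculation above (illustrative only, with a hypothetical n_embd of 2048):

# Worked example of the Refact feed-forward sizing above (illustrative only).
n_embd = 2048                                    # hypothetical embedding size
inner_dim = 4 * n_embd                           # 8192
hidden_dim = int(2 * inner_dim / 3)              # 5461
multiple_of = 256
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
assert ff_dim == 5632                            # rounded up to a multiple of 256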
1161
+ def write_tensors(self):
1162
+ hidden_dim = self.hparams["n_embd"]
1163
+ inner_dim = 4 * hidden_dim
1164
+ hidden_dim = int(2 * inner_dim / 3)
1165
+ multiple_of = 256
1166
+ ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
1167
+ n_head = self.hparams["n_head"]
1168
+ n_head_kv = 1
1169
+ head_dim = self.hparams["n_embd"] // n_head
1170
+ block_count = self.hparams["n_layer"]
1171
+
1172
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1173
+
1174
+ tensors = dict(self.get_tensors())
1175
+ for i in range(block_count):
1176
+ if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
1177
+ tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
1178
+ tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
1179
+ del tensors[f"transformer.h.{i}.attn.kv.weight"]
1180
+ if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
1181
+ tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
1182
+ del tensors[f"transformer.h.{i}.attn.q.weight"]
1183
+ if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
1184
+ tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
1185
+ tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
1186
+ del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
1187
+
1188
+ for name, data_torch in tensors.items():
1189
+ old_dtype = data_torch.dtype
1190
+
1191
+ # convert any unsupported data types to float32
1192
+ if data_torch.dtype not in (torch.float16, torch.float32):
1193
+ data_torch = data_torch.to(torch.float32)
1194
+
1195
+ data = data_torch.squeeze().numpy()
1196
+
1197
+ # map tensor names
1198
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
1199
+ if new_name is None:
1200
+ raise ValueError(f"Can not map tensor {name!r}")
1201
+
1202
+ n_dims = len(data.shape)
1203
+ data_dtype = data.dtype
1204
+
1205
+ # if f32 desired, convert any float16 to float32
1206
+ if self.ftype == 0 and data_dtype == np.float16:
1207
+ data = data.astype(np.float32)
1208
+
1209
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
1210
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1211
+ data = data.astype(np.float32)
1212
+
1213
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1214
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1215
+ data = data.astype(np.float16)
1216
+
1217
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1218
+
1219
+ self.gguf_writer.add_tensor(new_name, data)
1220
+
1221
+
1222
+ @Model.register("PersimmonForCausalLM")
1223
+ class PersimmonModel(Model):
1224
+ model_arch = gguf.MODEL_ARCH.PERSIMMON
1225
+
1226
+ def set_gguf_parameters(self):
1227
+ block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
1228
+ head_count = self.hparams["num_attention_heads"]
1229
+ head_count_kv = head_count
1230
+ hidden_size = self.hparams["hidden_size"]
1231
+
1232
+ self.gguf_writer.add_name('persimmon-8b-chat')
1233
+ self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1234
+ self.gguf_writer.add_embedding_length(hidden_size)
1235
+ self.gguf_writer.add_block_count(block_count)
1236
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
1237
+
1238
+ # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
1239
+ # than the head size?
1240
+ # ref: https://github.com/ggerganov/llama.cpp/pull/4889
1241
+ # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
1242
+ self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
1243
+
1244
+ self.gguf_writer.add_head_count(head_count)
1245
+ self.gguf_writer.add_head_count_kv(head_count_kv)
1246
+ self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
1247
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
1248
+
1249
+ def set_vocab(self):
1250
+ self._set_vocab_sentencepiece()
1251
+ # self.gguf_writer.add_bos_token_id(71013)
1252
+ # self.gguf_writer.add_eos_token_id(71013)
1253
+
1254
+ def write_tensors(self):
1255
+ block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
1256
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1257
+
1258
+ for name, data_torch in self.get_tensors():
1259
+ if name.endswith(".self_attention.rotary_emb.inv_freq"):
1260
+ continue
1261
+ old_dtype = data_torch.dtype
1262
+ # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
1263
+ data = data_torch.to(torch.float32).squeeze().numpy()
1264
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1265
+ if new_name is None:
1266
+ raise ValueError(f"Can not map tensor {name!r}")
1267
+ n_dims = len(data.shape)
1268
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1269
+ self.gguf_writer.add_tensor(new_name, data)
1270
+
1271
+
1272
+ @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
1273
+ class StableLMModel(Model):
1274
+ model_arch = gguf.MODEL_ARCH.STABLELM
1275
+
1276
+ def set_vocab(self):
1277
+ if (self.dir_model / "tokenizer.json").is_file():
1278
+ self._set_vocab_gpt2()
1279
+ else:
1280
+ # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
1281
+ self._set_vocab_qwen()
1282
+
1283
+ def set_gguf_parameters(self):
1284
+ hparams = self.hparams
1285
+ block_count = hparams["num_hidden_layers"]
1286
+
1287
+ self.gguf_writer.add_name(self.dir_model.name)
1288
+ self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
1289
+ self.gguf_writer.add_embedding_length(hparams["hidden_size"])
1290
+ self.gguf_writer.add_block_count(block_count)
1291
+ self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
1292
+ rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
1293
+ self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
1294
+ self.gguf_writer.add_head_count(hparams["num_attention_heads"])
1295
+ self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
1296
+ self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
1297
+ self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
1298
+
1299
+ def write_tensors(self):
1300
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1301
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1302
+ n_head = self.hparams.get("num_attention_heads")
1303
+ n_kv_head = self.hparams.get("num_key_value_heads")
1304
+ q_norms = dict()
1305
+ k_norms = dict()
1306
+ for name, data_torch in self.get_tensors():
1307
+ # we don't need these
1308
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
1309
+ continue
1310
+
1311
+ old_dtype = data_torch.dtype
1312
+
1313
+ # convert any unsupported data types to float32
1314
+ if data_torch.dtype not in (torch.float16, torch.float32):
1315
+ data_torch = data_torch.to(torch.float32)
1316
+
1317
+ data = data_torch.squeeze().numpy()
1318
+ n_dims = len(data.shape)
1319
+ if name.find("q_layernorm.norms") != -1:
1320
+ q_norms[name] = data
1321
+ if len(q_norms) >= (block_count * n_head):
1322
+ self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
1323
+ continue
1324
+ if name.find("k_layernorm.norms") != -1:
1325
+ k_norms[name] = data
1326
+ if len(k_norms) >= (block_count * n_kv_head):
1327
+ self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
1328
+ continue
1329
+
1330
+ # map tensor names
1331
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1332
+ if new_name is None:
1333
+ raise ValueError(f"Can not map tensor {name!r}")
1334
+
1335
+ n_dims = len(data.shape)
1336
+ data_dtype = data.dtype
1337
+
1338
+ # if f32 desired, convert any float16 to float32
1339
+ if self.ftype == 0 and data_dtype == np.float16:
1340
+ data = data.astype(np.float32)
1341
+
1342
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
1343
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
1344
+ data = data.astype(np.float32)
1345
+
1346
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1347
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
1348
+ data = data.astype(np.float16)
1349
+
1350
+ logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1351
+
1352
+ self.gguf_writer.add_tensor(new_name, data)
1353
+
1354
+ def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
1355
+ for bid in range(block_count):
1356
+ datas = []
1357
+ for xid in range(n_head):
1358
+ ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
1359
+ datas.append(norms[ename])
1360
+ del norms[ename]
1361
+ data = np.stack(datas, axis=0)
1362
+ data_dtype = data.dtype
1363
+ merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
1364
+ new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
1365
+ if new_name is None:
1366
+ raise ValueError(f"Can not map tensor {name!r}")
1367
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
1368
+ data = data.astype(np.float32)
1369
+
1370
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1371
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
1372
+ data = data.astype(np.float16)
1373
+
1374
+ logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
1375
+
1376
+ self.gguf_writer.add_tensor(new_name, data)
1377
+
1378
+
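The _stack_qk_norm helper above gathers the per-head q_layernorm / k_layernorm weight vectors and stacks them into one matrix per layer before writing. A toy version of that stacking step (head count and dimension are made up):

import numpy as np

n_head, head_dim = 3, 4  # made-up sizes
norms = {
    f"model.layers.0.self_attn.q_layernorm.norms.{i}.weight": np.full(head_dim, float(i), dtype=np.float32)
    for i in range(n_head)
}

# same np.stack call the helper uses: one row per attention head
stacked = np.stack(
    [norms[f"model.layers.0.self_attn.q_layernorm.norms.{i}.weight"] for i in range(n_head)],
    axis=0,
)
assert stacked.shape == (n_head, head_dim)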
1379
+ @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
1380
+ class LlamaModel(Model):
1381
+ model_arch = gguf.MODEL_ARCH.LLAMA
1382
+
1383
+ def set_vocab(self):
1384
+ try:
1385
+ self._set_vocab_sentencepiece()
1386
+ except FileNotFoundError:
1387
+ try:
1388
+ self._set_vocab_llama_hf()
1389
+ except (FileNotFoundError, TypeError):
1390
+ # Llama 3
1391
+ self._set_vocab_gpt2()
1392
+
1393
+ # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
1394
+ if self.hparams.get("vocab_size", 32000) == 32016:
1395
+ special_vocab = gguf.SpecialVocab(
1396
+ self.dir_model, load_merges=False,
1397
+ special_token_types = ['prefix', 'suffix', 'middle', 'eot']
1398
+ )
1399
+ special_vocab._set_special_token("prefix", 32007)
1400
+ special_vocab._set_special_token("suffix", 32008)
1401
+ special_vocab._set_special_token("middle", 32009)
1402
+ special_vocab._set_special_token("eot", 32010)
1403
+ special_vocab.add_to_gguf(self.gguf_writer)
1404
+
1405
+ def set_gguf_parameters(self):
1406
+ super().set_gguf_parameters()
1407
+ hparams = self.hparams
1408
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
1409
+ self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
1410
+
1411
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
1412
+ if self.hparams["rope_scaling"].get("type") == "linear":
1413
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1414
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
1415
+
1416
+ # Same as super class, but permuting q_proj, k_proj
1417
+ def write_tensors(self):
1418
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1419
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1420
+ n_head = self.hparams.get("num_attention_heads")
1421
+ n_kv_head = self.hparams.get("num_key_value_heads")
1422
+ n_experts = self.hparams.get("num_local_experts")
1423
+ experts = dict()
1424
+ for name, data_torch in self.get_tensors():
1425
+ # we don't need these
1426
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
1427
+ continue
1428
+
1429
+ old_dtype = data_torch.dtype
1430
+
1431
+ # convert any unsupported data types to float32
1432
+ if data_torch.dtype not in (torch.float16, torch.float32):
1433
+ data_torch = data_torch.to(torch.float32)
1434
+
1435
+ data = data_torch.numpy()
1436
+
1437
+ if name.endswith("q_proj.weight"):
1438
+ data = permute(data, n_head, n_head)
1439
+ if name.endswith("k_proj.weight"):
1440
+ data = permute(data, n_head, n_kv_head)
1441
+
1442
+ data = data.squeeze()
1443
+
1444
+ # process the experts separately
1445
+ if name.find("block_sparse_moe.experts") != -1:
1446
+ experts[name] = data
1447
+ if len(experts) >= n_experts:
1448
+ # merge the experts into a single 3d tensor
1449
+ for bid in range(block_count):
1450
+ for wid in range(1, 4):
1451
+ full = True
1452
+ for xid in range(n_experts):
1453
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
1454
+ if ename not in experts:
1455
+ full = False
1456
+ break
1457
+ if not full:
1458
+ continue
1459
+
1460
+ datas = []
1461
+ for xid in range(n_experts):
1462
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
1463
+ datas.append(experts[ename])
1464
+ del experts[ename]
1465
+
1466
+ data = np.stack(datas, axis=0)
1467
+ data_dtype = data.dtype
1468
+
1469
+ if self.ftype == 0 and data_dtype == np.float16:
1470
+ data = data.astype(np.float32)
1471
+
1472
+ if self.ftype == 1 and data_dtype == np.float32:
1473
+ data = data.astype(np.float16)
1474
+
1475
+ merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"
1476
+
1477
+ new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
1478
+ if new_name is None:
1479
+ raise ValueError(f"Can not map tensor {name!r}")
1480
+
1481
+ logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
1482
+
1483
+ self.gguf_writer.add_tensor(new_name, data)
1484
+ continue
1485
+
1486
+ # map tensor names
1487
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1488
+ if new_name is None:
1489
+ raise ValueError(f"Can not map tensor {name!r}")
1490
+
1491
+ n_dims = len(data.shape)
1492
+ data_dtype = data.dtype
1493
+
1494
+ # if f32 desired, convert any float16 to float32
1495
+ if self.ftype == 0 and data_dtype == np.float16:
1496
+ data = data.astype(np.float32)
1497
+
1498
+ # 1d tensors need to be converted to float32
1499
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1500
+ data = data.astype(np.float32)
1501
+
1502
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1503
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1504
+ data = data.astype(np.float16)
1505
+
1506
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1507
+
1508
+ self.gguf_writer.add_tensor(new_name, data)
1509
+
1510
+ if len(experts) > 0:
1511
+ raise ValueError(f"Unprocessed experts: {experts.keys()}")
1512
+
1513
+
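For the Mixtral case, the branch above buffers every expert's w1/w2/w3 matrix and, once all n_experts copies of a given slot are present, stacks them into a single 3-D tensor under the merged layers.{bid}.feed_forward.experts.w{wid}.weight name. A toy illustration of that merge (expert count and matrix shape are invented):

import numpy as np

n_experts, n_ff, n_embd = 2, 8, 4  # invented sizes
experts = {
    f"model.layers.0.block_sparse_moe.experts.{x}.w1.weight": np.zeros((n_ff, n_embd), dtype=np.float32)
    for x in range(n_experts)
}

merged = np.stack(
    [experts[f"model.layers.0.block_sparse_moe.experts.{x}.w1.weight"] for x in range(n_experts)],
    axis=0,
)
assert merged.shape == (n_experts, n_ff, n_embd)  # n_experts 2-D weights become one 3-D tensor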
1514
+ @Model.register("GrokForCausalLM")
1515
+ class GrokModel(Model):
1516
+ model_arch = gguf.MODEL_ARCH.GROK
1517
+
1518
+ def set_vocab(self):
1519
+ self._set_vocab_sentencepiece()
1520
+
1521
+ def __init__(self, *args, **kwargs):
1522
+ super().__init__(*args, **kwargs)
1523
+
1524
+ def set_gguf_parameters(self):
1525
+ super().set_gguf_parameters()
1526
+ self.gguf_writer.add_name("Grok")
1527
+
1528
+ def write_tensors(self):
1529
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1530
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1531
+ n_experts = self.hparams.get("num_local_experts")
1532
+ experts = dict()
1533
+ for name, data_torch in self.get_tensors():
1534
+ # we don't need these
1535
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
1536
+ continue
1537
+
1538
+ old_dtype = data_torch.dtype
1539
+
1540
+ # convert any unsupported data types to float32
1541
+ if data_torch.dtype not in (torch.float16, torch.float32):
1542
+ data_torch = data_torch.to(torch.float32)
1543
+
1544
+ data = data_torch.squeeze().numpy()
1545
+
1546
+ # process the experts separately
1547
+ if name.find(".moe.") != -1:
1548
+ experts[name] = data
1549
+ if len(experts) >= n_experts:
1550
+ # merge the experts into a single 3d tensor
1551
+ for bid in range(block_count):
1552
+ for wid in ["linear", "linear_1", "linear_v"]:
1553
+ full = True
1554
+ for xid in range(n_experts):
1555
+ ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
1556
+ if ename not in experts:
1557
+ full = False
1558
+ break
1559
+ if not full:
1560
+ continue
1561
+
1562
+ datas = []
1563
+ for xid in range(n_experts):
1564
+ ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
1565
+ datas.append(experts[ename])
1566
+ del experts[ename]
1567
+
1568
+ data = np.stack(datas, axis=0)
1569
+ data_dtype = data.dtype
1570
+
1571
+ if self.ftype == 0 and data_dtype == np.float16:
1572
+ data = data.astype(np.float32)
1573
+
1574
+ if self.ftype == 1 and data_dtype == np.float32:
1575
+ data = data.astype(np.float16)
1576
+
1577
+ merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
1578
+
1579
+ new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
1580
+ if new_name is None:
1581
+ raise ValueError(f"Can not map tensor {name!r}")
1582
+
1583
+ logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
1584
+
1585
+ self.gguf_writer.add_tensor(new_name, data)
1586
+ continue
1587
+
1588
+ # map tensor names
1589
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1590
+ if new_name is None:
1591
+ raise ValueError(f"Can not map tensor {name!r}")
1592
+
1593
+ n_dims = len(data.shape)
1594
+ data_dtype = data.dtype
1595
+
1596
+ # if f32 desired, convert any float16 to float32
1597
+ if self.ftype == 0 and data_dtype == np.float16:
1598
+ data = data.astype(np.float32)
1599
+
1600
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
1601
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1602
+ data = data.astype(np.float32)
1603
+
1604
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1605
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1606
+ data = data.astype(np.float16)
1607
+
1608
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1609
+
1610
+ self.gguf_writer.add_tensor(new_name, data)
1611
+
1612
+
1613
+ @Model.register("DbrxForCausalLM")
1614
+ class DbrxModel(Model):
1615
+ model_arch = gguf.MODEL_ARCH.DBRX
1616
+
1617
+ def set_gguf_parameters(self):
1618
+ ffn_config = self.hparams["ffn_config"]
1619
+ attn_config = self.hparams["attn_config"]
1620
+ self.gguf_writer.add_name(self.hparams["model_type"])
1621
+ self.gguf_writer.add_block_count(self.hparams["n_layers"])
1622
+
1623
+ self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
1624
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
1625
+ self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
1626
+
1627
+ self.gguf_writer.add_head_count(self.hparams["n_heads"])
1628
+ self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
1629
+
1630
+ self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
1631
+
1632
+ self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
1633
+ self.gguf_writer.add_file_type(self.ftype)
1634
+
1635
+ self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
1636
+ self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
1637
+
1638
+ self.gguf_writer.add_layer_norm_eps(1e-5)
1639
+
1640
+ self.gguf_writer.add_file_type(self.ftype)
1641
+ logger.info(f"gguf: file type = {self.ftype}")
1642
+
1643
+ def write_tensors(self):
1644
+ block_count = self.hparams.get("n_layers")
1645
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1646
+ for name, data_torch in self.get_tensors():
1647
+ n_expert = self.hparams["ffn_config"]["moe_num_experts"]
1648
+ n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
1649
+ n_embd = self.hparams["d_model"]
1650
+
1651
+ # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
1652
+ # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
1653
+ # But llama.cpp moe graph works differently
1654
+ # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
1655
+ # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
1656
+ exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
1657
+ "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
1658
+ "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
1659
+ experts = False
1660
+ for exp_tensor_name in exp_tensor_names.keys():
1661
+ if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
1662
+ experts = True
1663
+ data_torch = data_torch.view(n_expert, n_ff, n_embd)
1664
+ if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
1665
+ data_torch = data_torch.permute(*permute_tensor)
1666
+ break
1667
+
1668
+ old_dtype = data_torch.dtype
1669
+
1670
+ # convert any unsupported data types to float32
1671
+ if data_torch.dtype not in (torch.float16, torch.float32):
1672
+ data_torch = data_torch.to(torch.float32)
1673
+
1674
+ data = data_torch.squeeze().numpy()
1675
+
1676
+ # map tensor names
1677
+ # In MoE models the ffn tensors are typically most of the model weights,
1678
+ # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
1679
+ # Every other model has the weight names ending in .weight,
1680
+ # let's assume that is the convention, which is not the case for dbrx:
1681
+ # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
1682
+ new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
1683
+ if new_name is None:
1684
+ raise ValueError(f"Can not map tensor {name!r}")
1685
+
1686
+ n_dims = len(data.shape)
1687
+ data_dtype = data.dtype
1688
+
1689
+ # Most of the codebase that takes in 1D tensors only handles F32 tensors
1690
+ # and most of the output tensors are F32.
1691
+ if data_dtype != np.float32 and n_dims == 1:
1692
+ raise ValueError(f"Can not map tensor {name!r}: all 1D tensors must be F32")
1693
+
1694
+ # if f32 desired, convert any float16 to float32
1695
+ if self.ftype == 0 and data_dtype == np.float16:
1696
+ data = data.astype(np.float32)
1697
+
1698
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1699
+ if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
1700
+ data = data.astype(np.float16)
1701
+
1702
+ logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
1703
+
1704
+ self.gguf_writer.add_tensor(new_name, data)
1705
+
1706
+
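As the comment in write_tensors above points out, ggml lists dimensions in the reverse order of PyTorch, so a PyTorch view of (n_expert, n_ff, n_embd) shows up as {n_embd, n_ff, n_expert} on the ggml side, and only the w2 (down) experts need the extra permute. A shape-only sketch of those two operations, with invented sizes:

import torch

n_expert, n_ff, n_embd = 2, 6, 4  # invented sizes
flat = torch.randn(n_expert * n_ff * n_embd)  # dbrx ships the expert weights without a .weight-style 2-D shape

w1 = flat.view(n_expert, n_ff, n_embd)                   # gate/up experts: view only, no permute
w2 = flat.view(n_expert, n_ff, n_embd).permute(0, 2, 1)  # down experts: swap the last two axes

assert w1.shape == (n_expert, n_ff, n_embd)
assert w2.shape == (n_expert, n_embd, n_ff)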
1707
+ @Model.register("DbrxForCausalLM")
1708
+ class DbrxModel(Model):
1709
+ model_arch = gguf.MODEL_ARCH.DBRX
1710
+
1711
+ def set_gguf_parameters(self):
1712
+ ffn_config = self.hparams["ffn_config"]
1713
+ attn_config = self.hparams["attn_config"]
1714
+ self.gguf_writer.add_name(self.hparams["model_type"])
1715
+ self.gguf_writer.add_block_count(self.hparams["n_layers"])
1716
+
1717
+ self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
1718
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
1719
+ self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
1720
+
1721
+ self.gguf_writer.add_head_count(self.hparams["n_heads"])
1722
+ self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
1723
+
1724
+ self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
1725
+
1726
+ self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
1727
+ self.gguf_writer.add_file_type(self.ftype)
1728
+
1729
+ self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
1730
+ self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
1731
+
1732
+ self.gguf_writer.add_layer_norm_eps(1e-5)
1733
+
1734
+ self.gguf_writer.add_file_type(self.ftype)
1735
+ print(f"gguf: file type = {self.ftype}")
1736
+
1737
+ def write_tensors(self):
1738
+ block_count = self.hparams.get("n_layers")
1739
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1740
+ for name, data_torch in self.get_tensors():
1741
+ n_expert = self.hparams["ffn_config"]["moe_num_experts"]
1742
+ n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
1743
+ n_embd = self.hparams["d_model"]
1744
+
1745
+ # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
1746
+ # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
1747
+ # But llama.cpp moe graph works differently
1748
+ # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
1749
+ # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
1750
+ exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
1751
+ "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
1752
+ "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
1753
+ experts = False
1754
+ for exp_tensor_name in exp_tensor_names.keys():
1755
+ if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
1756
+ experts = True
1757
+ data_torch = data_torch.view(n_expert, n_ff, n_embd)
1758
+ if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
1759
+ data_torch = data_torch.permute(*permute_tensor)
1760
+ break
1761
+
1762
+ old_dtype = data_torch.dtype
1763
+
1764
+ # convert any unsupported data types to float32
1765
+ if data_torch.dtype not in (torch.float16, torch.float32):
1766
+ data_torch = data_torch.to(torch.float32)
1767
+
1768
+ data = data_torch.squeeze().numpy()
1769
+
1770
+ # map tensor names
1771
+ # In MoE models the ffn tensors are typically most of the model weights,
1772
+ # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
1773
+ # Every other model has the weight names ending in .weight,
1774
+ # let's assume that is the convention, which is not the case for dbrx:
1775
+ # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
1776
+ new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
1777
+ if new_name is None:
1778
+ print(f"Can not map tensor {name!r}")
1779
+ sys.exit()
1780
+
1781
+ n_dims = len(data.shape)
1782
+ data_dtype = data.dtype
1783
+
1784
+ # Most of the codebase that takes in 1D tensors only handles F32 tensors
1785
+ # and most of the output tensors are F32.
1786
+ if data_dtype != np.float32 and n_dims == 1:
1787
+ print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
1788
+ sys.exit()
1789
+
1790
+ # if f32 desired, convert any float16 to float32
1791
+ if self.ftype == 0 and data_dtype == np.float16:
1792
+ data = data.astype(np.float32)
1793
+
1794
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1795
+ if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
1796
+ data = data.astype(np.float16)
1797
+
1798
+ print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
1799
+
1800
+ self.gguf_writer.add_tensor(new_name, data)
1801
+
1802
+
1803
+ @Model.register("MiniCPMForCausalLM")
1804
+ class MiniCPMModel(Model):
1805
+ model_arch = gguf.MODEL_ARCH.MINICPM
1806
+
1807
+ def set_gguf_parameters(self):
1808
+ block_count = self.hparams["num_hidden_layers"]
1809
+ self.gguf_writer.add_name("MiniCPM")
1810
+ self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1811
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
1812
+ self.gguf_writer.add_block_count(block_count)
1813
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
1814
+ self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
1815
+ self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
1816
+ self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
1817
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
1818
+ self.gguf_writer.add_file_type(self.ftype)
1819
+
1820
+ def set_vocab(self):
1821
+ self._set_vocab_llama_hf()
1822
+
1823
+ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
1824
+ if n_kv_head is not None and n_head != n_kv_head:
1825
+ n_head //= n_kv_head
1826
+
1827
+ return (
1828
+ weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
1829
+ .swapaxes(1, 2)
1830
+ .reshape(weights.shape)
1831
+ )
1832
+
1833
+ def write_tensors(self):
1834
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1835
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1836
+ n_head = self.hparams.get("num_attention_heads")
1837
+ n_kv_head = self.hparams.get("num_key_value_heads")
1838
+ for name, data_torch in self.get_tensors():
1839
+ # we don't need these
1840
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
1841
+ continue
1842
+
1843
+ old_dtype = data_torch.dtype
1844
+
1845
+ # convert any unsupported data types to float32
1846
+ if data_torch.dtype not in (torch.float16, torch.float32):
1847
+ data_torch = data_torch.to(torch.float32)
1848
+
1849
+ # HF models permute some of the tensors, so we need to undo that
1850
+ if name.endswith(("q_proj.weight")):
1851
+ data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
1852
+ if name.endswith(("k_proj.weight")):
1853
+ data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
1854
+
1855
+ data = data_torch.squeeze().numpy()
1856
+
1857
+ # map tensor names
1858
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1859
+ if new_name is None:
1860
+ raise ValueError(f"Can not map tensor {name!r}")
1861
+
1862
+ n_dims = len(data.shape)
1863
+ data_dtype = data.dtype
1864
+
1865
+ # if f32 desired, convert any float16 to float32
1866
+ if self.ftype == 0 and data_dtype == np.float16:
1867
+ data = data.astype(np.float32)
1868
+
1869
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
1870
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1871
+ data = data.astype(np.float32)
1872
+
1873
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1874
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1875
+ data = data.astype(np.float16)
1876
+
1877
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1878
+
1879
+ self.gguf_writer.add_tensor(new_name, data)
1880
+
1881
+
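MiniCPM's _reverse_hf_permute above undoes the head interleaving that Hugging Face checkpoints apply to q_proj / k_proj for their rotary implementation. The forward permutation is not part of this hunk, so the sketch below reconstructs it from the reshape pattern purely to demonstrate the round trip with toy sizes; treat hf_permute here as an assumption rather than the converter's actual helper:

import torch

n_head, head_dim, n_embd = 4, 8, 32  # toy sizes; rows = n_head * head_dim
w = torch.arange(n_head * head_dim * n_embd, dtype=torch.float32).reshape(n_head * head_dim, n_embd)

def hf_permute(x, n_head):
    # assumed forward interleaving, mirroring the reverse transform below
    return x.reshape(n_head, x.shape[0] // n_head // 2, 2, *x.shape[1:]).swapaxes(1, 2).reshape(x.shape)

def reverse_hf_permute(x, n_head):
    # same reshape / swapaxes sequence as MiniCPMModel._reverse_hf_permute
    return x.reshape(n_head, 2, x.shape[0] // n_head // 2, *x.shape[1:]).swapaxes(1, 2).reshape(x.shape)

assert torch.equal(reverse_hf_permute(hf_permute(w, n_head), n_head), w)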
1882
+ @Model.register("QWenLMHeadModel")
1883
+ class QwenModel(Model):
1884
+ model_arch = gguf.MODEL_ARCH.QWEN
1885
+
1886
+ @staticmethod
1887
+ def token_bytes_to_string(b):
1888
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
1889
+ byte_encoder = bytes_to_unicode()
1890
+ return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
1891
+
1892
+ @staticmethod
1893
+ def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
1894
+ parts = [bytes([b]) for b in token]
1895
+ while True:
1896
+ min_idx = None
1897
+ min_rank = None
1898
+ for i, pair in enumerate(zip(parts[:-1], parts[1:])):
1899
+ rank = mergeable_ranks.get(pair[0] + pair[1])
1900
+ if rank is not None and (min_rank is None or rank < min_rank):
1901
+ min_idx = i
1902
+ min_rank = rank
1903
+ if min_rank is None or (max_rank is not None and min_rank >= max_rank):
1904
+ break
1905
+ assert min_idx is not None
1906
+ parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
1907
+ return parts
1908
+
1909
+ def set_vocab(self):
1910
+ self._set_vocab_qwen()
1911
+
1912
+ def set_gguf_parameters(self):
1913
+ self.gguf_writer.add_name("Qwen")
1914
+ self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1915
+ self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
1916
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
1917
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
1918
+ self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
1919
+ self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
1920
+ self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
1921
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
1922
+
1923
+ def write_tensors(self):
1924
+ block_count = self.hparams["num_hidden_layers"]
1925
+ model_kv = dict(self.get_tensors())
1926
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1927
+ for name, data_torch in model_kv.items():
1928
+ # we don't need these
1929
+ if name.endswith(".rotary_emb.inv_freq"):
1930
+ continue
1931
+
1932
+ old_dtype = data_torch.dtype
1933
+
1934
+ # convert any unsupported data types to float32
1935
+ if data_torch.dtype not in (torch.float16, torch.float32):
1936
+ data_torch = data_torch.to(torch.float32)
1937
+
1938
+ data = data_torch.squeeze().numpy()
1939
+
1940
+ # map tensor names
1941
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1942
+ if new_name is None:
1943
+ raise ValueError(f"Can not map tensor {name!r}")
1944
+
1945
+ n_dims = len(data.shape)
1946
+ data_dtype = data.dtype
1947
+
1948
+ # if f32 desired, convert any float16 to float32
1949
+ if self.ftype == 0 and data_dtype == np.float16:
1950
+ data = data.astype(np.float32)
1951
+
1952
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
1953
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1954
+ data = data.astype(np.float32)
1955
+
1956
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
1957
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1958
+ data = data.astype(np.float16)
1959
+
1960
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1961
+ self.gguf_writer.add_tensor(new_name, data)
1962
+
1963
+
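QwenModel.bpe above re-implements byte-pair merging directly from a tiktoken-style mergeable_ranks table: at each step it merges the adjacent pair with the lowest rank until no mergeable pair remains. A toy call, assuming the QwenModel class above is in scope (the ranks dict is invented purely to show the merge order):

# lower rank merges first: b"a"+b"b" (rank 0), then b"ab"+b"c" (rank 1); b"d" never finds a partner
toy_ranks = {b"ab": 0, b"abc": 1}
assert QwenModel.bpe(toy_ranks, b"abcd") == [b"abc", b"d"]

Passing max_rank stops merging once the best available rank reaches that threshold, so a caller can recover the pieces a token was built from rather than the fully merged token.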
1964
+ @Model.register("Qwen2ForCausalLM")
1965
+ class Qwen2Model(Model):
1966
+ model_arch = gguf.MODEL_ARCH.QWEN2
1967
+
1968
+ def set_vocab(self):
1969
+ try:
1970
+ self._set_vocab_sentencepiece()
1971
+ except FileNotFoundError:
1972
+ self._set_vocab_gpt2()
1973
+
1974
+
1975
+ @Model.register("Qwen2MoeForCausalLM")
1976
+ class Qwen2MoeModel(Model):
1977
+ model_arch = gguf.MODEL_ARCH.QWEN2MOE
1978
+
1979
+ def set_gguf_parameters(self):
1980
+ super().set_gguf_parameters()
1981
+ if (n_experts := self.hparams.get("num_experts")) is not None:
1982
+ self.gguf_writer.add_expert_count(n_experts)
1983
+
1984
+ def write_tensors(self):
1985
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1986
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1987
+ n_experts = self.hparams.get("num_experts")
1988
+ experts = dict()
1989
+ for name, data_torch in self.get_tensors():
1990
+ # we don't need these
1991
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
1992
+ continue
1993
+
1994
+ old_dtype = data_torch.dtype
1995
+
1996
+ # convert any unsupported data types to float32
1997
+ if data_torch.dtype not in (torch.float16, torch.float32):
1998
+ data_torch = data_torch.to(torch.float32)
1999
+
2000
+ data = data_torch.squeeze().numpy()
2001
+
2002
+ # process the experts separately
2003
+ if name.find("experts") != -1:
2004
+ experts[name] = data
2005
+ if len(experts) >= n_experts * 3:
2006
+ # merge the experts into a single 3d tensor
2007
+ for bid in range(block_count):
2008
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
2009
+ full = True
2010
+ for xid in range(n_experts):
2011
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2012
+ if ename not in experts:
2013
+ full = False
2014
+ break
2015
+ if not full:
2016
+ continue
2017
+
2018
+ datas = []
2019
+ for xid in range(n_experts):
2020
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2021
+ datas.append(experts[ename])
2022
+ del experts[ename]
2023
+
2024
+ data = np.stack(datas, axis=0)
2025
+ data_dtype = data.dtype
2026
+
2027
+ if self.ftype == 0 and data_dtype == np.float16:
2028
+ data = data.astype(np.float32)
2029
+
2030
+ if self.ftype == 1 and data_dtype == np.float32:
2031
+ data = data.astype(np.float16)
2032
+
2033
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
2034
+
2035
+ new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
2036
+ if new_name is None:
2037
+ raise ValueError(f"Can not map tensor {name!r}")
2038
+
2039
+ logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
2040
+
2041
+ self.gguf_writer.add_tensor(new_name, data)
2042
+ continue
2043
+
2044
+ # map tensor names
2045
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2046
+ if new_name is None:
2047
+ raise ValueError(f"Can not map tensor {name!r}")
2048
+
2049
+ n_dims = len(data.shape)
2050
+ data_dtype = data.dtype
2051
+
2052
+ # if f32 desired, convert any float16 to float32
2053
+ if self.ftype == 0 and data_dtype == np.float16:
2054
+ data = data.astype(np.float32)
2055
+
2056
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
2057
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
2058
+ data = data.astype(np.float32)
2059
+
2060
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
2061
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2062
+ data = data.astype(np.float16)
2063
+
2064
+ logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
2065
+
2066
+ self.gguf_writer.add_tensor(new_name, data)
2067
+
2068
+ if len(experts) > 0:
2069
+ raise ValueError(f"Unprocessed experts: {experts.keys()}")
2070
+
2071
+
2072
+ @Model.register("Qwen2MoeForCausalLM")
2073
+ class Qwen2MoeModel(Model):
2074
+ model_arch = gguf.MODEL_ARCH.QWEN2MOE
2075
+
2076
+ def set_gguf_parameters(self):
2077
+ super().set_gguf_parameters()
2078
+ if (n_experts := self.hparams.get("num_experts")) is not None:
2079
+ self.gguf_writer.add_expert_count(n_experts)
2080
+
2081
+ def write_tensors(self):
2082
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
2083
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2084
+ n_experts = self.hparams.get("num_experts")
2085
+ experts = dict()
2086
+ for name, data_torch in self.get_tensors():
2087
+ # we don't need these
2088
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
2089
+ continue
2090
+
2091
+ old_dtype = data_torch.dtype
2092
+
2093
+ # convert any unsupported data types to float32
2094
+ if data_torch.dtype not in (torch.float16, torch.float32):
2095
+ data_torch = data_torch.to(torch.float32)
2096
+
2097
+ data = data_torch.squeeze().numpy()
2098
+
2099
+ # process the experts separately
2100
+ if name.find("experts") != -1:
2101
+ experts[name] = data
2102
+ if len(experts) >= n_experts * 3:
2103
+ # merge the experts into a single 3d tensor
2104
+ for bid in range(block_count):
2105
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
2106
+ full = True
2107
+ for xid in range(n_experts):
2108
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2109
+ if ename not in experts:
2110
+ full = False
2111
+ break
2112
+ if not full:
2113
+ continue
2114
+
2115
+ datas = []
2116
+ for xid in range(n_experts):
2117
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2118
+ datas.append(experts[ename])
2119
+ del experts[ename]
2120
+
2121
+ data = np.stack(datas, axis=0)
2122
+ data_dtype = data.dtype
2123
+
2124
+ if self.ftype == 0 and data_dtype == np.float16:
2125
+ data = data.astype(np.float32)
2126
+
2127
+ if self.ftype == 1 and data_dtype == np.float32:
2128
+ data = data.astype(np.float16)
2129
+
2130
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
2131
+
2132
+ new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
2133
+ if new_name is None:
2134
+ print(f"Can not map tensor {name!r}")
2135
+ sys.exit()
2136
+
2137
+ print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
2138
+
2139
+ self.gguf_writer.add_tensor(new_name, data)
2140
+ continue
2141
+
2142
+ # map tensor names
2143
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2144
+ if new_name is None:
2145
+ print(f"Can not map tensor {name!r}")
2146
+ sys.exit()
2147
+
2148
+ n_dims = len(data.shape)
2149
+ data_dtype = data.dtype
2150
+
2151
+ # if f32 desired, convert any float16 to float32
2152
+ if self.ftype == 0 and data_dtype == np.float16:
2153
+ data = data.astype(np.float32)
2154
+
2155
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
2156
+ if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
2157
+ data = data.astype(np.float32)
2158
+
2159
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
2160
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2161
+ data = data.astype(np.float16)
2162
+
2163
+ print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
2164
+
2165
+ self.gguf_writer.add_tensor(new_name, data)
2166
+
2167
+ if len(experts) > 0:
2168
+ raise ValueError(f"Unprocessed experts: {experts.keys()}")
2169
+
2170
+
2171
+ @Model.register("GPT2LMHeadModel")
2172
+ class GPT2Model(Model):
2173
+ model_arch = gguf.MODEL_ARCH.GPT2
2174
+
2175
+ def set_gguf_parameters(self):
2176
+ self.gguf_writer.add_name(self.dir_model.name)
2177
+ self.gguf_writer.add_block_count(self.hparams["n_layer"])
2178
+ self.gguf_writer.add_context_length(self.hparams["n_ctx"])
2179
+ self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
2180
+ self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
2181
+ self.gguf_writer.add_head_count(self.hparams["n_head"])
2182
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
2183
+ self.gguf_writer.add_file_type(self.ftype)
2184
+
2185
+ def write_tensors(self):
2186
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
2187
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2188
+
2189
+ for name, data_torch in self.get_tensors():
2190
+ # we don't need these
2191
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
2192
+ continue
2193
+
2194
+ if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
2195
+ data_torch = data_torch.transpose(1, 0)
2196
+
2197
+ old_dtype = data_torch.dtype
2198
+
2199
+ # convert any unsupported data types to float32
2200
+ if data_torch.dtype not in (torch.float16, torch.float32):
2201
+ data_torch = data_torch.to(torch.float32)
2202
+
2203
+ data = data_torch.squeeze().numpy()
2204
+
2205
+ # map tensor names
2206
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2207
+ if new_name is None:
2208
+ raise ValueError(f"Can not map tensor {name!r}")
2209
+
2210
+ n_dims = len(data.shape)
2211
+ data_dtype = data.dtype
2212
+
2213
+ # if f32 desired, convert any float16 to float32
2214
+ if self.ftype == 0 and data_dtype == np.float16:
2215
+ data = data.astype(np.float32)
2216
+
2217
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
2218
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
2219
+ data = data.astype(np.float32)
2220
+
2221
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
2222
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2223
+ data = data.astype(np.float16)
2224
+
2225
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2226
+
2227
+ self.gguf_writer.add_tensor(new_name, data)
2228
+
2229
+ # note: GPT2 output is tied to (same as) wte in original model
2230
+ if new_name == "token_embd.weight":
2231
+ logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2232
+ self.gguf_writer.add_tensor("output.weight", data)
2233
+
2234
+
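GPT-2 checkpoints store c_attn, c_proj and c_fc as Conv1D-style (in_features, out_features) matrices, so the loop above transposes them into the row/column order the rest of the converter assumes; the tied-embedding note at the end simply writes the token embedding a second time as output.weight. A one-line sketch of the transpose (the 768 here is only an illustrative n_embd):

import torch

c_attn = torch.randn(768, 3 * 768)      # Conv1D layout: (n_embd, 3 * n_embd)
linear_style = c_attn.transpose(1, 0)   # (3 * n_embd, n_embd), matching the nn.Linear convention
assert linear_style.shape == (3 * 768, 768)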
2235
+ @Model.register("PhiForCausalLM")
2236
+ class Phi2Model(Model):
2237
+ model_arch = gguf.MODEL_ARCH.PHI2
2238
+
2239
+ def set_gguf_parameters(self):
2240
+ block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
2241
+
2242
+ rot_pct = self.find_hparam(["partial_rotary_factor"])
2243
+ n_embd = self.find_hparam(["hidden_size", "n_embd"])
2244
+ n_head = self.find_hparam(["num_attention_heads", "n_head"])
2245
+
2246
+ self.gguf_writer.add_name("Phi2")
2247
+ self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
2248
+
2249
+ self.gguf_writer.add_embedding_length(n_embd)
2250
+ self.gguf_writer.add_feed_forward_length(4 * n_embd)
2251
+ self.gguf_writer.add_block_count(block_count)
2252
+ self.gguf_writer.add_head_count(n_head)
2253
+ self.gguf_writer.add_head_count_kv(n_head)
2254
+ self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
2255
+ self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
2256
+ self.gguf_writer.add_file_type(self.ftype)
2257
+ self.gguf_writer.add_add_bos_token(False)
2258
+
2259
+
2260
+ @Model.register("Phi3ForCausalLM")
2261
+ class Phi3MiniModel(Model):
2262
+ model_arch = gguf.MODEL_ARCH.PHI3
2263
+
2264
+ def set_vocab(self):
2265
+ from sentencepiece import SentencePieceProcessor
2266
+
2267
+ tokenizer_path = self.dir_model / 'tokenizer.model'
2268
+
2269
+ if not tokenizer_path.is_file():
2270
+ raise ValueError(f'Error: Missing {tokenizer_path}')
2271
+
2272
+ tokenizer = SentencePieceProcessor(str(tokenizer_path))
2273
+
2274
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
2275
+
2276
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
2277
+ scores: list[float] = [-10000.0] * vocab_size
2278
+ toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
2279
+
2280
+ for token_id in range(tokenizer.vocab_size()):
2281
+
2282
+ piece = tokenizer.id_to_piece(token_id)
2283
+ text = piece.encode("utf-8")
2284
+ score = tokenizer.get_score(token_id)
2285
+
2286
+ toktype = SentencePieceTokenTypes.NORMAL
2287
+ if tokenizer.is_unknown(token_id):
2288
+ toktype = SentencePieceTokenTypes.UNKNOWN
2289
+ elif tokenizer.is_control(token_id):
2290
+ toktype = SentencePieceTokenTypes.CONTROL
2291
+ elif tokenizer.is_unused(token_id):
2292
+ toktype = SentencePieceTokenTypes.UNUSED
2293
+ elif tokenizer.is_byte(token_id):
2294
+ toktype = SentencePieceTokenTypes.BYTE
2295
+
2296
+ tokens[token_id] = text
2297
+ scores[token_id] = score
2298
+ toktypes[token_id] = toktype
2299
+
2300
+ added_tokens_file = self.dir_model / 'added_tokens.json'
2301
+ if added_tokens_file.is_file():
2302
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
2303
+ added_tokens_json = json.load(f)
2304
+
2305
+ for key in added_tokens_json:
2306
+ token_id = added_tokens_json[key]
2307
+ if (token_id >= vocab_size):
2308
+ logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
2309
+ continue
2310
+
2311
+ tokens[token_id] = key.encode("utf-8")
2312
+ scores[token_id] = -1000.0
2313
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2314
+
2315
+ self.gguf_writer.add_tokenizer_model("llama")
2316
+ self.gguf_writer.add_tokenizer_pre("default")
2317
+ self.gguf_writer.add_token_list(tokens)
2318
+ self.gguf_writer.add_token_scores(scores)
2319
+ self.gguf_writer.add_token_types(toktypes)
2320
+
2321
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2322
+ special_vocab.add_to_gguf(self.gguf_writer)
2323
+
2324
+ def set_gguf_parameters(self):
2325
+ block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
2326
+
2327
+ rot_pct = 1.0
2328
+ n_embd = self.find_hparam(["hidden_size", "n_embd"])
2329
+ n_head = self.find_hparam(["num_attention_heads", "n_head"])
2330
+ rms_eps = self.find_hparam(["rms_norm_eps"])
2331
+
2332
+ self.gguf_writer.add_name("Phi3")
2333
+ self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
2334
+
2335
+ self.gguf_writer.add_embedding_length(n_embd)
2336
+ self.gguf_writer.add_feed_forward_length(8192)
2337
+ self.gguf_writer.add_block_count(block_count)
2338
+ self.gguf_writer.add_head_count(n_head)
2339
+ self.gguf_writer.add_head_count_kv(n_head)
2340
+ self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
2341
+ self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
2342
+ self.gguf_writer.add_file_type(self.ftype)
2343
+
2344
+
2345
+ @Model.register("PlamoForCausalLM")
2346
+ class PlamoModel(Model):
2347
+ model_arch = gguf.MODEL_ARCH.PLAMO
2348
+
2349
+ def set_vocab(self):
2350
+ self._set_vocab_sentencepiece()
2351
+
2352
+ def set_gguf_parameters(self):
2353
+ hparams = self.hparams
2354
+ block_count = hparams["num_hidden_layers"]
2355
+
2356
+ self.gguf_writer.add_name("PLaMo")
2357
+ self.gguf_writer.add_context_length(4096) # not in config.json
2358
+ self.gguf_writer.add_embedding_length(hparams["hidden_size"])
2359
+ self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
2360
+ self.gguf_writer.add_block_count(block_count)
2361
+ self.gguf_writer.add_head_count(hparams["num_attention_heads"])
2362
+ self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
2363
+ self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
2364
+
2365
+ def shuffle_attn_q_weight(self, data_torch):
2366
+ assert data_torch.size() == (5120, 5120)
2367
+ data_torch = data_torch.reshape(8, 5, 128, 5120)
2368
+ data_torch = torch.permute(data_torch, (1, 0, 2, 3))
2369
+ data_torch = torch.reshape(data_torch, (5120, 5120))
2370
+ return data_torch
2371
+
2372
+ def shuffle_attn_output_weight(self, data_torch):
2373
+ assert data_torch.size() == (5120, 5120)
2374
+ data_torch = data_torch.reshape(5120, 8, 5, 128)
2375
+ data_torch = torch.permute(data_torch, (0, 2, 1, 3))
2376
+ data_torch = torch.reshape(data_torch, (5120, 5120))
2377
+ return data_torch
2378
+
2379
+ def write_tensors(self):
2380
+ block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
2381
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2382
+
2383
+ for name, data_torch in self.get_tensors():
2384
+ if "self_attn.rotary_emb.inv_freq" in name:
2385
+ continue
2386
+
2387
+ # map tensor names
2388
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2389
+ if new_name is None:
2390
+ raise ValueError(f"Can not map tensor {name!r}")
2391
+
2392
+ # shuffle for broadcasting of gqa in ggml_mul_mat
2393
+ if new_name.endswith("attn_q.weight"):
2394
+ data_torch = self.shuffle_attn_q_weight(data_torch)
2395
+ elif new_name.endswith("attn_output.weight"):
2396
+ data_torch = self.shuffle_attn_output_weight(data_torch)
2397
+
2398
+ old_dtype = data_torch.dtype
2399
+
2400
+ # convert any unsupported data types to float32
2401
+ if data_torch.dtype not in (torch.float16, torch.float32):
2402
+ data_torch = data_torch.to(torch.float32)
2403
+
2404
+ data = data_torch.squeeze().numpy()
2405
+
2406
+ n_dims = len(data.shape)
2407
+ data_dtype = data.dtype
2408
+
2409
+ # if f32 desired, convert any float16 to float32
2410
+ if self.ftype == 0 and data_dtype == np.float16:
2411
+ data = data.astype(np.float32)
2412
+
2413
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
2414
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
2415
+ data = data.astype(np.float32)
2416
+
2417
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
2418
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2419
+ data = data.astype(np.float16)
2420
+
2421
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2422
+
2423
+ self.gguf_writer.add_tensor(new_name, data)
2424
+
2425
+
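PLaMo's shuffle helpers above reorder the rows of attn_q.weight and attn_output.weight so that grouped-query attention broadcasts cleanly in ggml_mul_mat: the 5120 rows are viewed as an 8 x 5 x 128 block layout and the first two axes are swapped (reading this as 5 KV groups of 8 query heads x 128 dims matches the hard-coded head_count_kv of 5, but take that interpretation as hedged). A scaled-down, shape-only sketch of the query-side shuffle and its inverse:

import torch

# scaled-down stand-in for the real (5120, 5120) = (8*5*128, 5120) weight
w = torch.randn(8 * 5 * 4, 16)
shuffled = w.reshape(8, 5, 4, 16).permute(1, 0, 2, 3).reshape(8 * 5 * 4, 16)

# the shuffle only reorders rows; swapping the axes back recovers the original weight
restored = shuffled.reshape(5, 8, 4, 16).permute(1, 0, 2, 3).reshape(8 * 5 * 4, 16)
assert torch.equal(restored, w)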
2426
+ @Model.register("CodeShellForCausalLM")
2427
+ class CodeShellModel(Model):
2428
+ model_arch = gguf.MODEL_ARCH.CODESHELL
2429
+
2430
+ def set_gguf_parameters(self):
2431
+ block_count = self.hparams["n_layer"]
2432
+
2433
+ self.gguf_writer.add_name("CodeShell")
2434
+ self.gguf_writer.add_context_length(self.hparams["n_positions"])
2435
+ self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
2436
+ self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
2437
+ self.gguf_writer.add_block_count(block_count)
2438
+ self.gguf_writer.add_head_count(self.hparams["n_head"])
2439
+ self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
2440
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
2441
+ self.gguf_writer.add_file_type(self.ftype)
2442
+ self.gguf_writer.add_rope_freq_base(10000.0)
2443
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2444
+ self.gguf_writer.add_rope_scaling_factor(1.0)
2445
+
2446
+ def write_tensors(self):
2447
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
2448
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2449
+ tensors = dict(self.get_tensors())
2450
+ has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
2451
+ for name, data_torch in tensors.items():
2452
+ # we don't need these
2453
+ if name.endswith((".attn.rotary_emb.inv_freq")):
2454
+ continue
2455
+
2456
+ old_dtype = data_torch.dtype
2457
+
2458
+ # convert any unsupported data types to float32
2459
+ if data_torch.dtype not in (torch.float16, torch.float32):
2460
+ data_torch = data_torch.to(torch.float32)
2461
+
2462
+ data = data_torch.squeeze().numpy()
2463
+
2464
+ # map tensor names
2465
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2466
+ if new_name is None:
2467
+ raise ValueError(f"Can not map tensor {name!r}")
2468
+
2469
+ n_dims = len(data.shape)
2470
+ data_dtype = data.dtype
2471
+
2472
+ # if f32 desired, convert any float16 to float32
2473
+ if self.ftype == 0 and data_dtype == np.float16:
2474
+ data = data.astype(np.float32)
2475
+
2476
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
2477
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
2478
+ data = data.astype(np.float32)
2479
+
2480
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
2481
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2482
+ data = data.astype(np.float16)
2483
+
2484
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2485
+
2486
+ self.gguf_writer.add_tensor(new_name, data)
2487
+
2488
+ if not has_lm_head and name == "transformer.wte.weight":
2489
+ self.gguf_writer.add_tensor("output.weight", data)
2490
+ logger.info(f"{name} => output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
2491
+
2492
+
2493
+ @Model.register("InternLM2ForCausalLM")
2494
+ class InternLM2Model(Model):
2495
+ model_arch = gguf.MODEL_ARCH.INTERNLM2
2496
+
2497
+ def set_vocab(self):
2498
+ # (TODO): Is there a better way?
2499
+ # Copied from _set_vocab_sentencepiece; the only difference is that we will treat the character
2500
+ # \x00 specially and convert it into an emoji character to prevent it from being mistakenly
2501
+ # recognized as an empty string in C++.
2502
+ from sentencepiece import SentencePieceProcessor
2503
+ from sentencepiece import sentencepiece_model_pb2 as model
2504
+
2505
+ tokenizer_path = self.dir_model / 'tokenizer.model'
2506
+
2507
+ tokens: list[bytes] = []
2508
+ scores: list[float] = []
2509
+ toktypes: list[int] = []
2510
+
2511
+ if not tokenizer_path.is_file():
2512
+ logger.error(f'Error: Missing {tokenizer_path}')
2513
+ sys.exit(1)
2514
+
2515
+ sentencepiece_model = model.ModelProto()
2516
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
2517
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
2518
+
2519
+ tokenizer = SentencePieceProcessor(str(tokenizer_path))
2520
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
2521
+
2522
+ for token_id in range(vocab_size):
2523
+ piece = tokenizer.id_to_piece(token_id)
2524
+ text = piece.encode("utf-8")
2525
+ score = tokenizer.get_score(token_id)
2526
+ if text == b"\x00":
2527
+ # (TODO): fixme
2528
+ # Hack here and replace the \x00 characters.
2529
+ logger.debug(f"InternLM2 convert token '{text}' to '🐉'!")
2530
+ text = "🐉"
2531
+
2532
+ toktype = SentencePieceTokenTypes.NORMAL
2533
+ if tokenizer.is_unknown(token_id):
2534
+ toktype = SentencePieceTokenTypes.UNKNOWN
2535
+ elif tokenizer.is_control(token_id):
2536
+ toktype = SentencePieceTokenTypes.CONTROL
2537
+ elif tokenizer.is_unused(token_id):
2538
+ toktype = SentencePieceTokenTypes.UNUSED
2539
+ elif tokenizer.is_byte(token_id):
2540
+ toktype = SentencePieceTokenTypes.BYTE
2541
+
2542
+ tokens.append(text)
2543
+ scores.append(score)
2544
+ toktypes.append(toktype)
2545
+
2546
+ added_tokens_file = self.dir_model / 'added_tokens.json'
2547
+ if added_tokens_file.is_file():
2548
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
2549
+ added_tokens_json = json.load(f)
2550
+
2551
+ for key in added_tokens_json:
2552
+ tokens.append(key.encode("utf-8"))
2553
+ scores.append(-1000.0)
2554
+ toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
2555
+
2556
+ self.gguf_writer.add_tokenizer_model("llama")
2557
+ self.gguf_writer.add_tokenizer_pre("default")
2558
+ self.gguf_writer.add_token_list(tokens)
2559
+ self.gguf_writer.add_token_scores(scores)
2560
+ self.gguf_writer.add_token_types(toktypes)
2561
+ self.gguf_writer.add_add_space_prefix(add_prefix)
2562
+
2563
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2564
+ old_eos = special_vocab.special_token_ids["eos"]
2565
+ if "chat" in os.path.basename(self.dir_model.absolute()):
2566
+ # For the chat model, we replace the eos with '<|im_end|>'.
2567
+ # TODO: this is a hack, should be fixed
2568
+ # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
2569
+ special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
2570
+ logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
2571
+ in chat mode so that the conversation can end normally.")
2572
+
2573
+ special_vocab.add_to_gguf(self.gguf_writer)
2574
+
2575
+ def _try_get_sft_eos(self, tokenizer):
2576
+ unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]')
2577
+ im_end_list = tokenizer.encode('<|im_end|>')
2578
+ assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
2579
+ if len(unused_145_list) == 1:
2580
+ eos_token = unused_145_list[0]
2581
+ if len(im_end_list) == 1:
2582
+ eos_token = im_end_list[0]
2583
+ return eos_token
2584
+
2585
+ def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
2586
+ if n_head_kv is not None and n_head != n_head_kv:
2587
+ n_head = n_head_kv
2588
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
2589
+ .swapaxes(1, 2)
2590
+ .reshape(weights.shape))
2591
+
2592
+ def set_gguf_parameters(self):
2593
+ self.gguf_writer.add_name("InternLM2")
2594
+ self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
2595
+ self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
2596
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
2597
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
2598
+ self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
2599
+ self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
2600
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
2601
+ self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
2602
+
2603
+ def post_write_tensors(self, tensor_map, name, data_torch):
2604
+ old_dtype = data_torch.dtype
2605
+
2606
+ # convert any unsupported data types to float32
2607
+ if data_torch.dtype not in (torch.float16, torch.float32):
2608
+ data_torch = data_torch.to(torch.float32)
2609
+
2610
+ data = data_torch.squeeze().numpy()
2611
+
2612
+ # map tensor names
2613
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2614
+ if new_name is None:
2615
+ raise ValueError(f"Can not map tensor {name!r}")
2616
+
2617
+ n_dims = len(data.shape)
2618
+ data_dtype = data.dtype
2619
+
2620
+ # if f32 desired, convert any float16 to float32
2621
+ if self.ftype == 0 and data_dtype == np.float16:
2622
+ data = data.astype(np.float32)
2623
+
2624
+ # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
2625
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
2626
+ data = data.astype(np.float32)
2627
+
2628
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
2629
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2630
+ data = data.astype(np.float16)
2631
+
2632
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2633
+ self.gguf_writer.add_tensor(new_name, data)
2634
+
2635
+ def write_tensors(self):
2636
+ from einops import rearrange
2637
+
2638
+ num_heads = self.hparams.get("num_attention_heads")
2639
+ num_kv_heads = self.hparams.get("num_key_value_heads")
2640
+ hidden_size = self.hparams.get("hidden_size")
2641
+ q_per_kv = num_heads // num_kv_heads
2642
+ head_dim = hidden_size // num_heads
2643
+ num_groups = num_heads // q_per_kv
2644
+
2645
+ block_count = self.hparams["num_hidden_layers"]
2646
+ model_kv = dict(self.get_tensors())
2647
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2648
+ qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
2649
+ for name, data_torch in model_kv.items():
2650
+ # we don't need these
2651
+ if name.endswith(".rotary_emb.inv_freq"):
2652
+ continue
2653
+
2654
+ if re.match(qkv_pattern, name):
2655
+ bid = re.findall(qkv_pattern, name)[0]
2656
+ qkv = data_torch
2657
+ qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
2658
+ q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
2659
+ # The model weights of q and k require an additional reshape.
2660
+ q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
2661
+ k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
2662
+ v = rearrange(v, " o g n i -> o (g n i)").T
2663
+ self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
2664
+ self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
2665
+ self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
2666
+ else:
2667
+ self.post_write_tensors(tensor_map, name, data_torch)
2668
+
2669
+
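The fused wqkv split above is the least obvious part of the InternLM2 converter, so here is a minimal standalone sketch of the same reshape on made-up dimensions (all sizes are illustrative, and it omits the _hf_permute_qk step that the real converter additionally applies to q and k):

import torch
from einops import rearrange

num_heads, num_kv_heads, head_dim = 8, 2, 2
hidden_size = num_heads * head_dim               # 16
q_per_kv = num_heads // num_kv_heads             # 4 query heads share each kv head
num_groups = num_heads // q_per_kv               # equals num_kv_heads

# fused projection: each group holds q_per_kv query heads plus one k and one v head
wqkv = torch.randn(num_groups * (q_per_kv + 2) * head_dim, hidden_size)

qkv = rearrange(wqkv.T, "o (g n i) -> o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
q = qkv[..., :q_per_kv, :]                       # (hidden, groups, q_per_kv, head_dim)
k = qkv[..., q_per_kv:q_per_kv + 1, :]           # one k head per group
v = qkv[..., q_per_kv + 1:q_per_kv + 2, :]       # one v head per group

wq = rearrange(q, "o g n i -> o (g n i)").T      # (num_heads * head_dim, hidden)
wk = rearrange(k, "o g n i -> o (g n i)").T      # (num_kv_heads * head_dim, hidden)
wv = rearrange(v, "o g n i -> o (g n i)").T
print(wq.shape, wk.shape, wv.shape)              # torch.Size([16, 16]) torch.Size([4, 16]) torch.Size([4, 16])
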
2670
+ @Model.register("BertModel", "CamembertModel")
2671
+ class BertModel(Model):
2672
+ model_arch = gguf.MODEL_ARCH.BERT
2673
+
2674
+ def __init__(self, *args, **kwargs):
2675
+ super().__init__(*args, **kwargs)
2676
+ self.vocab_size = None
2677
+
2678
+ def set_gguf_parameters(self):
2679
+ super().set_gguf_parameters()
2680
+ self.gguf_writer.add_causal_attention(False)
2681
+
2682
+ # get pooling path
2683
+ pooling_path = None
2684
+ module_path = self.dir_model / "modules.json"
2685
+ if module_path.is_file():
2686
+ with open(module_path, encoding="utf-8") as f:
2687
+ modules = json.load(f)
2688
+ for mod in modules:
2689
+ if mod["type"] == "sentence_transformers.models.Pooling":
2690
+ pooling_path = mod["path"]
2691
+ break
2692
+
2693
+ # get pooling type
2694
+ if pooling_path is not None:
2695
+ with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
2696
+ pooling = json.load(f)
2697
+ if pooling["pooling_mode_mean_tokens"]:
2698
+ pooling_type = gguf.PoolingType.MEAN
2699
+ elif pooling["pooling_mode_cls_token"]:
2700
+ pooling_type = gguf.PoolingType.CLS
2701
+ else:
2702
+ raise NotImplementedError("Only MEAN and CLS pooling types supported")
2703
+ self.gguf_writer.add_pooling_type(pooling_type)
2704
+
2705
+ def set_vocab(self):
2706
+ tokens, toktypes, tokpre = self.get_vocab_base()
2707
+ self.vocab_size = len(tokens)
2708
+
2709
+ # we need this to validate the size of the token_type embeddings
2710
+ # though currently we are passing all zeros to the token_type embeddings
2711
+ self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
2712
+
2713
+ # convert to phantom space vocab
2714
+ def phantom(tok):
2715
+ if tok.startswith("[") and tok.endswith("]"):
2716
+ return tok
2717
+ if tok.startswith("##"):
2718
+ return tok[2:]
2719
+ return "\u2581" + tok
2720
+ tokens = list(map(phantom, tokens))
2721
+
2722
+ # add vocab to gguf
2723
+ self.gguf_writer.add_tokenizer_model("bert")
2724
+ self.gguf_writer.add_tokenizer_pre(tokpre)
2725
+ self.gguf_writer.add_token_list(tokens)
2726
+ self.gguf_writer.add_token_types(toktypes)
2727
+
2728
+ # handle special tokens
2729
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2730
+ special_vocab.add_to_gguf(self.gguf_writer)
2731
+
2732
+ def write_tensors(self):
2733
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
2734
+ tensors = dict(self.get_tensors())
2735
+ for name, data_torch in tensors.items():
2736
+ # we are only using BERT for embeddings so we don't need the pooling layer
2737
+ if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
2738
+ continue # we don't need these
2739
+
2740
+ # map tensor names
2741
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2742
+ if new_name is None:
2743
+ raise ValueError(f"Can not map tensor {name!r}")
2744
+
2745
+ # convert any unsupported data types to float32
2746
+ if data_torch.dtype not in (torch.float16, torch.float32):
2747
+ data_torch = data_torch.to(torch.float32)
2748
+
2749
+ data = data_torch.squeeze().numpy()
2750
+ n_dims = len(data.shape)
2751
+ new_dtype: type[np.floating[Any]]
2752
+
2753
+ if (
2754
+ self.ftype == 1 and name.endswith(".weight") and n_dims == 2
2755
+ and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32
2756
+ ):
2757
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
2758
+ new_dtype = np.float16
2759
+ else:
2760
+ # if f32 desired, convert any float16 to float32
2761
+ new_dtype = np.float32
2762
+
2763
+ logger.info(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
2764
+
2765
+ if data.dtype != new_dtype:
2766
+ data = data.astype(new_dtype)
2767
+
2768
+ self.gguf_writer.add_tensor(new_name, data)
2769
+
2770
+
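For reference, here is the "phantom space" mapping from BertModel.set_vocab above applied to a few made-up WordPiece tokens, showing how BERT-style vocab entries become SentencePiece-style metaspace tokens (an informal sketch, not part of the converter):

# Special tokens such as [CLS] pass through unchanged, "##" continuation pieces
# lose their prefix, and word-initial pieces gain a leading U+2581 metaspace.
def phantom(tok: str) -> str:
    if tok.startswith("[") and tok.endswith("]"):
        return tok
    if tok.startswith("##"):
        return tok[2:]
    return "\u2581" + tok

print([phantom(t) for t in ["[CLS]", "hello", "##ing", "world", "[SEP]"]])
# ['[CLS]', '▁hello', 'ing', '▁world', '[SEP]']
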
2771
+ @Model.register("NomicBertModel")
2772
+ class NomicBertModel(BertModel):
2773
+ model_arch = gguf.MODEL_ARCH.NOMIC_BERT
2774
+
2775
+ def __init__(self, *args, **kwargs):
2776
+ super().__init__(*args, **kwargs)
2777
+
2778
+ # the HF config claims n_ctx=8192, but it uses RoPE scaling
2779
+ self.hparams["n_ctx"] = 2048
2780
+
2781
+ # SwiGLU activation
2782
+ assert self.hparams["activation_function"] == "swiglu"
2783
+ # this doesn't do anything in the HF version
2784
+ assert self.hparams["causal"] is False
2785
+ # no bias tensors
2786
+ assert self.hparams["qkv_proj_bias"] is False
2787
+ assert self.hparams["mlp_fc1_bias"] is False
2788
+ assert self.hparams["mlp_fc2_bias"] is False
2789
+ # norm at end of layer
2790
+ assert self.hparams["prenorm"] is False
2791
+ # standard RoPE
2792
+ assert self.hparams["rotary_emb_fraction"] == 1.0
2793
+ assert self.hparams["rotary_emb_interleaved"] is False
2794
+ assert self.hparams["rotary_emb_scale_base"] is None
2795
+
2796
+ def set_gguf_parameters(self):
2797
+ super().set_gguf_parameters()
2798
+ self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
2799
+
2800
+
2801
+ @Model.register("GemmaForCausalLM")
2802
+ class GemmaModel(Model):
2803
+ model_arch = gguf.MODEL_ARCH.GEMMA
2804
+
2805
+ def set_vocab(self):
2806
+ self._set_vocab_sentencepiece()
2807
+
2808
+ # TODO: these special tokens should be exported only for the CodeGemma family
2809
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
2810
+ special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
2811
+ special_vocab._set_special_token("prefix", 67)
2812
+ special_vocab._set_special_token("suffix", 69)
2813
+ special_vocab._set_special_token("middle", 68)
2814
+ special_vocab._set_special_token("fsep", 70)
2815
+ special_vocab._set_special_token("eot", 107)
2816
+ special_vocab.add_to_gguf(self.gguf_writer)
2817
+
2818
+ def set_gguf_parameters(self):
2819
+ hparams = self.hparams
2820
+ block_count = hparams["num_hidden_layers"]
2821
+
2822
+ self.gguf_writer.add_name(self.dir_model.name)
2823
+ self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
2824
+ self.gguf_writer.add_embedding_length(hparams["hidden_size"])
2825
+ self.gguf_writer.add_block_count(block_count)
2826
+ self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
2827
+ self.gguf_writer.add_head_count(hparams["num_attention_heads"])
2828
+ self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
2829
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
2830
+ self.gguf_writer.add_key_length(hparams["head_dim"])
2831
+ self.gguf_writer.add_value_length(hparams["head_dim"])
2832
+ self.gguf_writer.add_file_type(self.ftype)
2833
+
2834
+ def write_tensors(self):
2835
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
2836
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2837
+
2838
+ for name, data_torch in self.get_tensors():
2839
+ # lm_head is not used in llama.cpp, but autoawq will include this tensor in the model.
2840
+ # To prevent errors, skip loading lm_head.weight.
2841
+ if name == "lm_head.weight":
2842
+ logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
2843
+ continue
2844
+
2845
+ old_dtype = data_torch.dtype
2846
+
2847
+ # convert any unsupported data types to float32
2848
+ if data_torch.dtype not in (torch.float16, torch.float32):
2849
+ data_torch = data_torch.to(torch.float32)
2850
+
2851
+ # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
2852
+ if name.endswith("norm.weight"):
2853
+ data_torch = data_torch + 1
2854
+ data = data_torch.squeeze().numpy()
2855
+
2856
+ # map tensor names
2857
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2858
+ if new_name is None:
2859
+ raise ValueError(f"Can not map tensor {name!r}")
2860
+
2861
+ n_dims = len(data.shape)
2862
+ data_dtype = data.dtype
2863
+
2864
+ data = data.astype(np.float32)
2865
+
2866
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
2867
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2868
+ data = data.astype(np.float16)
2869
+
2870
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2871
+
2872
+ self.gguf_writer.add_tensor(new_name, data)
2873
+
2874
+
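The data_torch + 1 adjustment for norm.weight in GemmaModel.write_tensors above exists because the Hugging Face Gemma implementation (see the modeling_gemma.py reference in the code) applies (1 + weight) to the normalized activations, whereas llama.cpp multiplies by the stored weight directly, so the converter bakes the +1 into the exported tensor. A minimal numerical sketch of that equivalence, with made-up values and not part of the converter:

import torch

def rms_norm(x, eps=1e-6):
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)

x = torch.randn(4, 8)
w = torch.randn(8) * 0.1                    # Gemma checkpoint stores the weight as an offset from 1

hf_output = rms_norm(x) * (1.0 + w)         # what the HF reference implementation computes
exported_w = w + 1.0                        # what this converter writes for norm.weight
llama_cpp_output = rms_norm(x) * exported_w
print(torch.allclose(hf_output, llama_cpp_output))  # True
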
2875
+ @Model.register("Starcoder2ForCausalLM")
2876
+ class StarCoder2Model(Model):
2877
+ model_arch = gguf.MODEL_ARCH.STARCODER2
2878
+
2879
+
2880
+ @Model.register("MambaForCausalLM", "MambaLMHeadModel")
2881
+ class MambaModel(Model):
2882
+ model_arch = gguf.MODEL_ARCH.MAMBA
2883
+
2884
+ def set_vocab(self):
2885
+ vocab_size = self.hparams["vocab_size"]
2886
+ # Round the vocab size up to the next multiple of pad_vocab_size_multiple (8 by default)
2887
+ pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
2888
+ # pad using ceiling division
2889
+ # ref: https://stackoverflow.com/a/17511341/22827863
2890
+ vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
2891
+ self.hparams["vocab_size"] = vocab_size
2892
+
2893
+ if (self.dir_model / "tokenizer.json").is_file():
2894
+ self._set_vocab_gpt2()
2895
+ else:
2896
+ # Use the GPT-NeoX tokenizer when no tokenizer files are present
2897
+ tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
2898
+ logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
2899
+ neox_reader = gguf.GGUFReader(tokenizer_path, "r")
2900
+
2901
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
2902
+ self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
2903
+
2904
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
2905
+ self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
2906
+
2907
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
2908
+ self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
2909
+
2910
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
2911
+ self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
2912
+
2913
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
2914
+ self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
2915
+
2916
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
2917
+ self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
2918
+
2919
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
2920
+ self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
2921
+
2922
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
2923
+ self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
2924
+
2925
+ def set_gguf_parameters(self):
2926
+ d_model = self.find_hparam(["hidden_size", "d_model"])
2927
+ d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
2928
+ d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
2929
+ d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
2930
+ # ceiling division
2931
+ # ref: https://stackoverflow.com/a/17511341/22827863
2932
+ # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
2933
+ dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
2934
+ rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
2935
+
2936
+ # Fail early for models which don't have a block expansion factor of 2
2937
+ assert d_inner == 2 * d_model
2938
+
2939
+ self.gguf_writer.add_name(self.dir_model.name)
2940
+ self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
2941
+ self.gguf_writer.add_embedding_length(d_model)
2942
+ self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
2943
+ self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
2944
+ self.gguf_writer.add_block_count(self.hparams["n_layer"])
2945
+ self.gguf_writer.add_ssm_conv_kernel(d_conv)
2946
+ self.gguf_writer.add_ssm_inner_size(d_inner)
2947
+ self.gguf_writer.add_ssm_state_size(d_state)
2948
+ self.gguf_writer.add_ssm_time_step_rank(dt_rank)
2949
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
2950
+ self.gguf_writer.add_file_type(self.ftype)
2951
+
2952
+ def write_tensors(self):
2953
+ block_count = self.hparams["n_layer"]
2954
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2955
+
2956
+ tok_embd = None
2957
+ tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
2958
+ output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
2959
+
2960
+ for name, data_torch in self.get_tensors():
2961
+ old_dtype = data_torch.dtype
2962
+
2963
+ # convert any unsupported data types to float32
2964
+ if data_torch.dtype not in (torch.float16, torch.float32):
2965
+ data_torch = data_torch.to(torch.float32)
2966
+
2967
+ # map tensor names
2968
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2969
+ if new_name is None:
2970
+ raise ValueError(f"Can not map tensor {name!r}")
2971
+
2972
+ if name.endswith(".A_log"):
2973
+ logger.debug("A_log --> A ==> " + new_name)
2974
+ data_torch = -torch.exp(data_torch)
2975
+
2976
+ # assuming token_embd.weight is seen before output.weight
2977
+ if tok_embd is not None and new_name == output_name:
2978
+ if torch.equal(tok_embd, data_torch):
2979
+ logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
2980
+ continue
2981
+ if new_name == tok_embd_name:
2982
+ tok_embd = data_torch
2983
+
2984
+ data = data_torch.squeeze().numpy()
2985
+
2986
+ n_dims = len(data.shape)
2987
+ data_dtype = data.dtype
2988
+
2989
+ # if f32 desired, convert any float16 to float32
2990
+ if self.ftype == 0 and data_dtype == np.float16:
2991
+ data = data.astype(np.float32)
2992
+
2993
+ # TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
2994
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
2995
+ data = data.astype(np.float32)
2996
+
2997
+ # if f16 desired, convert big float32 2-dim weight tensors to float16
2998
+ new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
2999
+ if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
3000
+ data = data.astype(np.float16)
3001
+
3002
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
3003
+
3004
+ self.gguf_writer.add_tensor(new_name, data)
3005
+
3006
+
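Two details in MambaModel above may look cryptic at first: the vocab size is padded up with the -(a // -b) ceiling-division idiom, and A_log tensors are rewritten as A = -exp(A_log) so the GGUF stores A directly rather than its log. A small standalone illustration, with made-up numbers:

import torch

# Ceiling division without math.ceil: -(a // -b) rounds a/b upward.
vocab_size, pad_to = 50277, 8
padded = -(vocab_size // -pad_to) * pad_to
print(padded)                      # 50280, the next multiple of 8

# Mamba checkpoints store A in log space; the converter writes A = -exp(A_log).
a_log = torch.tensor([0.0, 1.0, 2.0])
print(-torch.exp(a_log))           # tensor([-1.0000, -2.7183, -7.3891])
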
3007
+ @Model.register("CohereForCausalLM")
3008
+ class CommandR2Model(Model):
3009
+ model_arch = gguf.MODEL_ARCH.COMMAND_R
3010
+
3011
+ def __init__(self, *args, **kwargs):
3012
+ super().__init__(*args, **kwargs)
3013
+
3014
+ # max_position_embeddings = 8192 in config.json, but the model was actually
3015
+ # trained on a 128k context length
3016
+ self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
3017
+
3018
+ def set_gguf_parameters(self):
3019
+ super().set_gguf_parameters()
3020
+ self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
3021
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
3022
+
3023
+
3024
+ @Model.register("OlmoForCausalLM")
3025
+ @Model.register("OLMoForCausalLM")
3026
+ class OlmoModel(Model):
3027
+ model_arch = gguf.MODEL_ARCH.OLMO
3028
+
3029
+ def set_gguf_parameters(self):
3030
+ super().set_gguf_parameters()
3031
+ self.gguf_writer.add_layer_norm_eps(1e-5)
3032
+ clip_qkv = self.hparams.get("clip_qkv")
3033
+ if clip_qkv is not None:
3034
+ self.gguf_writer.add_clamp_kqv(clip_qkv)
3035
+
3036
+ # Same as the superclass, but permutes q_proj and k_proj
3037
+ # Copied from: LlamaModel
3038
+ def write_tensors(self):
3039
+ block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
3040
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
3041
+ n_head = self.hparams.get("num_attention_heads")
3042
+ n_kv_head = self.hparams.get("num_key_value_heads")
3043
+ for name, data_torch in self.get_tensors():
3044
+ old_dtype = data_torch.dtype
3045
+
3046
+ # convert any unsupported data types to float32
3047
+ if data_torch.dtype not in (torch.float16, torch.float32):
3048
+ data_torch = data_torch.to(torch.float32)
3049
+
3050
+ data = data_torch.numpy()
3051
+
3052
+ if name.endswith("q_proj.weight"):
3053
+ data = permute(data, n_head, n_head)
3054
+ if name.endswith("k_proj.weight"):
3055
+ data = permute(data, n_head, n_kv_head)
3056
+
3057
+ data = data.squeeze()
3058
+
3059
+ # map tensor names
3060
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
3061
+ if new_name is None:
3062
+ raise ValueError(f"Can not map tensor {name!r}")
3063
+
3064
+ n_dims = len(data.shape)
3065
+ data_dtype = data.dtype
3066
+
3067
+ # if f32 desired, convert any float16 to float32
3068
+ if self.ftype == 0 and data_dtype == np.float16:
3069
+ data = data.astype(np.float32)
3070
+
3071
+ # 1d tensors need to be converted to float32
3072
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
3073
+ data = data.astype(np.float32)
3074
+
3075
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
3076
+ if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
3077
+ data = data.astype(np.float16)
3078
+
3079
+ logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
3080
+
3081
+ self.gguf_writer.add_tensor(new_name, data)
3082
+
3083
+
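The same dtype policy appears in nearly every write_tensors method above: unsupported dtypes are first cast to float32, and when f16 output is requested only 2-dimensional weight tensors are stored as float16 while 1-dimensional tensors stay float32. The following condensed sketch is an informal summary, not the converter's actual helper; individual model classes vary slightly (for example, OlmoModel drops the ".weight" suffix check and MambaModel limits f16 to a few tensor names):

import numpy as np
import torch

def choose_numpy_dtype(data_torch, name, ftype):
    # ftype == 0 -> f32 output, ftype == 1 -> f16 output (mirrors --outtype)
    if data_torch.dtype not in (torch.float16, torch.float32):
        data_torch = data_torch.to(torch.float32)   # e.g. bfloat16 checkpoints
    data = data_torch.squeeze().numpy()
    if ftype == 1 and name.endswith(".weight") and data.ndim == 2:
        return np.float16                           # 2-dim weight matrices get halved
    return np.float32                               # 1-dim tensors and f32 output stay float32

print(choose_numpy_dtype(torch.randn(8, 8, dtype=torch.bfloat16), "blk.0.attn_q.weight", 1))
# <class 'numpy.float16'>
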
3084
+ ###### CONVERSION LOGIC ######
3085
+
3086
+
3087
+ def parse_args() -> argparse.Namespace:
3088
+ parser = argparse.ArgumentParser(
3089
+ description="Convert a huggingface model to a GGML compatible file")
3090
+ parser.add_argument(
3091
+ "--vocab-only", action="store_true",
3092
+ help="extract only the vocab",
3093
+ )
3094
+ parser.add_argument(
3095
+ "--awq-path", type=Path, default=None,
3096
+ help="Path to scale awq cache file")
3097
+ parser.add_argument(
3098
+ "--outfile", type=Path,
3099
+ help="path to write to; default: based on input",
3100
+ )
3101
+ parser.add_argument(
3102
+ "--outtype", type=str, choices=["f32", "f16"], default="f16",
3103
+ help="output format - use f32 for float32, f16 for float16",
3104
+ )
3105
+ parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
3106
+ parser.add_argument(
3107
+ "model", type=Path,
3108
+ help="directory containing model file",
3109
+ )
3110
+ parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
3111
+ parser.add_argument("--model-name", type=str, default=None, help="name of the model")
3112
+ parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
3113
+
3114
+ return parser.parse_args()
3115
+
3116
+
3117
+ def main() -> None:
3118
+ args = parse_args()
3119
+
3120
+ logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
3121
+
3122
+ dir_model = args.model
3123
+
3124
+ if args.awq_path:
3125
+ sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
3126
+ from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
3127
+ tmp_model_path = args.model / "weighted_model"
3128
+ dir_model = tmp_model_path
3129
+ if tmp_model_path.is_dir():
3130
+ logger.info(f"{tmp_model_path} exists as a weighted model.")
3131
+ else:
3132
+ tmp_model_path.mkdir(parents=True, exist_ok=True)
3133
+ logger.info("Saving new weighted model ...")
3134
+ add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
3135
+ logger.info(f"Saved weighted model at {tmp_model_path}.")
3136
+
3137
+ if not dir_model.is_dir():
3138
+ logger.error(f'Error: {args.model} is not a directory')
3139
+ sys.exit(1)
3140
+
3141
+ ftype_map = {
3142
+ "f32": gguf.GGMLQuantizationType.F32,
3143
+ "f16": gguf.GGMLQuantizationType.F16,
3144
+ }
3145
+
3146
+ if args.outfile is not None:
3147
+ fname_out = args.outfile
3148
+ else:
3149
+ # output in the same directory as the model by default
3150
+ fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
3151
+
3152
+ logger.info(f"Loading model: {dir_model.name}")
3153
+
3154
+ hparams = Model.load_hparams(dir_model)
3155
+
3156
+ with torch.inference_mode():
3157
+ model_class = Model.from_model_architecture(hparams["architectures"][0])
3158
+ model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
3159
+
3160
+ logger.info("Set model parameters")
3161
+ model_instance.set_gguf_parameters()
3162
+
3163
+ logger.info("Set model tokenizer")
3164
+ model_instance.set_vocab()
3165
+
3166
+ if args.vocab_only:
3167
+ logger.info(f"Exporting model vocab to '{fname_out}'")
3168
+ model_instance.write_vocab()
3169
+ else:
3170
+ logger.info(f"Exporting model to '{fname_out}'")
3171
+ model_instance.write()
3172
+
3173
+ logger.info(f"Model successfully exported to '{fname_out}'")
3174
+
3175
+
3176
+ if __name__ == '__main__':
3177
+ main()
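
For reference, a typical invocation of this script would look something like: python convert-hf-to-gguf.py /path/to/hf-model --outtype f16 --outfile ggml-model-f16.gguf (the paths here are placeholders). Passing --vocab-only exports only the tokenizer metadata, and --verbose switches logging to DEBUG.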