bigdl-core-cpp 2.5.0b20240527__py3-none-manylinux2010_x86_64.whl → 2.5.0b20240529__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. bigdl/cpp/convert-hf-to-gguf.py +1363 -338
  2. bigdl/cpp/convert.py +199 -52
  3. bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
  4. bigdl/cpp/gguf-py/gguf/constants.py +102 -28
  5. bigdl/cpp/gguf-py/gguf/gguf_reader.py +9 -5
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +36 -11
  7. bigdl/cpp/gguf-py/gguf/lazy.py +236 -0
  8. bigdl/cpp/gguf-py/gguf/quants.py +123 -0
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +28 -1
  10. bigdl/cpp/gguf-py/gguf/vocab.py +3 -3
  11. bigdl/cpp/libs/baby-llama +0 -0
  12. bigdl/cpp/libs/batched +0 -0
  13. bigdl/cpp/libs/batched-bench +0 -0
  14. bigdl/cpp/libs/beam-search +0 -0
  15. bigdl/cpp/libs/benchmark +0 -0
  16. bigdl/cpp/libs/convert-llama2c-to-ggml +0 -0
  17. bigdl/cpp/libs/embedding +0 -0
  18. bigdl/cpp/libs/export-lora +0 -0
  19. bigdl/cpp/libs/finetune +0 -0
  20. bigdl/cpp/libs/gguf +0 -0
  21. bigdl/cpp/libs/gritlm +0 -0
  22. bigdl/cpp/libs/imatrix +0 -0
  23. bigdl/cpp/libs/infill +0 -0
  24. bigdl/cpp/libs/llama-bench +0 -0
  25. bigdl/cpp/libs/llava-cli +0 -0
  26. bigdl/cpp/libs/lookahead +0 -0
  27. bigdl/cpp/libs/lookup +0 -0
  28. bigdl/cpp/libs/ls-sycl-device +0 -0
  29. bigdl/cpp/libs/main +0 -0
  30. bigdl/cpp/libs/ollama +0 -0
  31. bigdl/cpp/libs/parallel +0 -0
  32. bigdl/cpp/libs/passkey +0 -0
  33. bigdl/cpp/libs/perplexity +0 -0
  34. bigdl/cpp/libs/q8dot +0 -0
  35. bigdl/cpp/libs/quantize +0 -0
  36. bigdl/cpp/libs/quantize-stats +0 -0
  37. bigdl/cpp/libs/save-load-state +0 -0
  38. bigdl/cpp/libs/server +0 -0
  39. bigdl/cpp/libs/simple +0 -0
  40. bigdl/cpp/libs/speculative +0 -0
  41. bigdl/cpp/libs/tokenize +0 -0
  42. bigdl/cpp/libs/train-text-from-scratch +0 -0
  43. bigdl/cpp/libs/vdot +0 -0
  44. {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/METADATA +1 -1
  45. bigdl_core_cpp-2.5.0b20240529.dist-info/RECORD +55 -0
  46. bigdl_core_cpp-2.5.0b20240527.dist-info/RECORD +0 -53
  47. {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-llama-cpp +0 -0
  48. {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-ollama +0 -0
  49. {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/WHEEL +0 -0
  50. {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/top_level.txt +0 -0
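The partial diff that follows is from bigdl/cpp/convert-hf-to-gguf.py, the first file in the list above. Its main change is replacing each model's monolithic write_tensors() override with small per-tensor modify_tensors() hooks on the Model base class (plus lazy tensor loading and quantization-aware output types). As rough orientation only, a minimal sketch of that override pattern follows; ExampleModel and its registered architecture name are hypothetical, and only names that appear in the diff itself (Model.register, model_arch, modify_tensors, map_tensor_name) are assumed.

# Illustrative sketch, not part of the package
@Model.register("ExampleForCausalLM")          # hypothetical architecture name
class ExampleModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA         # subclasses must set model_arch (checked in __init_subclass__)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # return [] to drop a tensor, or one or more (new_name, tensor) pairs to rename/split/merge it
        if name.endswith(".rotary_emb.inv_freq"):
            return []
        return [(self.map_tensor_name(name), data_torch)]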
@@ -9,12 +9,16 @@ import json
  import os
  import re
  import sys
- from abc import ABC, abstractmethod
  from enum import IntEnum
  from pathlib import Path
  from hashlib import sha256
+ <<<<<<< HEAD
  from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
+ =======
+ from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+ >>>>>>> uupstream/master

+ import math
  import numpy as np
  import torch

@@ -25,7 +29,9 @@ if 'NO_LOCAL_GGUF' not in os.environ:
  sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
  import gguf

- from convert import LlamaHfVocab, permute
+ from convert import LlamaHfVocab
+
+ logger = logging.getLogger("hf-to-gguf")

  logger = logging.getLogger("hf-to-gguf")

@@ -44,29 +50,79 @@ class SentencePieceTokenTypes(IntEnum):
  AnyModel = TypeVar("AnyModel", bound="type[Model]")


- class Model(ABC):
+ class Model:
  _model_classes: dict[str, type[Model]] = {}

+ <<<<<<< HEAD
  def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
+ =======
+ dir_model: Path
+ ftype: int
+ is_big_endian: bool
+ endianess: gguf.GGUFEndian
+ use_temp_file: bool
+ lazy: bool
+ part_names: list[str]
+ is_safetensors: bool
+ hparams: dict[str, Any]
+ block_count: int
+ tensor_map: gguf.TensorNameMap
+ tensor_names: set[str] | None
+ fname_out: Path
+ gguf_writer: gguf.GGUFWriter
+
+ # subclasses should define this!
+ model_arch: gguf.MODEL_ARCH
+
+ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
+ if type(self) is Model:
+ raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
+ >>>>>>> uupstream/master
  self.dir_model = dir_model
  self.ftype = ftype
- self.fname_out = fname_out
  self.is_big_endian = is_big_endian
  self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
  self.use_temp_file = use_temp_file
+ <<<<<<< HEAD
  self.is_safetensors = self._is_model_safetensors()
  self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
  self.part_names = self._get_part_names()
  self.hparams = Model.load_hparams(self.dir_model)
  self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+ =======
+ self.lazy = not eager
+ self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+ self.is_safetensors = len(self.part_names) > 0
+ if not self.is_safetensors:
+ self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+ self.hparams = Model.load_hparams(self.dir_model)
+ >>>>>>> uupstream/master
  self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+ self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+ self.tensor_names = None
+ if self.ftype == gguf.LlamaFileType.GUESSED:
+ # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
+ _, first_tensor = next(self.get_tensors())
+ if first_tensor.dtype == torch.float16:
+ logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
+ self.ftype = gguf.LlamaFileType.MOSTLY_F16
+ else:
+ logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
+ self.ftype = gguf.LlamaFileType.MOSTLY_BF16
+ ftype_up: str = self.ftype.name.partition("_")[2].upper()
+ ftype_lw: str = ftype_up.lower()
+ # allow templating the file name with the output ftype, useful with the "auto" ftype
+ self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
+ self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)

- @property
- @abstractmethod
- def model_arch(self) -> gguf.MODEL_ARCH:
- pass
+ @classmethod
+ def __init_subclass__(cls):
+ # can't use an abstract property, because overriding it without type errors
+ # would require using decorated functions instead of simply defining the property
+ if "model_arch" not in cls.__dict__:
+ raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")

- def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
+ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
  key = next((k for k in keys if k in self.hparams), None)
  if key is not None:
  return self.hparams[key]
@@ -78,6 +134,22 @@ class Model(ABC):
  self._set_vocab_gpt2()

  def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+ tensor_names_from_parts: set[str] = set()
+
+ if len(self.part_names) > 1:
+ self.tensor_names = set()
+ index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+ index_name += ".index.json"
+ logger.info(f"gguf: loading model weight map from '{index_name}'")
+ with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
+ index: dict[str, Any] = json.load(f)
+ weight_map = index.get("weight_map")
+ if weight_map is None or not isinstance(weight_map, dict):
+ raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
+ self.tensor_names.update(weight_map.keys())
+ else:
+ self.tensor_names = tensor_names_from_parts
+
  for part_name in self.part_names:
  logger.info(f"gguf: loading model part '{part_name}'")
  ctx: ContextManager[Any]
@@ -88,10 +160,46 @@
  ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

  with ctx as model_part:
+ tensor_names_from_parts.update(model_part.keys())
+
  for name in model_part.keys():
  data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
+ if self.lazy:
+ data = LazyTorchTensor.from_eager(data)
  yield name, data

+ # only verify tensor name presence; it doesn't matter if they are not in the right files
+ if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+ raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
+
+ def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
+ if key not in gguf.MODEL_TENSORS[self.model_arch]:
+ raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
+ name: str = gguf.TENSOR_NAMES[key]
+ if "{bid}" in name:
+ assert bid is not None
+ name = name.format(bid=bid)
+ return name + suffix
+
+ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
+ if key not in gguf.MODEL_TENSORS[self.model_arch]:
+ return False
+ key_name: str = gguf.TENSOR_NAMES[key]
+ if "{bid}" in key_name:
+ if bid is None:
+ return False
+ key_name = key_name.format(bid=bid)
+ else:
+ if bid is not None:
+ return False
+ return name == (key_name + suffix)
+
+ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+ new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
+ if new_name is None:
+ raise ValueError(f"Can not map tensor {name!r}")
+ return new_name
+
  def set_gguf_parameters(self):
  self.gguf_writer.add_name(self.dir_model.name)
  self.gguf_writer.add_block_count(self.block_count)
@@ -134,13 +242,31 @@

  self.gguf_writer.add_file_type(self.ftype)
  logger.info(f"gguf: file type = {self.ftype}")
+ <<<<<<< HEAD
+ =======
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+ del name, new_name, bid, n_dims # unused
+
+ return False
+
+ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+ del name, new_name, bid, n_dims # unused
+
+ return False
+ >>>>>>> uupstream/master

  def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
+
  for name, data_torch in self.get_tensors():
  # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+ if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
  continue

  old_dtype = data_torch.dtype
@@ -149,37 +275,97 @@
  if data_torch.dtype not in (torch.float16, torch.float32):
  data_torch = data_torch.to(torch.float32)

- data = data_torch.squeeze().numpy()
+ # use the first number-like part of the tensor name as the block id
+ bid = None
+ for part in name.split("."):
+ if part.isdecimal():
+ bid = int(part)
+ break

+ <<<<<<< HEAD
  # map tensor names
  new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
  if new_name is None:
  raise ValueError(f"Can not map tensor {name!r}")
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
+ =======
+ for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
+ data: np.ndarray = data # type hint
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+ data_qtype: gguf.GGMLQuantizationType | None = None
+ >>>>>>> uupstream/master
+
+ # when both are True, f32 should win
+ extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
+ extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+
+ # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
+ # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
+ extra_f32 = any(cond for cond in (
+ extra_f32,
+ n_dims == 1,
+ new_name.endswith("_norm.weight"),
+ ))
+
+ <<<<<<< HEAD
  # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
  if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
  data = data.astype(np.float32)
+ =======
+ # Some tensor types are always in float32
+ extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
+ gguf.MODEL_TENSOR.FFN_GATE_INP,
+ gguf.MODEL_TENSOR.POS_EMBD,
+ gguf.MODEL_TENSOR.TOKEN_TYPES,
+ ))
+ >>>>>>> uupstream/master

- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ extra_f16 = any(cond for cond in (
+ extra_f16,
+ (name.endswith(".weight") and n_dims >= 2),
+ ))

+ <<<<<<< HEAD
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+ =======
+ if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
+ if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+ data = gguf.quantize_bf16(data)
+ assert data.dtype == np.int16
+ data_qtype = gguf.GGMLQuantizationType.BF16
+ >>>>>>> uupstream/master

- self.gguf_writer.add_tensor(new_name, data)
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
+ data = gguf.quantize_q8_0(data)
+ assert data.dtype == np.uint8
+ data_qtype = gguf.GGMLQuantizationType.Q8_0
+
+ else: # default to float16 for quantized tensors
+ if data_dtype != np.float16:
+ data = data.astype(np.float16)
+ data_qtype = gguf.GGMLQuantizationType.F16
+
+ if data_qtype is None: # by default, convert to float32
+ if data_dtype != np.float32:
+ data = data.astype(np.float32)
+ data_qtype = gguf.GGMLQuantizationType.F32
+
+ shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
+
+ # reverse shape to make it similar to the internal ggml dimension order
+ shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
+
+ # n_dims is implicit in the shape
+ logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+
+ self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

  def write(self):
  self.write_tensors()
  self.gguf_writer.write_header_to_file()
  self.gguf_writer.write_kv_data_to_file()
- self.gguf_writer.write_tensors_to_file()
+ self.gguf_writer.write_tensors_to_file(progress=True)
  self.gguf_writer.close()

  def write_vocab(self):
@@ -188,16 +374,18 @@ class Model(ABC):
  self.gguf_writer.close()

  @staticmethod
- def count_model_parts(dir_model: Path, prefix: str) -> int:
- num_parts = 0
+ def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+ part_names: list[str] = []
  for filename in os.listdir(dir_model):
- if filename.endswith(prefix):
- num_parts += 1
+ if filename.endswith(suffix):
+ part_names.append(filename)
+
+ part_names.sort()

- return num_parts
+ return part_names

  @staticmethod
- def load_hparams(dir_model):
+ def load_hparams(dir_model: Path):
  with open(dir_model / "config.json", "r", encoding="utf-8") as f:
  return json.load(f)

@@ -205,19 +393,20 @@
  def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
  assert names

- def func(modelcls: type[Model]):
+ def func(modelcls: AnyModel) -> AnyModel:
  for name in names:
  cls._model_classes[name] = modelcls
  return modelcls
  return func

  @classmethod
- def from_model_architecture(cls, arch):
+ def from_model_architecture(cls, arch: str) -> type[Model]:
  try:
  return cls._model_classes[arch]
  except KeyError:
  raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+ <<<<<<< HEAD
  def _is_model_safetensors(self) -> bool:
  return Model.count_model_parts(self.dir_model, ".safetensors") > 0

@@ -231,6 +420,8 @@
  return ("pytorch_model.bin",)
  return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

+ =======
+ >>>>>>> uupstream/master
  # used for GPT-2 BPE and WordPiece vocabs
  def get_vocab_base(self) -> tuple[list[str], list[int], str]:
  tokens: list[str] = []
@@ -265,6 +456,10 @@
  # NOTE: this function is generated by convert-hf-to-gguf-update.py
  # do not modify it manually!
  # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+ <<<<<<< HEAD
+ =======
+ # Marker: Start get_vocab_base_pre
+ >>>>>>> uupstream/master
  def get_vocab_base_pre(self, tokenizer) -> str:
  # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
  # is specific for the BPE pre-tokenizer used by the model
@@ -308,15 +503,45 @@
  if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
  # ref: https://huggingface.co/openai-community/gpt2
  res = "gpt-2"
+ <<<<<<< HEAD
+ =======
+ if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
+ # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
+ res = "stablelm2"
+ >>>>>>> uupstream/master
  if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
  # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
  res = "refact"
  if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
  # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
  res = "command-r"
+ <<<<<<< HEAD
+ if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+ # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
+ res = "olmo"
+ =======
+ if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
+ # ref: https://huggingface.co/Qwen/Qwen1.5-7B
+ res = "qwen2"
  if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
  # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
  res = "olmo"
+ if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
+ # ref: https://huggingface.co/databricks/dbrx-base
+ res = "dbrx"
+ if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+ # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
+ res = "jina-v2-en"
+ if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
+ # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
+ res = "jina-v2-es"
+ if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
+ # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
+ res = "jina-v2-de"
+ if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
+ # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
+ res = "smaug-bpe"
+ >>>>>>> uupstream/master

  if res is None:
  logger.warning("\n")
@@ -337,6 +562,10 @@
  logger.debug(f"chkhsh: {chkhsh}")

  return res
+ <<<<<<< HEAD
+ =======
+ # Marker: End get_vocab_base_pre
+ >>>>>>> uupstream/master

  def _set_vocab_gpt2(self) -> None:
  tokens, toktypes, tokpre = self.get_vocab_base()
@@ -374,7 +603,7 @@

  # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
  added_vocab = tokenizer.special_tokens
- reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
+ reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}

  for i in range(vocab_size):
  if i not in reverse_vocab:
@@ -414,49 +643,66 @@
  if not tokenizer_path.is_file():
  raise FileNotFoundError(f"File not found: {tokenizer_path}")

- tokenizer = SentencePieceProcessor(str(tokenizer_path))
+ tokenizer = SentencePieceProcessor()
+ tokenizer.LoadFromFile(str(tokenizer_path))
+
  vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+ scores: list[float] = [-10000.0] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
  for token_id in range(tokenizer.vocab_size()):
- piece = tokenizer.id_to_piece(token_id)
+ piece = tokenizer.IdToPiece(token_id)
  text = piece.encode("utf-8")
- score = tokenizer.get_score(token_id)
+ score = tokenizer.GetScore(token_id)

  toktype = SentencePieceTokenTypes.NORMAL
- if tokenizer.is_unknown(token_id):
+ if tokenizer.IsUnknown(token_id):
  toktype = SentencePieceTokenTypes.UNKNOWN
- elif tokenizer.is_control(token_id):
+ elif tokenizer.IsControl(token_id):
  toktype = SentencePieceTokenTypes.CONTROL
- elif tokenizer.is_unused(token_id):
+ elif tokenizer.IsUnused(token_id):
  toktype = SentencePieceTokenTypes.UNUSED
- elif tokenizer.is_byte(token_id):
+ elif tokenizer.IsByte(token_id):
  toktype = SentencePieceTokenTypes.BYTE

- tokens.append(text)
- scores.append(score)
- toktypes.append(toktype)
+ tokens[token_id] = text
+ scores[token_id] = score
+ toktypes[token_id] = toktype

  added_tokens_file = self.dir_model / 'added_tokens.json'
  if added_tokens_file.is_file():
  with open(added_tokens_file, "r", encoding="utf-8") as f:
  added_tokens_json = json.load(f)
-
  for key in added_tokens_json:
- key = key.encode("utf-8")
- if key not in tokens:
- tokens.append(key)
- scores.append(-1000.0)
- toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+ token_id = added_tokens_json[key]
+ if (token_id >= vocab_size):
+ logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+ continue

+ <<<<<<< HEAD
+ =======
+ tokens[token_id] = key.encode("utf-8")
+ scores[token_id] = -1000.0
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+ >>>>>>> uupstream/master
  if vocab_size > len(tokens):
  pad_count = vocab_size - len(tokens)
  logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
  for i in range(1, pad_count + 1):
+ <<<<<<< HEAD
  tokens.append(f"[PAD{i}]")
  scores.append(-1000.0)
  toktypes.append(SentencePieceTokenTypes.UNUSED)

  assert len(tokens) == vocab_size
+ =======
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+ scores.append(-1000.0)
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
+ >>>>>>> uupstream/master

  self.gguf_writer.add_tokenizer_model("llama")
  self.gguf_writer.add_tokenizer_pre("default")
@@ -509,6 +755,44 @@ class GPTNeoXModel(Model):
  self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
  self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])

+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+ n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+ tensors: list[tuple[str, Tensor]] = []
+
+ if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
+ # Map bloom-style qkv_linear to gpt-style qkv_linear
+ # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
+ # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
+ qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
+ data_torch = torch.cat(
+ (
+ qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+ qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+ qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+ ),
+ dim=0,
+ )
+ logger.info("re-format attention.linear_qkv.weight")
+ elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
+ qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
+ data_torch = torch.cat(
+ (
+ qkv_bias[:, 0, :].reshape((n_embed,)),
+ qkv_bias[:, 1, :].reshape((n_embed,)),
+ qkv_bias[:, 2, :].reshape((n_embed,)),
+ ),
+ dim=0,
+ )
+ logger.info("re-format attention.linear_qkv.bias")
+
+ tensors.append((self.map_tensor_name(name), data_torch))
+
+ return tensors
+

  @Model.register("BloomForCausalLM")
  class BloomModel(Model):
@@ -527,28 +811,48 @@ class BloomModel(Model):
  self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
  self.gguf_writer.add_file_type(self.ftype)

- def write_tensors(self):
- block_count = self.hparams["n_layer"]
- tensors = dict(self.get_tensors())
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- has_lm_head = True
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
  n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
  n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))

- for name, data_torch in tensors.items():
- if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
- has_lm_head = False
-
- name = re.sub(r'transformer\.', '', name)
-
- old_dtype = data_torch.dtype
+ name = re.sub(r'transformer\.', '', name)
+
+ tensors: list[tuple[str, Tensor]] = []
+
+ if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+ # Map bloom-style qkv_linear to gpt-style qkv_linear
+ # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
+ # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
+ qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
+ data_torch = torch.cat(
+ (
+ qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+ qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+ qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+ ),
+ dim=0,
+ )
+ logger.info("re-format attention.linear_qkv.weight")
+ elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+ qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
+ data_torch = torch.cat(
+ (
+ qkv_bias[:, 0, :].reshape((n_embed,)),
+ qkv_bias[:, 1, :].reshape((n_embed,)),
+ qkv_bias[:, 2, :].reshape((n_embed,)),
+ ),
+ dim=0,
+ )
+ logger.info("re-format attention.linear_qkv.bias")

- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
+ tensors.append((self.map_tensor_name(name), data_torch))

- data = data_torch.squeeze().numpy()
+ if name == "word_embeddings.weight":
+ assert self.tensor_names is not None

+ <<<<<<< HEAD
  if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
  # Map bloom-style qkv_linear to gpt-style qkv_linear
  # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
@@ -602,6 +906,13 @@ class BloomModel(Model):
  if not has_lm_head and name == "word_embeddings.weight":
  self.gguf_writer.add_tensor("output.weight", data)
  logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+ =======
+ # TODO: tie them at runtime, don't duplicate in the model file
+ if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+
+ return tensors
+ >>>>>>> uupstream/master


  @Model.register("MPTForCausalLM")
@@ -637,16 +948,16 @@ class MPTModel(Model):
  else:
  self.gguf_writer.add_max_alibi_bias(0.0)

- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
- continue
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused

- old_dtype = data_torch.dtype
+ if "scales" in name:
+ new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales"))
+ new_name = new_name.replace("scales", "act.scales")
+ else:
+ new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias"))

+ <<<<<<< HEAD
  # convert any unsupported data types to float32
  if data_torch.dtype not in (torch.float16, torch.float32):
  data_torch = data_torch.to(torch.float32)
@@ -681,6 +992,9 @@ class MPTModel(Model):
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

  self.gguf_writer.add_tensor(new_name, data)
+ =======
+ return [(new_name, data_torch)]
+ >>>>>>> uupstream/master


  @Model.register("OrionForCausalLM")
@@ -720,6 +1034,7 @@ class OrionModel(Model):
  # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
  self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

+ <<<<<<< HEAD
  def write_tensors(self):
  # Collect tensors from generator object
  model_kv = dict(self.get_tensors())
@@ -762,6 +1077,8 @@ class OrionModel(Model):
  logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
  self.gguf_writer.add_tensor(new_name, data)

+ =======
+ >>>>>>> uupstream/master

  @Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
  class BaichuanModel(Model):
@@ -797,20 +1114,18 @@ class BaichuanModel(Model):
  self.gguf_writer.add_head_count(head_count)
  self.gguf_writer.add_head_count_kv(head_count_kv)
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+ self.gguf_writer.add_file_type(self.ftype)

  if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
  if self.hparams["rope_scaling"].get("type") == "linear":
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

- def write_tensors(self):
- # Collect tensors from generator object
- model_kv = dict(self.get_tensors())
- block_count = self.hparams["num_hidden_layers"]
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  head_count = self.hparams["num_attention_heads"]
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)

+ <<<<<<< HEAD
  for i in range(block_count):
  if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
  logger.info(f"Unpacking and permuting layer {i}")
@@ -821,12 +1136,24 @@ class BaichuanModel(Model):
  model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
  self._reverse_hf_part(w, 2)
  del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
+ =======
+ tensors: list[tuple[str, Tensor]] = []
+ >>>>>>> uupstream/master
+
+ if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight":
+ logger.info(f"Unpacking and permuting layer {bid}")
+ tensors = [
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid),
+ self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)),
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid),
+ self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)),
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid),
+ self._reverse_hf_part(data_torch, 2)),
+ ]
+ else:
+ tensors = [(self.map_tensor_name(name), data_torch)]

- for name, data_torch in model_kv.items():
- # we don't need these
- if name.endswith(".rotary_emb.inv_freq"):
- continue
-
+ <<<<<<< HEAD
  old_dtype = data_torch.dtype

  # convert any unsupported data types to float32
@@ -857,6 +1184,9 @@ class BaichuanModel(Model):

  logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
  self.gguf_writer.add_tensor(new_name, data)
+ =======
+ return tensors
+ >>>>>>> uupstream/master

  def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
  if n_kv_head is not None and n_head != n_kv_head:
@@ -888,7 +1218,7 @@ class XverseModel(Model):
  dir_model = self.dir_model
  hparams = self.hparams

- tokens: list[bytearray] = []
+ tokens: list[bytes] = []
  toktypes: list[int] = []

  from transformers import AutoTokenizer
@@ -896,7 +1226,7 @@ class XverseModel(Model):
  vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
  assert max(tokenizer.vocab.values()) < vocab_size

- reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+ reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
  added_vocab = tokenizer.get_added_vocab()

  for token_id in range(vocab_size):
@@ -953,25 +1283,26 @@ class XverseModel(Model):
  self.gguf_writer.add_head_count(head_count)
  self.gguf_writer.add_head_count_kv(head_count_kv)
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+ self.gguf_writer.add_file_type(self.ftype)

  if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
  if self.hparams["rope_scaling"].get("type") == "linear":
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

- def write_tensors(self):
- # Collect tensors from generator object
- model_kv = dict(self.get_tensors())
- block_count = self.hparams["num_hidden_layers"]
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
  head_count = self.hparams["num_attention_heads"]
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)

- for name, data_torch in model_kv.items():
- # we don't need these
- if name.endswith(".rotary_emb.inv_freq"):
- continue
+ # HF models permute some of the tensors, so we need to undo that
+ if name.endswith("q_proj.weight"):
+ data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
+ if name.endswith("k_proj.weight"):
+ data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)

+ <<<<<<< HEAD
  old_dtype = data_torch.dtype

  # convert any unsupported data types to float32
@@ -1008,6 +1339,9 @@ class XverseModel(Model):

  logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
  self.gguf_writer.add_tensor(new_name, data)
+ =======
+ return [(self.map_tensor_name(name), data_torch)]
+ >>>>>>> uupstream/master

  def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
  if n_kv_head is not None and n_head != n_kv_head:
@@ -1048,22 +1382,31 @@ class FalconModel(Model):
  self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
  self.gguf_writer.add_file_type(self.ftype)

- def write_tensors(self):
- block_count = self.hparams.get("num_hidden_layers")
- if block_count is None:
- block_count = self.hparams["n_layer"] # old name
-
- n_head = self.hparams.get("num_attention_heads")
- if n_head is None:
- n_head = self.hparams["n_head"] # old name
-
- n_head_kv = self.hparams.get("num_kv_heads")
- if n_head_kv is None:
- n_head_kv = self.hparams.get("n_head_kv", 1) # old name
-
- head_dim = self.hparams["hidden_size"] // n_head
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ # QKV tensor transform
+ # The original query_key_value tensor contains n_head_kv "kv groups",
+ # each consisting of n_head/n_head_kv query weights followed by one key
+ # and one value weight (shared by all query heads in the kv group).
+ # This layout makes it a big pain to work with in GGML.
+ # So we rearrange them here,, so that we have n_head query weights
+ # followed by n_head_kv key weights followed by n_head_kv value weights,
+ # in contiguous fashion.
+ # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+ if "query_key_value" in name:
+ n_head = self.find_hparam(["num_attention_heads", "n_head"])
+ n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1
+ head_dim = self.hparams["hidden_size"] // n_head
+
+ qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+ q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
+ k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+ v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+ data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
+
+ <<<<<<< HEAD
  for name, data_torch in self.get_tensors():
  old_dtype = data_torch.dtype

@@ -1113,6 +1456,9 @@ class FalconModel(Model):
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

  self.gguf_writer.add_tensor(new_name, data)
+ =======
+ return [(self.map_tensor_name(name), data_torch)]
+ >>>>>>> uupstream/master


  @Model.register("GPTBigCodeForCausalLM")
@@ -1137,6 +1483,18 @@
  class RefactModel(Model):
  model_arch = gguf.MODEL_ARCH.REFACT

+ def set_vocab(self):
+ super().set_vocab()
+
+ # TODO: how to determine special FIM tokens automatically?
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+ special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+ special_vocab._set_special_token("prefix", 1)
+ special_vocab._set_special_token("suffix", 3)
+ special_vocab._set_special_token("middle", 2)
+ special_vocab._set_special_token("fsep", 4) # is this correct?
+ special_vocab.add_to_gguf(self.gguf_writer)
+
  def set_gguf_parameters(self):
  hidden_dim = self.hparams["n_embd"]
  inner_dim = 4 * hidden_dim
@@ -1158,7 +1516,7 @@ class RefactModel(Model):
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
  self.gguf_writer.add_file_type(self.ftype)

- def write_tensors(self):
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  hidden_dim = self.hparams["n_embd"]
  inner_dim = 4 * hidden_dim
  hidden_dim = int(2 * inner_dim / 3)
@@ -1167,27 +1525,23 @@ class RefactModel(Model):
  n_head = self.hparams["n_head"]
  n_head_kv = 1
  head_dim = self.hparams["n_embd"] // n_head
- block_count = self.hparams["n_layer"]

- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+ tensors: list[tuple[str, Tensor]] = []

- tensors = dict(self.get_tensors())
- for i in range(block_count):
- if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
- tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
- tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
- del tensors[f"transformer.h.{i}.attn.kv.weight"]
- if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
- tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
- del tensors[f"transformer.h.{i}.attn.q.weight"]
- if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
- tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
- tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
- del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
-
- for name, data_torch in tensors.items():
- old_dtype = data_torch.dtype
+ if bid is not None:
+ if name == f"transformer.h.{bid}.attn.kv.weight":
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim]))
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:]))
+ elif name == f"transformer.h.{bid}.attn.q.weight":
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch))
+ elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight":
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]))
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]))

+ if len(tensors) == 0:
+ tensors.append((self.map_tensor_name(name), data_torch))
+
+ <<<<<<< HEAD
  # convert any unsupported data types to float32
  if data_torch.dtype not in (torch.float16, torch.float32):
  data_torch = data_torch.to(torch.float32)
@@ -1267,6 +1621,9 @@ class PersimmonModel(Model):
  n_dims = len(data.shape)
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
  self.gguf_writer.add_tensor(new_name, data)
+ =======
+ return tensors
+ >>>>>>> uupstream/master


  @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
@@ -1295,6 +1652,69 @@ class StableLMModel(Model):
  self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
  self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
  self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+ self.gguf_writer.add_file_type(self.ftype)
+
+ _q_norms: list[dict[str, Tensor]] | None = None
+ _k_norms: list[dict[str, Tensor]] | None = None
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ n_head = self.hparams["num_attention_heads"]
+ n_kv_head = self.hparams["num_key_value_heads"]
+
+ if name.find("q_layernorm.norms") != -1:
+ assert bid is not None
+
+ if self._q_norms is None:
+ self._q_norms = [{} for _ in range(self.block_count)]
+
+ self._q_norms[bid][name] = data_torch
+
+ if len(self._q_norms[bid]) >= n_head:
+ return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm")
+ else:
+ return []
+
+ if name.find("k_layernorm.norms") != -1:
+ assert bid is not None
+
+ if self._k_norms is None:
+ self._k_norms = [{} for _ in range(self.block_count)]
+
+ self._k_norms[bid][name] = data_torch
+
+ if len(self._k_norms[bid]) >= n_kv_head:
+ return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm")
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"):
+ datas: list[Tensor] = []
+ # extract the norms in order
+ for xid in range(n_head):
+ ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
+ datas.append(norms[ename])
+ del norms[ename]
+ data_torch = torch.stack(datas, dim=0)
+
+ merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
+ new_name = self.map_tensor_name(merged_name)
+
+ return [(new_name, data_torch)]
+
+ def write_tensors(self):
+ super().write_tensors()
+
+ if self._q_norms is not None or self._k_norms is not None:
+ # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
+ norms = (
+ [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else []
+ ) + (
+ [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else []
+ )
+ if len(norms) > 0:
+ raise ValueError(f"Unprocessed norms: {norms}")

  def write_tensors(self):
  block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -1413,6 +1833,7 @@ class LlamaModel(Model):
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

+ <<<<<<< HEAD
  # Same as super class, but permuting q_proj, k_proj
  def write_tensors(self):
  block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -1425,64 +1846,75 @@ class LlamaModel(Model):
  # we don't need these
  if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
  continue
+ =======
+ @staticmethod
+ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+ if n_head_kv is not None and n_head != n_head_kv:
+ n_head = n_head_kv
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+ .swapaxes(1, 2)
+ .reshape(weights.shape))

- old_dtype = data_torch.dtype
+ _experts: list[dict[str, Tensor]] | None = None

- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ n_head = self.hparams["num_attention_heads"]
+ n_kv_head = self.hparams.get("num_key_value_heads")
+ >>>>>>> uupstream/master

- data = data_torch.numpy()
+ if name.endswith("q_proj.weight"):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+ if name.endswith("k_proj.weight"):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

- if name.endswith("q_proj.weight"):
- data = permute(data, n_head, n_head)
- if name.endswith("k_proj.weight"):
- data = permute(data, n_head, n_kv_head)
+ # process the experts separately
+ if name.find("block_sparse_moe.experts") != -1:
+ n_experts = self.hparams["num_local_experts"]

- data = data.squeeze()
+ assert bid is not None

- # process the experts separately
- if name.find("block_sparse_moe.experts") != -1:
- experts[name] = data
- if len(experts) >= n_experts:
- # merge the experts into a single 3d tensor
- for bid in range(block_count):
- for wid in range(1, 4):
- full = True
- for xid in range(n_experts):
- ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
- if ename not in experts:
- full = False
- break
- if not full:
- continue
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]

- datas = []
- for xid in range(n_experts):
- ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
- datas.append(experts[ename])
- del experts[ename]
+ self._experts[bid][name] = data_torch

- data = np.stack(datas, axis=0)
- data_dtype = data.dtype
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []

- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
+ # merge the experts into a single 3d tensor
+ for wid in ["w1", "w2", "w3"]:
+ datas: list[Tensor] = []

- if self.ftype == 1 and data_dtype == np.float32:
- data = data.astype(np.float16)
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)

- merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"
+ merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"

+ new_name = self.map_tensor_name(merged_name)
+
+ <<<<<<< HEAD
  new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
  if new_name is None:
  raise ValueError(f"Can not map tensor {name!r}")

  logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+ =======
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []

- self.gguf_writer.add_tensor(new_name, data)
- continue
+ return [(self.map_tensor_name(name), data_torch)]
+ >>>>>>> uupstream/master

+ def write_tensors(self):
+ super().write_tensors()
+
+ <<<<<<< HEAD
  # map tensor names
  new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
  if new_name is None:
@@ -1509,6 +1941,13 @@ class LlamaModel(Model):

  if len(experts) > 0:
  raise ValueError(f"Unprocessed experts: {experts.keys()}")
+ =======
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+ >>>>>>> uupstream/master


  @Model.register("GrokForCausalLM")
@@ -1525,86 +1964,79 @@ class GrokModel(Model):
  super().set_gguf_parameters()
  self.gguf_writer.add_name("Grok")

- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- n_experts = self.hparams.get("num_local_experts")
- experts = dict()
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
- continue
+ _experts: list[dict[str, Tensor]] | None = None

- old_dtype = data_torch.dtype
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # process the experts separately
+ if name.find(".moe.") != -1:
+ n_experts = self.hparams["num_local_experts"]

- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
+ assert bid is not None

- data = data_torch.squeeze().numpy()
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]

- # process the experts separately
- if name.find(".moe.") != -1:
- experts[name] = data
- if len(experts) >= n_experts:
- # merge the experts into a single 3d tensor
- for bid in range(block_count):
- for wid in ["linear", "linear_1", "linear_v"]:
- full = True
- for xid in range(n_experts):
- ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
- if ename not in experts:
- full = False
- break
- if not full:
- continue
+ self._experts[bid][name] = data_torch

- datas = []
- for xid in range(n_experts):
- ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
- datas.append(experts[ename])
- del experts[ename]
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []

- data = np.stack(datas, axis=0)
- data_dtype = data.dtype
+ # merge the experts into a single 3d tensor
+ for wid in ["linear", "linear_1", "linear_v"]:
+ datas: list[Tensor] = []

- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
+ for xid in range(n_experts):
+ ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]

- if self.ftype == 1 and data_dtype == np.float32:
- data = data.astype(np.float16)
+ data_torch = torch.stack(datas, dim=0)

- merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
+ merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"

+ <<<<<<< HEAD
  new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
  if new_name is None:
  raise ValueError(f"Can not map tensor {name!r}")

  logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+ =======
+ new_name = self.map_tensor_name(merged_name)

- self.gguf_writer.add_tensor(new_name, data)
- continue
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []
+ >>>>>>> uupstream/master

+ return [(self.map_tensor_name(name), data_torch)]
+
+ <<<<<<< HEAD
  # map tensor names
  new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
  if new_name is None:
  raise ValueError(f"Can not map tensor {name!r}")
+ =======
+ >>>>>>> uupstream/master

- n_dims = len(data.shape)
- data_dtype = data.dtype
+ @Model.register("DbrxForCausalLM")
+ class DbrxModel(Model):
+ model_arch = gguf.MODEL_ARCH.DBRX

- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
+ def set_gguf_parameters(self):
+ ffn_config = self.hparams["ffn_config"]
+ attn_config = self.hparams["attn_config"]
+ self.gguf_writer.add_name(self.hparams["model_type"])
+ self.gguf_writer.add_block_count(self.hparams["n_layers"])

- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
2032
+ self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
2033
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
2034
+ self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
1603
2035
 
1604
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1605
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1606
- data = data.astype(np.float16)
2036
+ self.gguf_writer.add_head_count(self.hparams["n_heads"])
2037
+ self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
1607
2038
 
2039
+ <<<<<<< HEAD
1608
2040
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1609
2041
 
1610
2042
  self.gguf_writer.add_tensor(new_name, data)
@@ -1796,8 +2228,60 @@ class DbrxModel(Model):
1796
2228
  data = data.astype(np.float16)
1797
2229
 
1798
2230
  print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
2231
+ =======
2232
+ self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
2233
+ >>>>>>> uupstream/master
1799
2234
 
1800
- self.gguf_writer.add_tensor(new_name, data)
2235
+ self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
2236
+ self.gguf_writer.add_file_type(self.ftype)
2237
+
2238
+ self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
2239
+ self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
2240
+
2241
+ self.gguf_writer.add_layer_norm_eps(1e-5)
2242
+
2243
+ self.gguf_writer.add_file_type(self.ftype)
2244
+ logger.info(f"gguf: file type = {self.ftype}")
2245
+
2246
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2247
+ del bid # unused
2248
+
2249
+ n_expert = self.hparams["ffn_config"]["moe_num_experts"]
2250
+ n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
2251
+ n_embd = self.hparams["d_model"]
2252
+
2253
+ # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
2254
+ # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
2255
+ # But llama.cpp moe graph works differently
2256
+ # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
2257
+ # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
2258
+ exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
2259
+ "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
2260
+ "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
2261
+ experts = False
2262
+
2263
+ for exp_tensor_name in exp_tensor_names.keys():
2264
+ if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
2265
+ experts = True
2266
+ data_torch = data_torch.view(n_expert, n_ff, n_embd)
2267
+ if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
2268
+ data_torch = data_torch.permute(*permute_tensor)
2269
+ break
2270
+
2271
+ # map tensor names
2272
+ # In MoE models the ffn tensors are typically most of the model weights,
2273
+ # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
2274
+ # Every other model has the weight names ending in .weight,
2275
+ # let's assume that is the convention which is not the case for dbrx:
2276
+ # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
2277
+ new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
2278
+
2279
+ return [(new_name, data_torch)]
2280
+
2281
+ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
2282
+ del name, new_name, bid # unused
2283
+
2284
+ return n_dims > 1
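DBRX stores each expert projection as one flattened tensor, so the hunk above first views it as (n_expert, n_ff, n_embd) and permutes the down-projection (w2) so the dimension order comes out right on the ggml side. A small sketch of that reshape/permute with toy sizes (the real values come from ffn_config and d_model):

```python
import torch

n_expert, n_ff, n_embd = 2, 8, 4

# toy flattened expert weight standing in for ffn.experts.mlp.w2
w2_flat = torch.randn(n_expert * n_ff, n_embd)

# view as 3-D, then swap the last two PyTorch dims for the down-projection
w2 = w2_flat.view(n_expert, n_ff, n_embd).permute(0, 2, 1)

# ggml reverses dimension order, so this (n_expert, n_embd, n_ff) torch tensor
# shows up as ne = {n_ff, n_embd, n_expert} in the llama.cpp graph
assert w2.shape == (n_expert, n_embd, n_ff)
```

The w1/v1 (gate/up) tensors keep the plain (n_expert, n_ff, n_embd) view, which is why their permutation entry is None.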
1801
2285
 
1802
2286
 
1803
2287
  @Model.register("MiniCPMForCausalLM")
@@ -1830,18 +2314,19 @@ class MiniCPMModel(Model):
1830
2314
  .reshape(weights.shape)
1831
2315
  )
1832
2316
 
1833
- def write_tensors(self):
1834
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1835
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1836
- n_head = self.hparams.get("num_attention_heads")
2317
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2318
+ del bid # unused
2319
+
2320
+ n_head = self.hparams["num_attention_heads"]
1837
2321
  n_kv_head = self.hparams.get("num_key_value_heads")
1838
- for name, data_torch in self.get_tensors():
1839
- # we don't need these
1840
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
1841
- continue
1842
2322
 
1843
- old_dtype = data_torch.dtype
2323
+ # HF models permute some of the tensors, so we need to undo that
2324
+ if name.endswith(("q_proj.weight")):
2325
+ data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
2326
+ if name.endswith(("k_proj.weight")):
2327
+ data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
1844
2328
 
2329
+ <<<<<<< HEAD
1845
2330
  # convert any unsupported data types to float32
1846
2331
  if data_torch.dtype not in (torch.float16, torch.float32):
1847
2332
  data_torch = data_torch.to(torch.float32)
@@ -1877,6 +2362,9 @@ class MiniCPMModel(Model):
1877
2362
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1878
2363
 
1879
2364
  self.gguf_writer.add_tensor(new_name, data)
2365
+ =======
2366
+ return [(self.map_tensor_name(name), data_torch)]
2367
+ >>>>>>> uupstream/master
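The `_reverse_hf_permute` calls above undo the head-wise interleaving that HF applies to q_proj/k_proj for rotary embeddings; only the tail of that helper (`.reshape(weights.shape)`) is visible near the top of this hunk, so the following is a hedged, self-contained reconstruction rather than a verbatim copy:

```python
import torch
from torch import Tensor

def reverse_hf_permute(weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
    # collapse to the effective head count when q and kv head counts differ
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (
        weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
        .swapaxes(1, 2)
        .reshape(weights.shape)
    )

w = torch.randn(16, 16)
assert reverse_hf_permute(w, n_head=4).shape == w.shape
```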
1880
2368
 
1881
2369
 
1882
2370
  @Model.register("QWenLMHeadModel")
@@ -1919,6 +2407,7 @@ class QwenModel(Model):
1919
2407
  self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
1920
2408
  self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
1921
2409
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
2410
+ <<<<<<< HEAD
1922
2411
 
1923
2412
  def write_tensors(self):
1924
2413
  block_count = self.hparams["num_hidden_layers"]
@@ -1959,6 +2448,9 @@ class QwenModel(Model):
1959
2448
 
1960
2449
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1961
2450
  self.gguf_writer.add_tensor(new_name, data)
2451
+ =======
2452
+ self.gguf_writer.add_file_type(self.ftype)
2453
+ >>>>>>> uupstream/master
1962
2454
 
1963
2455
 
1964
2456
  @Model.register("Qwen2ForCausalLM")
@@ -1981,6 +2473,7 @@ class Qwen2MoeModel(Model):
1981
2473
  if (n_experts := self.hparams.get("num_experts")) is not None:
1982
2474
  self.gguf_writer.add_expert_count(n_experts)
1983
2475
 
2476
+ <<<<<<< HEAD
1984
2477
  def write_tensors(self):
1985
2478
  block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1986
2479
  tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
@@ -2166,6 +2659,54 @@ class Qwen2MoeModel(Model):
2166
2659
 
2167
2660
  if len(experts) > 0:
2168
2661
  raise ValueError(f"Unprocessed experts: {experts.keys()}")
2662
+ =======
2663
+ _experts: list[dict[str, Tensor]] | None = None
2664
+
2665
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2666
+ # process the experts separately
2667
+ if name.find("experts") != -1:
2668
+ n_experts = self.hparams["num_experts"]
2669
+ assert bid is not None
2670
+
2671
+ if self._experts is None:
2672
+ self._experts = [{} for _ in range(self.block_count)]
2673
+
2674
+ self._experts[bid][name] = data_torch
2675
+
2676
+ if len(self._experts[bid]) >= n_experts * 3:
2677
+ tensors: list[tuple[str, Tensor]] = []
2678
+
2679
+ # merge the experts into a single 3d tensor
2680
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
2681
+ datas: list[Tensor] = []
2682
+
2683
+ for xid in range(n_experts):
2684
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2685
+ datas.append(self._experts[bid][ename])
2686
+ del self._experts[bid][ename]
2687
+
2688
+ data_torch = torch.stack(datas, dim=0)
2689
+
2690
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
2691
+
2692
+ new_name = self.map_tensor_name(merged_name)
2693
+
2694
+ tensors.append((new_name, data_torch))
2695
+ return tensors
2696
+ else:
2697
+ return []
2698
+
2699
+ return [(self.map_tensor_name(name), data_torch)]
2700
+
2701
+ def write_tensors(self):
2702
+ super().write_tensors()
2703
+
2704
+ if self._experts is not None:
2705
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
2706
+ experts = [k for d in self._experts for k in d.keys()]
2707
+ if len(experts) > 0:
2708
+ raise ValueError(f"Unprocessed experts: {experts}")
2709
+ >>>>>>> uupstream/master
2169
2710
 
2170
2711
 
2171
2712
  @Model.register("GPT2LMHeadModel")
@@ -2182,26 +2723,23 @@ class GPT2Model(Model):
2182
2723
  self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
2183
2724
  self.gguf_writer.add_file_type(self.ftype)
2184
2725
 
2185
- def write_tensors(self):
2186
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
2187
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2726
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2727
+ del bid # unused
2188
2728
 
2189
- for name, data_torch in self.get_tensors():
2190
- # we don't need these
2191
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
2192
- continue
2729
+ tensors: list[tuple[str, Tensor]] = []
2193
2730
 
2194
- if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
2195
- data_torch = data_torch.transpose(1, 0)
2731
+ # we don't need these
2732
+ if name.endswith((".attn.bias", ".attn.masked_bias")):
2733
+ return tensors
2196
2734
 
2197
- old_dtype = data_torch.dtype
2735
+ if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
2736
+ data_torch = data_torch.transpose(1, 0)
2198
2737
 
2199
- # convert any unsupported data types to float32
2200
- if data_torch.dtype not in (torch.float16, torch.float32):
2201
- data_torch = data_torch.to(torch.float32)
2738
+ new_name = self.map_tensor_name(name)
2202
2739
 
2203
- data = data_torch.squeeze().numpy()
2740
+ tensors.append((new_name, data_torch))
2204
2741
 
2742
+ <<<<<<< HEAD
2205
2743
  # map tensor names
2206
2744
  new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2207
2745
  if new_name is None:
@@ -2230,6 +2768,13 @@ class GPT2Model(Model):
2230
2768
  if new_name == "token_embd.weight":
2231
2769
  logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2232
2770
  self.gguf_writer.add_tensor("output.weight", data)
2771
+ =======
2772
+ # note: GPT2 output is tied to (same as) wte in original model
2773
+ if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
2774
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
2775
+
2776
+ return tensors
2777
+ >>>>>>> uupstream/master
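GPT-2 checkpoints store c_attn/c_proj/c_fc as HF Conv1D layers, whose weight is the transpose of a normal nn.Linear weight, so the converter flips those tensors before name mapping; it also re-emits token_embd as output because GPT-2 ties the LM head to wte. A toy illustration of the transpose (shapes only, not real GPT-2 sizes):

```python
import torch

n_embd, n_out = 8, 24  # e.g. c_attn fuses q, k, v so n_out = 3 * n_embd

# HF Conv1D keeps the weight as (in_features, out_features) ...
conv1d_weight = torch.randn(n_embd, n_out)

# ... while GGUF/llama.cpp expect the nn.Linear layout (out_features, in_features)
linear_weight = conv1d_weight.transpose(1, 0)
assert linear_weight.shape == (n_out, n_embd)

# tied embeddings: the same tensor is emitted twice, once as token_embd.weight
# and once as output.weight, since GPT-2's lm_head reuses wte
```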
2233
2778
 
2234
2779
 
2235
2780
  @Model.register("PhiForCausalLM")
@@ -2269,7 +2814,12 @@ class Phi3MiniModel(Model):
2269
2814
  if not tokenizer_path.is_file():
2270
2815
  raise ValueError(f'Error: Missing {tokenizer_path}')
2271
2816
 
2817
+ <<<<<<< HEAD
2272
2818
  tokenizer = SentencePieceProcessor(str(tokenizer_path))
2819
+ =======
2820
+ tokenizer = SentencePieceProcessor()
2821
+ tokenizer.LoadFromFile(str(tokenizer_path))
2822
+ >>>>>>> uupstream/master
2273
2823
 
2274
2824
  vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
2275
2825
 
@@ -2279,6 +2829,7 @@ class Phi3MiniModel(Model):
2279
2829
 
2280
2830
  for token_id in range(tokenizer.vocab_size()):
2281
2831
 
2832
+ <<<<<<< HEAD
2282
2833
  piece = tokenizer.id_to_piece(token_id)
2283
2834
  text = piece.encode("utf-8")
2284
2835
  score = tokenizer.get_score(token_id)
@@ -2291,6 +2842,20 @@ class Phi3MiniModel(Model):
2291
2842
  elif tokenizer.is_unused(token_id):
2292
2843
  toktype = SentencePieceTokenTypes.UNUSED
2293
2844
  elif tokenizer.is_byte(token_id):
2845
+ =======
2846
+ piece = tokenizer.IdToPiece(token_id)
2847
+ text = piece.encode("utf-8")
2848
+ score = tokenizer.GetScore(token_id)
2849
+
2850
+ toktype = SentencePieceTokenTypes.NORMAL
2851
+ if tokenizer.IsUnknown(token_id):
2852
+ toktype = SentencePieceTokenTypes.UNKNOWN
2853
+ elif tokenizer.IsControl(token_id):
2854
+ toktype = SentencePieceTokenTypes.CONTROL
2855
+ elif tokenizer.IsUnused(token_id):
2856
+ toktype = SentencePieceTokenTypes.UNUSED
2857
+ elif tokenizer.IsByte(token_id):
2858
+ >>>>>>> uupstream/master
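The upstream side of this conflict switches from the snake_case SentencePieceProcessor helpers (id_to_piece, get_score, is_unknown, ...) to the CamelCase native API (IdToPiece, GetScore, IsUnknown, ...) and loads the model via LoadFromFile instead of the constructor. A hedged usage sketch, assuming a tokenizer.model file exists at the illustrative path:

```python
from sentencepiece import SentencePieceProcessor

tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile("tokenizer.model")  # path is illustrative

token_kinds: list[str] = []
for token_id in range(tokenizer.vocab_size()):
    piece = tokenizer.IdToPiece(token_id)   # textual piece for this id
    score = tokenizer.GetScore(token_id)    # log-probability style score
    if tokenizer.IsUnknown(token_id):
        token_kinds.append("unknown")
    elif tokenizer.IsControl(token_id):
        token_kinds.append("control")
    elif tokenizer.IsByte(token_id):
        token_kinds.append("byte")
    else:
        token_kinds.append("normal")
```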
2294
2859
  toktype = SentencePieceTokenTypes.BYTE
2295
2860
 
2296
2861
  tokens[token_id] = text
@@ -2312,6 +2877,41 @@ class Phi3MiniModel(Model):
2312
2877
  scores[token_id] = -1000.0
2313
2878
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2314
2879
 
2880
+ <<<<<<< HEAD
2881
+ =======
2882
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2883
+ if tokenizer_config_file.is_file():
2884
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2885
+ tokenizer_config_json = json.load(f)
2886
+ added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
2887
+ for token_id, foken_data in added_tokens_decoder.items():
2888
+ token_id = int(token_id)
2889
+ token = foken_data["content"].encode("utf-8")
2890
+ if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
2891
+ assert tokens[token_id] == token
2892
+ tokens[token_id] = token
2893
+ scores[token_id] = -1000.0
2894
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2895
+ if foken_data.get("special"):
2896
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2897
+
2898
+ tokenizer_file = self.dir_model / 'tokenizer.json'
2899
+ if tokenizer_file.is_file():
2900
+ with open(tokenizer_file, "r", encoding="utf-8") as f:
2901
+ tokenizer_json = json.load(f)
2902
+ added_tokens = tokenizer_json.get("added_tokens", [])
2903
+ for foken_data in added_tokens:
2904
+ token_id = int(foken_data["id"])
2905
+ token = foken_data["content"].encode("utf-8")
2906
+ if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
2907
+ assert tokens[token_id] == token
2908
+ tokens[token_id] = token
2909
+ scores[token_id] = -1000.0
2910
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2911
+ if foken_data.get("special"):
2912
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2913
+
2914
+ >>>>>>> uupstream/master
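The added block above layers tokens from tokenizer_config.json (added_tokens_decoder) and tokenizer.json (added_tokens) on top of the SentencePiece vocabulary, marking them USER_DEFINED or CONTROL. A minimal sketch of reading those two files (file names as in the hunk; the surrounding tokens/scores/toktypes lists are assumed to already exist):

```python
import json
from pathlib import Path

dir_model = Path(".")  # illustrative model directory

added: dict[int, tuple[bytes, bool]] = {}

cfg_path = dir_model / "tokenizer_config.json"
if cfg_path.is_file():
    with open(cfg_path, encoding="utf-8") as f:
        for token_id, entry in json.load(f).get("added_tokens_decoder", {}).items():
            added[int(token_id)] = (entry["content"].encode("utf-8"), bool(entry.get("special")))

tok_path = dir_model / "tokenizer.json"
if tok_path.is_file():
    with open(tok_path, encoding="utf-8") as f:
        for entry in json.load(f).get("added_tokens", []):
            added[int(entry["id"])] = (entry["content"].encode("utf-8"), bool(entry.get("special")))

# each (token_id, (content, special)) pair then overwrites the base vocab entry,
# using USER_DEFINED for plain added tokens and CONTROL for special ones
```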
2315
2915
  self.gguf_writer.add_tokenizer_model("llama")
2316
2916
  self.gguf_writer.add_tokenizer_pre("default")
2317
2917
  self.gguf_writer.add_token_list(tokens)
@@ -2324,6 +2924,7 @@ class Phi3MiniModel(Model):
2324
2924
  def set_gguf_parameters(self):
2325
2925
  block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
2326
2926
 
2927
+ <<<<<<< HEAD
2327
2928
  rot_pct = 1.0
2328
2929
  n_embd = self.find_hparam(["hidden_size", "n_embd"])
2329
2930
  n_head = self.find_hparam(["num_attention_heads", "n_head"])
@@ -2341,6 +2942,61 @@ class Phi3MiniModel(Model):
2341
2942
  self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
2342
2943
  self.gguf_writer.add_file_type(self.ftype)
2343
2944
 
2945
+ =======
2946
+ n_embd = self.find_hparam(["hidden_size", "n_embd"])
2947
+ n_head = self.find_hparam(["num_attention_heads", "n_head"])
2948
+ n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
2949
+ rms_eps = self.find_hparam(["rms_norm_eps"])
2950
+ max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
2951
+ orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
2952
+ rope_dims = n_embd // n_head
2953
+
2954
+ self.gguf_writer.add_name("Phi3")
2955
+ self.gguf_writer.add_context_length(max_pos_embds)
2956
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
2957
+ self.gguf_writer.add_embedding_length(n_embd)
2958
+ self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
2959
+ self.gguf_writer.add_block_count(block_count)
2960
+ self.gguf_writer.add_head_count(n_head)
2961
+ self.gguf_writer.add_head_count_kv(n_head_kv)
2962
+ self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
2963
+ self.gguf_writer.add_rope_dimension_count(rope_dims)
2964
+ self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
2965
+ self.gguf_writer.add_file_type(self.ftype)
2966
+
2967
+ # write rope scaling for long context (128k) model
2968
+ rope_scaling = self.find_hparam(['rope_scaling'], True)
2969
+ if (rope_scaling is None):
2970
+ return
2971
+
2972
+ scale = max_pos_embds / orig_max_pos_embds
2973
+
2974
+ rope_scaling_type = rope_scaling.get('type', '').lower()
2975
+ if len(rope_scaling_type) == 0:
2976
+ raise KeyError('Missing the required key rope_scaling.type')
2977
+
2978
+ if rope_scaling_type == 'su':
2979
+ attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
2980
+ elif rope_scaling_type == 'yarn':
2981
+ attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
2982
+ else:
2983
+ raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
2984
+
2985
+ self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
2986
+
2987
+ long_factors = rope_scaling.get('long_factor', None)
2988
+ short_factors = rope_scaling.get('short_factor', None)
2989
+
2990
+ if long_factors is None or short_factors is None:
2991
+ raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
2992
+
2993
+ if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
2994
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
2995
+
2996
+ self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
2997
+ self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
2998
+
2999
+ >>>>>>> uupstream/master
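For long-context Phi-3 checkpoints the hunk above derives an attention scaling factor from the ratio between the extended and the original context length, with one formula for the 'su' (LongRoPE-style) variant and one for 'yarn'. The same arithmetic as a standalone sketch:

```python
import math

def rope_attn_factor(rope_type: str, max_pos: int, orig_max_pos: int) -> float:
    scale = max_pos / orig_max_pos
    if scale <= 1.0:
        return 1.0
    if rope_type == "su":        # LongRoPE-style scaling
        return math.sqrt(1 + math.log(scale) / math.log(orig_max_pos))
    if rope_type == "yarn":
        return 0.1 * math.log(scale) + 1.0
    raise NotImplementedError(f"The rope scaling type {rope_type} is not supported yet")

# e.g. a 128k-context model trained with a 4k original window
print(rope_attn_factor("su", 131072, 4096))   # ~1.19
```

The long_factor/short_factor lists are then written as extra rope-factor tensors, which is why their length must equal rope_dims / 2.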
2344
3000
 
2345
3001
  @Model.register("PlamoForCausalLM")
2346
3002
  class PlamoModel(Model):
@@ -2361,6 +3017,7 @@ class PlamoModel(Model):
2361
3017
  self.gguf_writer.add_head_count(hparams["num_attention_heads"])
2362
3018
  self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
2363
3019
  self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
3020
+ self.gguf_writer.add_file_type(self.ftype)
2364
3021
 
2365
3022
  def shuffle_attn_q_weight(self, data_torch):
2366
3023
  assert data_torch.size() == (5120, 5120)
@@ -2376,14 +3033,12 @@ class PlamoModel(Model):
2376
3033
  data_torch = torch.reshape(data_torch, (5120, 5120))
2377
3034
  return data_torch
2378
3035
 
2379
- def write_tensors(self):
2380
- block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
2381
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
3036
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3037
+ del bid # unused
2382
3038
 
2383
- for name, data_torch in self.get_tensors():
2384
- if "self_attn.rotary_emb.inv_freq" in name:
2385
- continue
3039
+ new_name = self.map_tensor_name(name)
2386
3040
 
3041
+ <<<<<<< HEAD
2387
3042
  # map tensor names
2388
3043
  new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2389
3044
  if new_name is None:
@@ -2421,6 +3076,15 @@ class PlamoModel(Model):
2421
3076
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2422
3077
 
2423
3078
  self.gguf_writer.add_tensor(new_name, data)
3079
+ =======
3080
+ # shuffle for broadcasting of gqa in ggml_mul_mat
3081
+ if new_name.endswith("attn_q.weight"):
3082
+ data_torch = self.shuffle_attn_q_weight(data_torch)
3083
+ elif new_name.endswith("attn_output.weight"):
3084
+ data_torch = self.shuffle_attn_output_weight(data_torch)
3085
+
3086
+ return [(new_name, data_torch)]
3087
+ >>>>>>> uupstream/master
2424
3088
 
2425
3089
 
2426
3090
  @Model.register("CodeShellForCausalLM")
@@ -2443,24 +3107,17 @@ class CodeShellModel(Model):
2443
3107
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2444
3108
  self.gguf_writer.add_rope_scaling_factor(1.0)
2445
3109
 
2446
- def write_tensors(self):
2447
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
2448
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2449
- tensors = dict(self.get_tensors())
2450
- has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
2451
- for name, data_torch in tensors.items():
2452
- # we don't need these
2453
- if name.endswith((".attn.rotary_emb.inv_freq")):
2454
- continue
3110
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3111
+ del bid # unused
2455
3112
 
2456
- old_dtype = data_torch.dtype
3113
+ new_name = self.map_tensor_name(name)
2457
3114
 
2458
- # convert any unsupported data types to float32
2459
- if data_torch.dtype not in (torch.float16, torch.float32):
2460
- data_torch = data_torch.to(torch.float32)
3115
+ tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
2461
3116
 
2462
- data = data_torch.squeeze().numpy()
3117
+ if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
3118
+ assert self.tensor_names is not None
2463
3119
 
3120
+ <<<<<<< HEAD
2464
3121
  # map tensor names
2465
3122
  new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2466
3123
  if new_name is None:
@@ -2488,6 +3145,13 @@ class CodeShellModel(Model):
2488
3145
  if not has_lm_head and name == "transformer.wte.weight":
2489
3146
  self.gguf_writer.add_tensor("output.weight", data)
2490
3147
  logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
3148
+ =======
3149
+ if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
3150
+ # copy tok_embd.weight to output.weight
3151
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
3152
+
3153
+ return tensors
3154
+ >>>>>>> uupstream/master
2491
3155
 
2492
3156
 
2493
3157
  @Model.register("InternLM2ForCausalLM")
@@ -2516,27 +3180,34 @@ class InternLM2Model(Model):
2516
3180
  sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
2517
3181
  add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
2518
3182
 
2519
- tokenizer = SentencePieceProcessor(str(tokenizer_path))
3183
+ tokenizer = SentencePieceProcessor()
3184
+ tokenizer.LoadFromFile(str(tokenizer_path))
3185
+
2520
3186
  vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
2521
3187
 
2522
3188
  for token_id in range(vocab_size):
2523
- piece = tokenizer.id_to_piece(token_id)
3189
+ piece = tokenizer.IdToPiece(token_id)
2524
3190
  text = piece.encode("utf-8")
2525
- score = tokenizer.get_score(token_id)
3191
+ score = tokenizer.GetScore(token_id)
2526
3192
  if text == b"\x00":
2527
3193
  # (TODO): fixme
2528
3194
  # Hack here and replace the \x00 characters.
3195
+ <<<<<<< HEAD
2529
3196
  logger.debug(f"InternLM2 convert token '{text}' to '🐉'!")
2530
3197
  text = "🐉"
3198
+ =======
3199
+ logger.warning(f"InternLM2 convert token '{text}' to '🐉'!")
3200
+ text = "🐉".encode("utf-8")
3201
+ >>>>>>> uupstream/master
2531
3202
 
2532
3203
  toktype = SentencePieceTokenTypes.NORMAL
2533
- if tokenizer.is_unknown(token_id):
3204
+ if tokenizer.IsUnknown(token_id):
2534
3205
  toktype = SentencePieceTokenTypes.UNKNOWN
2535
- elif tokenizer.is_control(token_id):
3206
+ elif tokenizer.IsControl(token_id):
2536
3207
  toktype = SentencePieceTokenTypes.CONTROL
2537
- elif tokenizer.is_unused(token_id):
3208
+ elif tokenizer.IsUnused(token_id):
2538
3209
  toktype = SentencePieceTokenTypes.UNUSED
2539
- elif tokenizer.is_byte(token_id):
3210
+ elif tokenizer.IsByte(token_id):
2540
3211
  toktype = SentencePieceTokenTypes.BYTE
2541
3212
 
2542
3213
  tokens.append(text)
@@ -2573,13 +3244,15 @@ in chat mode so that the conversation can end normally.")
2573
3244
  special_vocab.add_to_gguf(self.gguf_writer)
2574
3245
 
2575
3246
  def _try_get_sft_eos(self, tokenizer):
2576
- unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]')
2577
- im_end_list = tokenizer.encode('<|im_end|>')
3247
+ unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
3248
+ im_end_list = tokenizer.Encode('<|im_end|>')
3249
+ eos_token = None
2578
3250
  assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
2579
3251
  if len(unused_145_list) == 1:
2580
3252
  eos_token = unused_145_list[0]
2581
3253
  if len(im_end_list) == 1:
2582
3254
  eos_token = im_end_list[0]
3255
+ assert eos_token
2583
3256
  return eos_token
2584
3257
 
2585
3258
  def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
@@ -2599,7 +3272,9 @@ in chat mode so that the conversation can end normally.")
2599
3272
  self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
2600
3273
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
2601
3274
  self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
3275
+ self.gguf_writer.add_file_type(self.ftype)
2602
3276
 
3277
+ <<<<<<< HEAD
2603
3278
  def post_write_tensors(self, tensor_map, name, data_torch):
2604
3279
  old_dtype = data_torch.dtype
2605
3280
 
@@ -2638,33 +3313,38 @@ in chat mode so that the conversation can end normally.")
2638
3313
  num_heads = self.hparams.get("num_attention_heads")
2639
3314
  num_kv_heads = self.hparams.get("num_key_value_heads")
2640
3315
  hidden_size = self.hparams.get("hidden_size")
3316
+ =======
3317
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3318
+ num_heads = self.hparams["num_attention_heads"]
3319
+ num_kv_heads = self.hparams["num_key_value_heads"]
3320
+ hidden_size = self.hparams["hidden_size"]
3321
+ >>>>>>> uupstream/master
2641
3322
  q_per_kv = num_heads // num_kv_heads
2642
3323
  head_dim = hidden_size // num_heads
2643
3324
  num_groups = num_heads // q_per_kv
2644
3325
 
2645
- block_count = self.hparams["num_hidden_layers"]
2646
- model_kv = dict(self.get_tensors())
2647
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2648
3326
  qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
2649
- for name, data_torch in model_kv.items():
2650
- # we don't need these
2651
- if name.endswith(".rotary_emb.inv_freq"):
2652
- continue
2653
3327
 
2654
- if re.match(qkv_pattern, name):
2655
- bid = re.findall(qkv_pattern, name)[0]
2656
- qkv = data_torch
2657
- qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
2658
- q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
2659
- # The model weights of q and k equire additional reshape.
2660
- q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
2661
- k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
2662
- v = rearrange(v, " o g n i -> o (g n i)").T
2663
- self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
2664
- self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
2665
- self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
2666
- else:
2667
- self.post_write_tensors(tensor_map, name, data_torch)
3328
+ if re.match(qkv_pattern, name):
3329
+ bid = re.findall(qkv_pattern, name)[0]
3330
+ qkv = data_torch
3331
+ # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
3332
+ qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
3333
+ q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
3334
+ # The model weights of q and k require an additional reshape.
3335
+ # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
3336
+ q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
3337
+ # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
3338
+ k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
3339
+ # v = rearrange(v, " o g n i -> o (g n i)").T
3340
+ v = v.reshape((v.shape[0], -1)).T
3341
+ return [
3342
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
3343
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
3344
+ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v),
3345
+ ]
3346
+ else:
3347
+ return [(self.map_tensor_name(name), data_torch)]
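The InternLM2 path above replaces the einops `rearrange` calls with plain reshapes to split the fused wqkv weight into q, k and v per head group. A toy version of that split (head counts and dims are made up; the converter afterwards applies its `_hf_permute_qk` helper to q and k, which is not reproduced here):

```python
import torch

num_heads, num_kv_heads, head_dim = 8, 2, 4
hidden_size = num_heads * head_dim
q_per_kv = num_heads // num_kv_heads            # query heads sharing one kv head
num_groups = num_heads // q_per_kv              # == num_kv_heads

# fused wqkv weight: every group holds q_per_kv query heads plus one k and one v head
wqkv = torch.randn((q_per_kv + 2) * num_groups * head_dim, hidden_size)

qkv = wqkv.T.reshape(-1, num_groups, q_per_kv + 2, head_dim)
q = qkv[..., :q_per_kv, :]
k = qkv[..., q_per_kv: q_per_kv + 1, :]
v = qkv[..., q_per_kv + 1: q_per_kv + 2, :]

# flatten the group/head axes back into 2-D weights
q = q.reshape(q.shape[0], -1).T
k = k.reshape(k.shape[0], -1).T
v = v.reshape(v.shape[0], -1).T
assert q.shape == (num_heads * head_dim, hidden_size)
assert k.shape == (num_kv_heads * head_dim, hidden_size)
```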
2668
3348
 
2669
3349
 
2670
3350
  @Model.register("BertModel", "CamembertModel")
@@ -2729,14 +3409,10 @@ class BertModel(Model):
2729
3409
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2730
3410
  special_vocab.add_to_gguf(self.gguf_writer)
2731
3411
 
2732
- def write_tensors(self):
2733
- tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
2734
- tensors = dict(self.get_tensors())
2735
- for name, data_torch in tensors.items():
2736
- # we are only using BERT for embeddings so we don't need the pooling layer
2737
- if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
2738
- continue # we don't need these
3412
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3413
+ del bid # unused
2739
3414
 
3415
+ <<<<<<< HEAD
2740
3416
  # map tensor names
2741
3417
  new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2742
3418
  if new_name is None:
@@ -2766,6 +3442,13 @@ class BertModel(Model):
2766
3442
  data = data.astype(new_dtype)
2767
3443
 
2768
3444
  self.gguf_writer.add_tensor(new_name, data)
3445
+ =======
3446
+ # we are only using BERT for embeddings so we don't need the pooling layer
3447
+ if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
3448
+ return [] # we don't need these
3449
+
3450
+ return [(self.map_tensor_name(name), data_torch)]
3451
+ >>>>>>> uupstream/master
2769
3452
 
2770
3453
 
2771
3454
  @Model.register("NomicBertModel")
@@ -2831,10 +3514,10 @@ class GemmaModel(Model):
2831
3514
  self.gguf_writer.add_value_length(hparams["head_dim"])
2832
3515
  self.gguf_writer.add_file_type(self.ftype)
2833
3516
 
2834
- def write_tensors(self):
2835
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
2836
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
3517
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3518
+ del bid # unused
2837
3519
 
3520
+ <<<<<<< HEAD
2838
3521
  for name, data_torch in self.get_tensors():
2839
3522
  # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
2840
3523
  # To prevent errors, skip loading lm_head.weight.
@@ -2843,11 +3526,19 @@ class GemmaModel(Model):
2843
3526
  continue
2844
3527
 
2845
3528
  old_dtype = data_torch.dtype
2846
-
2847
- # convert any unsupported data types to float32
2848
- if data_torch.dtype not in (torch.float16, torch.float32):
2849
- data_torch = data_torch.to(torch.float32)
2850
-
3529
+ =======
3530
+ # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
3531
+ # To prevent errors, skip loading lm_head.weight.
3532
+ if name == "lm_head.weight":
3533
+ logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
3534
+ return []
3535
+ >>>>>>> uupstream/master
3536
+
3537
+ # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
3538
+ if name.endswith("norm.weight"):
3539
+ data_torch = data_torch + 1
3540
+
3541
+ <<<<<<< HEAD
2851
3542
  # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
2852
3543
  if name.endswith("norm.weight"):
2853
3544
  data_torch = data_torch + 1
@@ -2870,6 +3561,9 @@ class GemmaModel(Model):
2870
3561
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2871
3562
 
2872
3563
  self.gguf_writer.add_tensor(new_name, data)
3564
+ =======
3565
+ return [(self.map_tensor_name(name), data_torch)]
3566
+ >>>>>>> uupstream/master
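Gemma checkpoints store RMSNorm weights as (w - 1) and add the 1 back at runtime, so the converter adds 1 to every *norm.weight tensor before writing it (see the modeling_gemma.py reference in the hunk). A one-liner to make the effect concrete:

```python
import torch

# a Gemma norm weight of all zeros is really an identity scale of all ones
norm_weight = torch.zeros(4)
converted = norm_weight + 1
assert torch.equal(converted, torch.ones(4))
```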
2873
3567
 
2874
3568
 
2875
3569
  @Model.register("Starcoder2ForCausalLM")
@@ -2892,6 +3586,8 @@ class MambaModel(Model):
2892
3586
 
2893
3587
  if (self.dir_model / "tokenizer.json").is_file():
2894
3588
  self._set_vocab_gpt2()
3589
+ elif (self.dir_model / "tokenizer.model").is_file():
3590
+ self._set_vocab_sentencepiece()
2895
3591
  else:
2896
3592
  # Use the GPT-NeoX tokenizer when no tokenizer files are present
2897
3593
  tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
@@ -2899,28 +3595,48 @@ class MambaModel(Model):
2899
3595
  neox_reader = gguf.GGUFReader(tokenizer_path, "r")
2900
3596
 
2901
3597
  field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
3598
+ <<<<<<< HEAD
2902
3599
  self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
2903
3600
 
2904
3601
  field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
2905
3602
  self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
3603
+ =======
3604
+ self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
3605
+
3606
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
3607
+ self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
3608
+ >>>>>>> uupstream/master
2906
3609
 
2907
3610
  field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
3611
+ assert field
2908
3612
  self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
2909
3613
 
2910
3614
  field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
3615
+ assert field
2911
3616
  self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
2912
3617
 
2913
3618
  field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
3619
+ assert field
2914
3620
  self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
2915
3621
 
2916
3622
  field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
3623
+ <<<<<<< HEAD
2917
3624
  self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
2918
3625
 
2919
3626
  field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
2920
3627
  self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
3628
+ =======
3629
+ self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
3630
+
3631
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
3632
+ self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
3633
+ >>>>>>> uupstream/master
2921
3634
 
2922
3635
  field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
2923
- self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
3636
+ self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
3637
+
3638
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
3639
+ self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
2924
3640
 
2925
3641
  def set_gguf_parameters(self):
2926
3642
  d_model = self.find_hparam(["hidden_size", "d_model"])
@@ -2949,21 +3665,17 @@ class MambaModel(Model):
2949
3665
  self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
2950
3666
  self.gguf_writer.add_file_type(self.ftype)
2951
3667
 
2952
- def write_tensors(self):
2953
- block_count = self.hparams["n_layer"]
2954
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
3668
+ _tok_embd = None
2955
3669
 
2956
- tok_embd = None
2957
- tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
2958
- output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
3670
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3671
+ del bid # unused
2959
3672
 
2960
- for name, data_torch in self.get_tensors():
2961
- old_dtype = data_torch.dtype
3673
+ output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
3674
+ tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
2962
3675
 
2963
- # convert any unsupported data types to float32
2964
- if data_torch.dtype not in (torch.float16, torch.float32):
2965
- data_torch = data_torch.to(torch.float32)
3676
+ new_name = self.map_tensor_name(name)
2966
3677
 
3678
+ <<<<<<< HEAD
2967
3679
  # map tensor names
2968
3680
  new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2969
3681
  if new_name is None:
@@ -2980,9 +3692,26 @@ class MambaModel(Model):
2980
3692
  continue
2981
3693
  if new_name == tok_embd_name:
2982
3694
  tok_embd = data_torch
2983
-
2984
- data = data_torch.squeeze().numpy()
2985
-
3695
+ =======
3696
+ if name.endswith(".A_log"):
3697
+ logger.debug("A_log --> A ==> " + new_name)
3698
+ data_torch = -torch.exp(data_torch)
3699
+
3700
+ # assuming token_embd.weight is seen before output.weight
3701
+ if self._tok_embd is not None and new_name == output_name:
3702
+ if torch.equal(self._tok_embd, data_torch):
3703
+ logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
3704
+ return []
3705
+ elif new_name == tok_embd_name:
3706
+ self._tok_embd = data_torch
3707
+
3708
+ return [(new_name, data_torch)]
3709
+ >>>>>>> uupstream/master
3710
+
3711
+ def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
3712
+ del n_dims # unused
3713
+
3714
+ <<<<<<< HEAD
2986
3715
  n_dims = len(data.shape)
2987
3716
  data_dtype = data.dtype
2988
3717
 
@@ -3002,6 +3731,17 @@ class MambaModel(Model):
3002
3731
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
3003
3732
 
3004
3733
  self.gguf_writer.add_tensor(new_name, data)
3734
+ =======
3735
+ return bid is not None and new_name in (
3736
+ self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
3737
+ gguf.MODEL_TENSOR.SSM_CONV1D,
3738
+ gguf.MODEL_TENSOR.SSM_X,
3739
+ gguf.MODEL_TENSOR.SSM_DT,
3740
+ gguf.MODEL_TENSOR.SSM_A,
3741
+ gguf.MODEL_TENSOR.SSM_D,
3742
+ ]
3743
+ )
3744
+ >>>>>>> uupstream/master
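For Mamba the upstream branch converts the stored A_log parameter into the actual (negative) state matrix A = -exp(A_log), drops output.weight when it is identical to token_embd.weight, and keeps the small SSM tensors in f32 via extra_f32_tensors. The A_log transform in isolation:

```python
import torch

# toy A_log as stored in a Mamba checkpoint (d_inner x d_state, values arbitrary)
a_log = torch.randn(6, 4)

# llama.cpp expects the real SSM matrix, which is always negative
a = -torch.exp(a_log)
assert (a < 0).all()
```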
3005
3745
 
3006
3746
 
3007
3747
  @Model.register("CohereForCausalLM")
@@ -3013,7 +3753,8 @@ class CommandR2Model(Model):
3013
3753
 
3014
3754
  # max_position_embeddings = 8192 in config.json but model was actually
3015
3755
  # trained on 128k context length
3016
- self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
3756
+ # aya-23 models don't have model_max_length specified
3757
+ self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
3017
3758
 
3018
3759
  def set_gguf_parameters(self):
3019
3760
  super().set_gguf_parameters()
@@ -3035,6 +3776,7 @@ class OlmoModel(Model):
3035
3776
 
3036
3777
  # Same as super class, but permuting q_proj, k_proj
3037
3778
  # Copied from: LlamaModel
3779
+ <<<<<<< HEAD
3038
3780
  def write_tensors(self):
3039
3781
  block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
3040
3782
  tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
@@ -3079,11 +3821,252 @@ class OlmoModel(Model):
3079
3821
  logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
3080
3822
 
3081
3823
  self.gguf_writer.add_tensor(new_name, data)
3824
+ =======
3825
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3826
+ del bid # unused
3827
+
3828
+ n_head = self.hparams["num_attention_heads"]
3829
+ n_kv_head = self.hparams.get("num_key_value_heads")
3830
+
3831
+ if name.endswith("q_proj.weight"):
3832
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
3833
+ if name.endswith("k_proj.weight"):
3834
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
3835
+
3836
+ return [(self.map_tensor_name(name), data_torch)]
3837
+
3838
+
3839
+ @Model.register("JinaBertModel", "JinaBertForMaskedLM")
3840
+ class JinaBertV2Model(BertModel):
3841
+ model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
3842
+
3843
+ def __init__(self, *args, **kwargs):
3844
+ super().__init__(*args, **kwargs)
3845
+ self.intermediate_size = self.hparams["intermediate_size"]
3846
+
3847
+ def get_tensors(self):
3848
+ for name, data in super().get_tensors():
3849
+ if 'gated_layers' in name:
3850
+ d1 = data[:self.intermediate_size, :]
3851
+ name1 = name.replace('gated_layers', 'gated_layers_w')
3852
+ d2 = data[self.intermediate_size:, :]
3853
+ name2 = name.replace('gated_layers', 'gated_layers_v')
3854
+ yield name1, d1
3855
+ yield name2, d2
3856
+ continue
3857
+
3858
+ yield name, data
3859
+
3860
+ def set_vocab(self, *args, **kwargs):
3861
+ tokenizer_class = 'BertTokenizer'
3862
+ with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
3863
+ tokenizer_class = json.load(f)['tokenizer_class']
3864
+
3865
+ if tokenizer_class == 'BertTokenizer':
3866
+ super().set_vocab()
3867
+ elif tokenizer_class == 'RobertaTokenizer':
3868
+ self._set_vocab_gpt2()
3869
+ self.gguf_writer.add_token_type_count(2)
3870
+ else:
3871
+ raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
3872
+ self.gguf_writer.add_add_bos_token(True)
3873
+ self.gguf_writer.add_add_eos_token(True)
3874
+
3875
+
3876
+ @Model.register("ArcticForCausalLM")
3877
+ class ArcticModel(Model):
3878
+ model_arch = gguf.MODEL_ARCH.ARCTIC
3879
+
3880
+ def set_vocab(self):
3881
+ # The reason for using a custom implementation here is that the
3882
+ # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
3883
+ # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
3884
+ from sentencepiece import SentencePieceProcessor
3885
+
3886
+ tokenizer_path = self.dir_model / 'tokenizer.model'
3887
+
3888
+ if not tokenizer_path.is_file():
3889
+ logger.error(f'Error: Missing {tokenizer_path}')
3890
+ sys.exit(1)
3891
+
3892
+ # Read the whole vocabulary from the tokenizer.model file
3893
+ tokenizer = SentencePieceProcessor()
3894
+ tokenizer.LoadFromFile(str(tokenizer_path))
3895
+
3896
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
3897
+
3898
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
3899
+ scores: list[float] = [-10000.0] * vocab_size
3900
+ toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
3901
+
3902
+ for token_id in range(tokenizer.vocab_size()):
3903
+
3904
+ piece = tokenizer.IdToPiece(token_id)
3905
+ text = piece.encode("utf-8")
3906
+ score = tokenizer.GetScore(token_id)
3907
+
3908
+ toktype = SentencePieceTokenTypes.NORMAL
3909
+ if tokenizer.IsUnknown(token_id):
3910
+ toktype = SentencePieceTokenTypes.UNKNOWN
3911
+ elif tokenizer.IsControl(token_id):
3912
+ toktype = SentencePieceTokenTypes.CONTROL
3913
+ elif tokenizer.IsUnused(token_id):
3914
+ toktype = SentencePieceTokenTypes.UNUSED
3915
+ elif tokenizer.IsByte(token_id):
3916
+ toktype = SentencePieceTokenTypes.BYTE
3917
+
3918
+ tokens[token_id] = text
3919
+ scores[token_id] = score
3920
+ toktypes[token_id] = toktype
3921
+
3922
+ # Use the added_tokens_decoder field from tokeniser_config.json as the source
3923
+ # of information about added/redefined tokens and modify them accordingly.
3924
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
3925
+ if tokenizer_config_file.is_file():
3926
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
3927
+ tokenizer_config_json = json.load(f)
3928
+
3929
+ if "added_tokens_decoder" in tokenizer_config_json:
3930
+ added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
3931
+ for token_id, token_json in added_tokens_decoder.items():
3932
+ token_id = int(token_id)
3933
+ if (token_id >= vocab_size):
3934
+ logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
3935
+ continue
3936
+
3937
+ token_content = token_json["content"]
3938
+ token_type = SentencePieceTokenTypes.USER_DEFINED
3939
+ token_score = -10000.0
3940
+
3941
+ # Map unk_token to UNKNOWN, other special tokens to CONTROL
3942
+ # Set the score to 0.0 as in the original tokenizer.model
3943
+ if ("special" in token_json) and token_json["special"]:
3944
+ if token_content == tokenizer_config_json["unk_token"]:
3945
+ token_type = SentencePieceTokenTypes.UNKNOWN
3946
+ else:
3947
+ token_type = SentencePieceTokenTypes.CONTROL
3948
+ token_score = 0.0
3949
+
3950
+ logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
3951
+ tokens[token_id] = token_content.encode("utf-8")
3952
+ toktypes[token_id] = token_type
3953
+ scores[token_id] = token_score
3954
+
3955
+ self.gguf_writer.add_tokenizer_model("llama")
3956
+ self.gguf_writer.add_tokenizer_pre("default")
3957
+ self.gguf_writer.add_token_list(tokens)
3958
+ self.gguf_writer.add_token_scores(scores)
3959
+ self.gguf_writer.add_token_types(toktypes)
3960
+
3961
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3962
+ special_vocab.add_to_gguf(self.gguf_writer)
3963
+
3964
+ def set_gguf_parameters(self):
3965
+ super().set_gguf_parameters()
3966
+ hparams = self.hparams
3967
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3968
+ self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
3969
+
3970
+ _experts: list[dict[str, Tensor]] | None = None
3971
+
3972
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3973
+ n_head = self.hparams["num_attention_heads"]
3974
+ n_kv_head = self.hparams.get("num_key_value_heads")
3975
+
3976
+ if name.endswith("q_proj.weight"):
3977
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
3978
+ if name.endswith("k_proj.weight"):
3979
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
3980
+
3981
+ # process the experts separately
3982
+ if name.find("block_sparse_moe.experts") != -1:
3983
+ n_experts = self.hparams["num_local_experts"]
3984
+
3985
+ assert bid is not None
3986
+
3987
+ if self._experts is None:
3988
+ self._experts = [{} for _ in range(self.block_count)]
3989
+
3990
+ self._experts[bid][name] = data_torch
3991
+
3992
+ if len(self._experts[bid]) >= n_experts * 3:
3993
+ tensors: list[tuple[str, Tensor]] = []
3994
+
3995
+ # merge the experts into a single 3d tensor
3996
+ for wid in ["w1", "w2", "w3"]:
3997
+ datas: list[Tensor] = []
3998
+
3999
+ for xid in range(n_experts):
4000
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
4001
+ datas.append(self._experts[bid][ename])
4002
+ del self._experts[bid][ename]
4003
+
4004
+ data_torch = torch.stack(datas, dim=0)
4005
+
4006
+ merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
4007
+
4008
+ new_name = self.map_tensor_name(merged_name)
4009
+
4010
+ tensors.append((new_name, data_torch))
4011
+ return tensors
4012
+ else:
4013
+ return []
4014
+
4015
+ return [(self.map_tensor_name(name), data_torch)]
4016
+
4017
+ def write_tensors(self):
4018
+ super().write_tensors()
4019
+
4020
+ if self._experts is not None:
4021
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
4022
+ experts = [k for d in self._experts for k in d.keys()]
4023
+ if len(experts) > 0:
4024
+ raise ValueError(f"Unprocessed experts: {experts}")
4025
+ >>>>>>> uupstream/master
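Among the classes added above, JinaBertV2Model splits each fused gated_layers weight into separate gated_layers_w and gated_layers_v halves along the first dimension before handing it on, while ArcticModel repeats the familiar q/k permutation and expert-stacking patterns. A toy version of the gated split (sizes are illustrative):

```python
import torch

intermediate_size, n_embd = 8, 4

# fused gate+up projection as stored in the Jina BERT v2 checkpoint
gated = torch.randn(2 * intermediate_size, n_embd)

w_half = gated[:intermediate_size, :]   # emitted as ...gated_layers_w
v_half = gated[intermediate_size:, :]   # emitted as ...gated_layers_v
assert w_half.shape == v_half.shape == (intermediate_size, n_embd)
```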
3082
4026
 
3083
4027
 
3084
4028
  ###### CONVERSION LOGIC ######
3085
4029
 
3086
4030
 
4031
+ # tree of lazy tensors
4032
+ class LazyTorchTensor(gguf.LazyBase):
4033
+ _tensor_type = torch.Tensor
4034
+ # to keep the type-checker happy
4035
+ dtype: torch.dtype
4036
+ shape: torch.Size
4037
+
4038
+ # only used when converting a torch.Tensor to a np.ndarray
4039
+ _dtype_map: dict[torch.dtype, type] = {
4040
+ torch.float16: np.float16,
4041
+ torch.float32: np.float32,
4042
+ }
4043
+
4044
+ def numpy(self) -> gguf.LazyNumpyTensor:
4045
+ dtype = self._dtype_map[self.dtype]
4046
+ return gguf.LazyNumpyTensor(
4047
+ meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
4048
+ lazy=self._lazy,
4049
+ args=(self,),
4050
+ func=(lambda s: s[0].numpy())
4051
+ )
4052
+
4053
+ @classmethod
4054
+ def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
4055
+ return torch.empty(size=shape, dtype=dtype, device="meta")
4056
+
4057
+ @classmethod
4058
+ def __torch_function__(cls, func, types, args=(), kwargs=None):
4059
+ del types # unused
4060
+
4061
+ if kwargs is None:
4062
+ kwargs = {}
4063
+
4064
+ if func is torch.Tensor.numpy:
4065
+ return args[0].numpy()
4066
+
4067
+ return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
4068
+
4069
+
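The LazyTorchTensor class added above lets the converter build the whole tensor-transformation graph on `meta` tensors and only materialize data when it is finally written (unless --no-lazy is passed). A rough illustration of the underlying idea, not of the gguf.LazyBase API itself (that lives in gguf-py/gguf/lazy.py):

```python
import torch

# 'meta' tensors carry dtype/shape but no storage, so shape arithmetic is free
meta = torch.empty((4096, 4096), dtype=torch.float16, device="meta")
stacked = torch.stack([meta, meta], dim=0)      # still no memory allocated
print(stacked.shape, stacked.dtype)             # torch.Size([2, 4096, 4096]) torch.float16

# a lazy wrapper records the function and arguments instead of running them,
# and replays them on the real tensors only when .numpy()/write time arrives
```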
3087
4070
  def parse_args() -> argparse.Namespace:
3088
4071
  parser = argparse.ArgumentParser(
3089
4072
  description="Convert a huggingface model to a GGML compatible file")
@@ -3093,23 +4076,46 @@ def parse_args() -> argparse.Namespace:
3093
4076
  )
3094
4077
  parser.add_argument(
3095
4078
  "--awq-path", type=Path, default=None,
3096
- help="Path to scale awq cache file")
4079
+ help="Path to scale awq cache file",
4080
+ )
3097
4081
  parser.add_argument(
3098
4082
  "--outfile", type=Path,
3099
- help="path to write to; default: based on input",
4083
+ help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
4084
+ )
4085
+ parser.add_argument(
4086
+ "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
4087
+ help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
3100
4088
  )
3101
4089
  parser.add_argument(
3102
- "--outtype", type=str, choices=["f32", "f16"], default="f16",
3103
- help="output format - use f32 for float32, f16 for float16",
4090
+ "--bigendian", action="store_true",
4091
+ help="model is executed on big endian machine",
3104
4092
  )
3105
- parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
3106
4093
  parser.add_argument(
3107
4094
  "model", type=Path,
3108
4095
  help="directory containing model file",
3109
4096
  )
4097
+ <<<<<<< HEAD
3110
4098
  parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
3111
4099
  parser.add_argument("--model-name", type=str, default=None, help="name of the model")
3112
4100
  parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
4101
+ =======
4102
+ parser.add_argument(
4103
+ "--use-temp-file", action="store_true",
4104
+ help="use the tempfile library while processing (helpful when running out of memory, process killed)",
4105
+ )
4106
+ parser.add_argument(
4107
+ "--no-lazy", action="store_true",
4108
+ help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
4109
+ )
4110
+ parser.add_argument(
4111
+ "--model-name", type=str, default=None,
4112
+ help="name of the model",
4113
+ )
4114
+ parser.add_argument(
4115
+ "--verbose", action="store_true",
4116
+ help="increase output verbosity",
4117
+ )
4118
+ >>>>>>> uupstream/master
3113
4119
 
3114
4120
  return parser.parse_args()
3115
4121
 
@@ -3138,16 +4144,19 @@ def main() -> None:
3138
4144
  logger.error(f'Error: {args.model} is not a directory')
3139
4145
  sys.exit(1)
3140
4146
 
3141
- ftype_map = {
3142
- "f32": gguf.GGMLQuantizationType.F32,
3143
- "f16": gguf.GGMLQuantizationType.F16,
4147
+ ftype_map: dict[str, gguf.LlamaFileType] = {
4148
+ "f32": gguf.LlamaFileType.ALL_F32,
4149
+ "f16": gguf.LlamaFileType.MOSTLY_F16,
4150
+ "bf16": gguf.LlamaFileType.MOSTLY_BF16,
4151
+ "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
4152
+ "auto": gguf.LlamaFileType.GUESSED,
3144
4153
  }
3145
4154
 
3146
4155
  if args.outfile is not None:
3147
4156
  fname_out = args.outfile
3148
4157
  else:
3149
4158
  # output in the same directory as the model by default
3150
- fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
4159
+ fname_out = dir_model / 'ggml-model-{ftype}.gguf'
3151
4160
 
3152
4161
  logger.info(f"Loading model: {dir_model.name}")
3153
4162
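main() now maps the --outtype strings onto gguf.LlamaFileType values and defaults the output name to 'ggml-model-{ftype}.gguf'; presumably the Model class substitutes the chosen type name into that template before writing. A hedged sketch of such a substitution (the actual formatting happens inside the converter, not here, and the paths are illustrative):

```python
from pathlib import Path

outtype = "q8_0"                      # one of f32, f16, bf16, q8_0, auto
dir_model = Path("my-model")          # illustrative model directory

fname_out = dir_model / "ggml-model-{ftype}.gguf"
resolved = fname_out.with_name(fname_out.name.format(ftype=outtype.upper()))
print(resolved)                       # my-model/ggml-model-Q8_0.gguf
```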
 
@@ -3155,7 +4164,11 @@ def main() -> None:
3155
4164
 
3156
4165
  with torch.inference_mode():
3157
4166
  model_class = Model.from_model_architecture(hparams["architectures"][0])
4167
+ <<<<<<< HEAD
3158
4168
  model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
4169
+ =======
4170
+ model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
4171
+ >>>>>>> uupstream/master
3159
4172
 
3160
4173
  logger.info("Set model parameters")
3161
4174
  model_instance.set_gguf_parameters()
@@ -3163,7 +4176,10 @@ def main() -> None:
3163
4176
  logger.info("Set model tokenizer")
3164
4177
  model_instance.set_vocab()
3165
4178
 
4179
+ model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
4180
+
3166
4181
  if args.vocab_only:
4182
+ <<<<<<< HEAD
3167
4183
  logger.info(f"Exporting model vocab to '{fname_out}'")
3168
4184
  model_instance.write_vocab()
3169
4185
  else:
@@ -3171,6 +4187,15 @@ def main() -> None:
3171
4187
  model_instance.write()
3172
4188
 
3173
4189
  logger.info(f"Model successfully exported to '{fname_out}'")
4190
+ =======
4191
+ logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
4192
+ model_instance.write_vocab()
4193
+ else:
4194
+ logger.info(f"Exporting model to '{model_instance.fname_out}'")
4195
+ model_instance.write()
4196
+
4197
+ logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
4198
+ >>>>>>> uupstream/master
3174
4199
 
3175
4200
 
3176
4201
  if __name__ == '__main__':