bigdl-core-cpp 2.1.0b20230202__py3-none-manylinux2010_x86_64.whl → 2.1.0b20240820.post1__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. bigdl/cpp/convert-hf-to-gguf.py +1169 -311
  2. bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
  3. bigdl/cpp/gguf-py/gguf/constants.py +463 -167
  4. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  5. bigdl/cpp/gguf-py/gguf/gguf_reader.py +29 -8
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +475 -156
  7. bigdl/cpp/gguf-py/gguf/lazy.py +24 -49
  8. bigdl/cpp/gguf-py/gguf/metadata.py +503 -0
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +209 -23
  10. bigdl/cpp/gguf-py/gguf/utility.py +69 -0
  11. bigdl/cpp/libs/baby-llama +0 -0
  12. bigdl/cpp/libs/batched +0 -0
  13. bigdl/cpp/libs/batched-bench +0 -0
  14. bigdl/cpp/libs/benchmark +0 -0
  15. bigdl/cpp/libs/embedding +0 -0
  16. bigdl/cpp/libs/gguf +0 -0
  17. bigdl/cpp/libs/imatrix +0 -0
  18. bigdl/cpp/libs/llama-bench +0 -0
  19. bigdl/cpp/libs/llava-cli +0 -0
  20. bigdl/cpp/libs/lookahead +0 -0
  21. bigdl/cpp/libs/lookup +0 -0
  22. bigdl/cpp/libs/ls-sycl-device +0 -0
  23. bigdl/cpp/libs/main +0 -0
  24. bigdl/cpp/libs/ollama +0 -0
  25. bigdl/cpp/libs/perplexity +0 -0
  26. bigdl/cpp/libs/quantize +0 -0
  27. bigdl/cpp/libs/quantize-stats +0 -0
  28. bigdl/cpp/libs/save-load-state +0 -0
  29. bigdl/cpp/libs/server +0 -0
  30. bigdl/cpp/libs/speculative +0 -0
  31. bigdl/cpp/libs/tokenize +0 -0
  32. {bigdl_core_cpp-2.1.0b20230202.dist-info → bigdl_core_cpp-2.1.0b20240820.post1.dist-info}/METADATA +8 -8
  33. bigdl_core_cpp-2.1.0b20240820.post1.dist-info/RECORD +45 -0
  34. {bigdl_core_cpp-2.1.0b20230202.dist-info → bigdl_core_cpp-2.1.0b20240820.post1.dist-info}/WHEEL +1 -1
  35. bigdl/cpp/libs/export-lora +0 -0
  36. bigdl/cpp/libs/finetune +0 -0
  37. bigdl/cpp/libs/gritlm +0 -0
  38. bigdl/cpp/libs/infill +0 -0
  39. bigdl/cpp/libs/parallel +0 -0
  40. bigdl/cpp/libs/simple +0 -0
  41. bigdl/cpp/libs/train-text-from-scratch +0 -0
  42. bigdl_core_cpp-2.1.0b20230202.dist-info/RECORD +0 -50
  43. {bigdl_core_cpp-2.1.0b20230202.data → bigdl_core_cpp-2.1.0b20240820.post1.data}/scripts/init-llama-cpp +0 -0
  44. {bigdl_core_cpp-2.1.0b20230202.data → bigdl_core_cpp-2.1.0b20240820.post1.data}/scripts/init-ollama +0 -0
  45. {bigdl_core_cpp-2.1.0b20230202.dist-info → bigdl_core_cpp-2.1.0b20240820.post1.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
2
3
 
3
4
  from __future__ import annotations
4
5
 
@@ -12,7 +13,7 @@ import sys
12
13
  from enum import IntEnum
13
14
  from pathlib import Path
14
15
  from hashlib import sha256
15
- from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
16
+ from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
16
17
 
17
18
  import math
18
19
  import numpy as np
@@ -25,10 +26,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
25
26
  sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
26
27
  import gguf
27
28
 
28
- from convert import LlamaHfVocab
29
-
30
- logger = logging.getLogger("hf-to-gguf")
31
-
32
29
  logger = logging.getLogger("hf-to-gguf")
33
30
 
34
31
 
@@ -50,7 +47,8 @@ class Model:
50
47
  _model_classes: dict[str, type[Model]] = {}
51
48
 
52
49
  dir_model: Path
53
- ftype: int
50
+ ftype: gguf.LlamaFileType
51
+ fname_out: Path
54
52
  is_big_endian: bool
55
53
  endianess: gguf.GGUFEndian
56
54
  use_temp_file: bool
@@ -61,29 +59,41 @@ class Model:
61
59
  block_count: int
62
60
  tensor_map: gguf.TensorNameMap
63
61
  tensor_names: set[str] | None
64
- fname_out: Path
65
62
  gguf_writer: gguf.GGUFWriter
63
+ model_name: str | None
64
+ metadata_override: Path | None
65
+ dir_model_card: Path
66
66
 
67
67
  # subclasses should define this!
68
68
  model_arch: gguf.MODEL_ARCH
69
69
 
70
- def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
70
+ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
71
+ use_temp_file: bool = False, eager: bool = False,
72
+ metadata_override: Path | None = None, model_name: str | None = None,
73
+ split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
71
74
  if type(self) is Model:
72
75
  raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
76
+
73
77
  self.dir_model = dir_model
74
78
  self.ftype = ftype
79
+ self.fname_out = fname_out
75
80
  self.is_big_endian = is_big_endian
76
81
  self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
77
82
  self.use_temp_file = use_temp_file
78
83
  self.lazy = not eager
79
- self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
84
+ self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
80
85
  self.is_safetensors = len(self.part_names) > 0
81
86
  if not self.is_safetensors:
82
- self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
87
+ self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
83
88
  self.hparams = Model.load_hparams(self.dir_model)
84
- self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
89
+ self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
85
90
  self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
86
91
  self.tensor_names = None
92
+ self.metadata_override = metadata_override
93
+ self.model_name = model_name
94
+ self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
95
+
96
+ # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
87
97
  if self.ftype == gguf.LlamaFileType.GUESSED:
88
98
  # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
89
99
  _, first_tensor = next(self.get_tensors())
@@ -93,11 +103,10 @@ class Model:
93
103
  else:
94
104
  logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
95
105
  self.ftype = gguf.LlamaFileType.MOSTLY_BF16
96
- ftype_up: str = self.ftype.name.partition("_")[2].upper()
97
- ftype_lw: str = ftype_up.lower()
98
- # allow templating the file name with the output ftype, useful with the "auto" ftype
99
- self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
100
- self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
106
+
107
+ # Configure GGUF Writer
108
+ self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
109
+ split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
101
110
 
102
111
  @classmethod
103
112
  def __init_subclass__(cls):
@@ -147,9 +156,16 @@ class Model:
147
156
  tensor_names_from_parts.update(model_part.keys())
148
157
 
149
158
  for name in model_part.keys():
150
- data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
151
- if self.lazy:
152
- data = LazyTorchTensor.from_eager(data)
159
+ if self.is_safetensors:
160
+ if self.lazy:
161
+ data = model_part.get_slice(name)
162
+ data = LazyTorchTensor.from_safetensors_slice(data)
163
+ else:
164
+ data = model_part.get_tensor(name)
165
+ else:
166
+ data = model_part[name]
167
+ if self.lazy:
168
+ data = LazyTorchTensor.from_eager(data)
153
169
  yield name, data
154
170
 
155
171
  # only verify tensor name presence; it doesn't matter if they are not in the right files
@@ -185,7 +201,6 @@ class Model:
185
201
  return new_name
186
202
 
187
203
  def set_gguf_parameters(self):
188
- self.gguf_writer.add_name(self.dir_model.name)
189
204
  self.gguf_writer.add_block_count(self.block_count)
190
205
 
191
206
  if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -224,6 +239,10 @@ class Model:
224
239
  self.gguf_writer.add_expert_used_count(n_experts_used)
225
240
  logger.info(f"gguf: experts used count = {n_experts_used}")
226
241
 
242
+ if (head_dim := self.hparams.get("head_dim")) is not None:
243
+ self.gguf_writer.add_key_length(head_dim)
244
+ self.gguf_writer.add_value_length(head_dim)
245
+
227
246
  self.gguf_writer.add_file_type(self.ftype)
228
247
  logger.info(f"gguf: file type = {self.ftype}")
229
248
 
@@ -242,7 +261,7 @@ class Model:
242
261
 
243
262
  return False
244
263
 
245
- def write_tensors(self):
264
+ def prepare_tensors(self):
246
265
  max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
247
266
 
248
267
  for name, data_torch in self.get_tensors():
@@ -264,7 +283,7 @@ class Model:
264
283
  break
265
284
 
266
285
  for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
267
- data: np.ndarray = data # type hint
286
+ data: np.ndarray # type hint
268
287
  n_dims = len(data.shape)
269
288
  data_dtype = data.dtype
270
289
  data_qtype: gguf.GGMLQuantizationType | None = None
@@ -325,23 +344,80 @@ class Model:
325
344
 
326
345
  self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
327
346
 
347
+ def set_type(self):
348
+ self.gguf_writer.add_type(gguf.GGUFType.MODEL)
349
+
350
+ def prepare_metadata(self, vocab_only: bool):
351
+
352
+ total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
353
+
354
+ self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
355
+
356
+ # Fallback to model directory name if metadata name is still missing
357
+ if self.metadata.name is None:
358
+ self.metadata.name = self.dir_model.name
359
+
360
+ # Generate parameter weight class (useful for leader boards) if not yet determined
361
+ if self.metadata.size_label is None and total_params > 0:
362
+ self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
363
+
364
+ # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
365
+ output_type: str = self.ftype.name.partition("_")[2]
366
+
367
+ # Filename Output
368
+ if self.fname_out.is_dir():
369
+ # Generate default filename based on model specification and available metadata
370
+ if not vocab_only:
371
+ fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
372
+ else:
373
+ fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
374
+
375
+ # Use the default filename
376
+ self.fname_out = self.fname_out / f"{fname_default}.gguf"
377
+ else:
378
+ # Output path is a custom defined templated filename
379
+ # Note: `not is_dir()` is used because `.is_file()` will not detect
380
+ # file template strings as it doesn't actually exist as a file
381
+
382
+ # Process templated file name with the output ftype, useful with the "auto" ftype
383
+ self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
384
+
385
+ self.set_type()
386
+
387
+ logger.info("Set meta model")
388
+ self.metadata.set_gguf_meta_model(self.gguf_writer)
389
+
390
+ logger.info("Set model parameters")
391
+ self.set_gguf_parameters()
392
+
393
+ logger.info("Set model tokenizer")
394
+ self.set_vocab()
395
+
396
+ logger.info("Set model quantization version")
397
+ self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
398
+
328
399
  def write(self):
329
- self.write_tensors()
330
- self.gguf_writer.write_header_to_file()
400
+ self.prepare_tensors()
401
+ self.prepare_metadata(vocab_only=False)
402
+ self.gguf_writer.write_header_to_file(path=self.fname_out)
331
403
  self.gguf_writer.write_kv_data_to_file()
332
404
  self.gguf_writer.write_tensors_to_file(progress=True)
333
405
  self.gguf_writer.close()
334
406
 
335
407
  def write_vocab(self):
336
- self.gguf_writer.write_header_to_file()
408
+ if len(self.gguf_writer.tensors) != 1:
409
+ raise ValueError('Splitting the vocabulary is not supported')
410
+
411
+ self.prepare_metadata(vocab_only=True)
412
+ self.gguf_writer.write_header_to_file(path=self.fname_out)
337
413
  self.gguf_writer.write_kv_data_to_file()
338
414
  self.gguf_writer.close()
339
415
 
340
416
  @staticmethod
341
- def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
417
+ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
342
418
  part_names: list[str] = []
343
419
  for filename in os.listdir(dir_model):
344
- if filename.endswith(suffix):
420
+ if filename.startswith(prefix) and filename.endswith(suffix):
345
421
  part_names.append(filename)
346
422
 
347
423
  part_names.sort()
@@ -370,6 +446,29 @@ class Model:
370
446
  except KeyError:
371
447
  raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
372
448
 
449
+ def does_token_look_special(self, token: str | bytes) -> bool:
450
+ if isinstance(token, (bytes, bytearray)):
451
+ token_text = token.decode(encoding="utf-8")
452
+ elif isinstance(token, memoryview):
453
+ token_text = token.tobytes().decode(encoding="utf-8")
454
+ else:
455
+ token_text = token
456
+
457
+ # Some models mark some added tokens which ought to be control tokens as not special.
458
+ # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
459
+ seems_special = token_text in (
460
+ "<pad>", # deepseek-coder
461
+ "<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
462
+ )
463
+
464
+ seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
465
+ seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) # deepseek-coder
466
+
467
+ # TODO: should these be marked as UNUSED instead? (maybe not)
468
+ seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2}
469
+
470
+ return seems_special
471
+
373
472
  # used for GPT-2 BPE and WordPiece vocabs
374
473
  def get_vocab_base(self) -> tuple[list[str], list[int], str]:
375
474
  tokens: list[str] = []
@@ -388,20 +487,22 @@ class Model:
388
487
  for i in range(vocab_size):
389
488
  if i not in reverse_vocab:
390
489
  tokens.append(f"[PAD{i}]")
391
- toktypes.append(gguf.TokenType.USER_DEFINED)
392
- elif reverse_vocab[i] in added_vocab:
393
- tokens.append(reverse_vocab[i])
394
- if tokenizer.added_tokens_decoder[i].special:
395
- toktypes.append(gguf.TokenType.CONTROL)
396
- else:
397
- toktypes.append(gguf.TokenType.USER_DEFINED)
490
+ toktypes.append(gguf.TokenType.UNUSED)
398
491
  else:
399
- tokens.append(reverse_vocab[i])
400
- toktypes.append(gguf.TokenType.NORMAL)
492
+ token: str = reverse_vocab[i]
493
+ if token in added_vocab:
494
+ if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
495
+ toktypes.append(gguf.TokenType.CONTROL)
496
+ else:
497
+ token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
498
+ toktypes.append(gguf.TokenType.USER_DEFINED)
499
+ else:
500
+ toktypes.append(gguf.TokenType.NORMAL)
501
+ tokens.append(token)
401
502
 
402
503
  return tokens, toktypes, tokpre
403
504
 
404
- # NOTE: this function is generated by convert-hf-to-gguf-update.py
505
+ # NOTE: this function is generated by convert_hf_to_gguf_update.py
405
506
  # do not modify it manually!
406
507
  # ref: https://github.com/ggerganov/llama.cpp/pull/6920
407
508
  # Marker: Start get_vocab_base_pre
@@ -421,7 +522,7 @@ class Model:
421
522
 
422
523
  res = None
423
524
 
424
- # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
525
+ # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
425
526
  # or pull the latest version of the model from Huggingface
426
527
  # don't edit the hashes manually!
427
528
  if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
@@ -478,15 +579,39 @@ class Model:
478
579
  if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
479
580
  # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
480
581
  res = "smaug-bpe"
582
+ if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
583
+ # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
584
+ res = "poro-chat"
585
+ if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
586
+ # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
587
+ res = "jina-v2-code"
588
+ if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
589
+ # ref: https://huggingface.co/THUDM/glm-4-9b-chat
590
+ res = "chatglm-bpe"
591
+ if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
592
+ # ref: https://huggingface.co/LumiOpen/Viking-7B
593
+ res = "viking"
594
+ if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
595
+ # ref: https://huggingface.co/core42/jais-13b
596
+ res = "jais"
597
+ if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
598
+ # ref: https://huggingface.co/WisdomShell/CodeShell-7B
599
+ res = "codeshell"
600
+ if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
601
+ # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
602
+ res = "tekken"
603
+ if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
604
+ # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
605
+ res = "smollm"
481
606
 
482
607
  if res is None:
483
608
  logger.warning("\n")
484
609
  logger.warning("**************************************************************************************")
485
610
  logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
486
611
  logger.warning("** There are 2 possible reasons for this:")
487
- logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
612
+ logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
488
613
  logger.warning("** - the pre-tokenization config has changed upstream")
489
- logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
614
+ logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
490
615
  logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
491
616
  logger.warning("**")
492
617
  logger.warning(f"** chkhsh: {chkhsh}")
@@ -541,7 +666,7 @@ class Model:
541
666
  for i in range(vocab_size):
542
667
  if i not in reverse_vocab:
543
668
  tokens.append(f"[PAD{i}]")
544
- toktypes.append(gguf.TokenType.USER_DEFINED)
669
+ toktypes.append(gguf.TokenType.UNUSED)
545
670
  elif reverse_vocab[i] in added_vocab:
546
671
  tokens.append(reverse_vocab[i])
547
672
  toktypes.append(gguf.TokenType.CONTROL)
@@ -564,15 +689,23 @@ class Model:
564
689
  special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
565
690
  special_vocab.add_to_gguf(self.gguf_writer)
566
691
 
567
- def _set_vocab_sentencepiece(self):
692
+ def _set_vocab_sentencepiece(self, add_to_gguf=True):
693
+ tokens, scores, toktypes = self._create_vocab_sentencepiece()
694
+
695
+ self.gguf_writer.add_tokenizer_model("llama")
696
+ self.gguf_writer.add_tokenizer_pre("default")
697
+ self.gguf_writer.add_token_list(tokens)
698
+ self.gguf_writer.add_token_scores(scores)
699
+ self.gguf_writer.add_token_types(toktypes)
700
+
701
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
702
+ special_vocab.add_to_gguf(self.gguf_writer)
703
+
704
+ def _create_vocab_sentencepiece(self):
568
705
  from sentencepiece import SentencePieceProcessor
569
706
 
570
707
  tokenizer_path = self.dir_model / 'tokenizer.model'
571
708
 
572
- tokens: list[bytes] = []
573
- scores: list[float] = []
574
- toktypes: list[int] = []
575
-
576
709
  if not tokenizer_path.is_file():
577
710
  raise FileNotFoundError(f"File not found: {tokenizer_path}")
578
711
 
@@ -583,7 +716,7 @@ class Model:
583
716
 
584
717
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
585
718
  scores: list[float] = [-10000.0] * vocab_size
586
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
719
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
587
720
 
588
721
  for token_id in range(tokenizer.vocab_size()):
589
722
  piece = tokenizer.IdToPiece(token_id)
@@ -610,7 +743,7 @@ class Model:
610
743
  added_tokens_json = json.load(f)
611
744
  for key in added_tokens_json:
612
745
  token_id = added_tokens_json[key]
613
- if (token_id >= vocab_size):
746
+ if token_id >= vocab_size:
614
747
  logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
615
748
  continue
616
749
 
@@ -618,6 +751,26 @@ class Model:
618
751
  scores[token_id] = -1000.0
619
752
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
620
753
 
754
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
755
+ if tokenizer_config_file.is_file():
756
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
757
+ tokenizer_config_json = json.load(f)
758
+ added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
759
+ for token_id, token_data in added_tokens_decoder.items():
760
+ token_id = int(token_id)
761
+ token: str = token_data["content"]
762
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
763
+ if tokens[token_id] != token.encode("utf-8"):
764
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
765
+ if token_data.get("special") or self.does_token_look_special(token):
766
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
767
+ else:
768
+ token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
769
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
770
+
771
+ scores[token_id] = -1000.0
772
+ tokens[token_id] = token.encode("utf-8")
773
+
621
774
  if vocab_size > len(tokens):
622
775
  pad_count = vocab_size - len(tokens)
623
776
  logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
@@ -626,17 +779,10 @@ class Model:
626
779
  scores.append(-1000.0)
627
780
  toktypes.append(SentencePieceTokenTypes.UNUSED)
628
781
 
629
- self.gguf_writer.add_tokenizer_model("llama")
630
- self.gguf_writer.add_tokenizer_pre("default")
631
- self.gguf_writer.add_token_list(tokens)
632
- self.gguf_writer.add_token_scores(scores)
633
- self.gguf_writer.add_token_types(toktypes)
634
-
635
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
636
- special_vocab.add_to_gguf(self.gguf_writer)
782
+ return tokens, scores, toktypes
637
783
 
638
784
  def _set_vocab_llama_hf(self):
639
- vocab = LlamaHfVocab(self.dir_model)
785
+ vocab = gguf.LlamaHfVocab(self.dir_model)
640
786
  tokens = []
641
787
  scores = []
642
788
  toktypes = []
@@ -657,6 +803,51 @@ class Model:
657
803
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
658
804
  special_vocab.add_to_gguf(self.gguf_writer)
659
805
 
806
+ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
807
+ tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
808
+ logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
809
+ vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
810
+
811
+ default_pre = "mpt" if model_name == "gpt-neox" else "default"
812
+
813
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
814
+ assert field # tokenizer model
815
+ self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
816
+
817
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
818
+ self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
819
+
820
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
821
+ assert field # token list
822
+ self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
823
+
824
+ if model_name == "llama-spm":
825
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
826
+ assert field # token scores
827
+ self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
828
+
829
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
830
+ assert field # token types
831
+ self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
832
+
833
+ if model_name != "llama-spm":
834
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
835
+ assert field # token merges
836
+ self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
837
+
838
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
839
+ self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
840
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
841
+ self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
842
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
843
+ self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
844
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
845
+ self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
846
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
847
+ self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
848
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
849
+ self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
850
+
660
851
 
661
852
  @Model.register("GPTNeoXForCausalLM")
662
853
  class GPTNeoXModel(Model):
@@ -665,7 +856,6 @@ class GPTNeoXModel(Model):
665
856
  def set_gguf_parameters(self):
666
857
  block_count = self.hparams["num_hidden_layers"]
667
858
 
668
- self.gguf_writer.add_name(self.dir_model.name)
669
859
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
670
860
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
671
861
  self.gguf_writer.add_block_count(block_count)
@@ -721,7 +911,6 @@ class BloomModel(Model):
721
911
  model_arch = gguf.MODEL_ARCH.BLOOM
722
912
 
723
913
  def set_gguf_parameters(self):
724
- self.gguf_writer.add_name("Bloom")
725
914
  n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
726
915
  n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
727
916
  self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
@@ -798,7 +987,6 @@ class MPTModel(Model):
798
987
 
799
988
  def set_gguf_parameters(self):
800
989
  block_count = self.hparams["n_layers"]
801
- self.gguf_writer.add_name(self.dir_model.name)
802
990
  self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
803
991
  self.gguf_writer.add_embedding_length(self.hparams["d_model"])
804
992
  self.gguf_writer.add_block_count(block_count)
@@ -837,7 +1025,6 @@ class OrionModel(Model):
837
1025
  block_count = self.hparams["num_hidden_layers"]
838
1026
  head_count = self.hparams["num_attention_heads"]
839
1027
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
840
- hf_repo = self.hparams.get("_name_or_path", "")
841
1028
 
842
1029
  ctx_length = 0
843
1030
  if "max_sequence_length" in self.hparams:
@@ -850,8 +1037,6 @@ class OrionModel(Model):
850
1037
  raise ValueError("gguf: can not find ctx length parameter.")
851
1038
 
852
1039
  self.gguf_writer.add_file_type(self.ftype)
853
- self.gguf_writer.add_name(self.dir_model.name)
854
- self.gguf_writer.add_source_hf_repo(hf_repo)
855
1040
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
856
1041
  self.gguf_writer.add_context_length(ctx_length)
857
1042
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -875,7 +1060,6 @@ class BaichuanModel(Model):
875
1060
  block_count = self.hparams["num_hidden_layers"]
876
1061
  head_count = self.hparams["num_attention_heads"]
877
1062
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
878
- hf_repo = self.hparams.get("_name_or_path", "")
879
1063
 
880
1064
  ctx_length = 0
881
1065
  if "max_sequence_length" in self.hparams:
@@ -887,8 +1071,6 @@ class BaichuanModel(Model):
887
1071
  else:
888
1072
  raise ValueError("gguf: can not find ctx length parameter.")
889
1073
 
890
- self.gguf_writer.add_name(self.dir_model.name)
891
- self.gguf_writer.add_source_hf_repo(hf_repo)
892
1074
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
893
1075
  self.gguf_writer.add_context_length(ctx_length)
894
1076
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -962,7 +1144,11 @@ class XverseModel(Model):
962
1144
  from transformers import AutoTokenizer
963
1145
  tokenizer = AutoTokenizer.from_pretrained(dir_model)
964
1146
  vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
965
- assert max(tokenizer.vocab.values()) < vocab_size
1147
+ # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
1148
+ # because vocab_size is the count of items, and indexes start at 0.
1149
+ max_vocab_index = max(tokenizer.get_vocab().values())
1150
+ if max_vocab_index >= vocab_size:
1151
+ raise ValueError("Vocabulary size exceeds expected maximum size.")
966
1152
 
967
1153
  reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
968
1154
  added_vocab = tokenizer.get_added_vocab()
@@ -998,7 +1184,6 @@ class XverseModel(Model):
998
1184
  block_count = self.hparams["num_hidden_layers"]
999
1185
  head_count = self.hparams["num_attention_heads"]
1000
1186
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
1001
- hf_repo = self.hparams.get("_name_or_path", "")
1002
1187
 
1003
1188
  ctx_length = 0
1004
1189
  if "max_sequence_length" in self.hparams:
@@ -1010,8 +1195,6 @@ class XverseModel(Model):
1010
1195
  else:
1011
1196
  raise ValueError("gguf: can not find ctx length parameter.")
1012
1197
 
1013
- self.gguf_writer.add_name(self.dir_model.name)
1014
- self.gguf_writer.add_source_hf_repo(hf_repo)
1015
1198
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
1016
1199
  self.gguf_writer.add_context_length(ctx_length)
1017
1200
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1070,7 +1253,6 @@ class FalconModel(Model):
1070
1253
  if n_head_kv is None:
1071
1254
  n_head_kv = self.hparams.get("n_head_kv", 1) # old name
1072
1255
 
1073
- self.gguf_writer.add_name("Falcon")
1074
1256
  self.gguf_writer.add_context_length(2048) # not in config.json
1075
1257
  self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
1076
1258
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1115,7 +1297,6 @@ class StarCoderModel(Model):
1115
1297
  def set_gguf_parameters(self):
1116
1298
  block_count = self.hparams["n_layer"]
1117
1299
 
1118
- self.gguf_writer.add_name("StarCoder")
1119
1300
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
1120
1301
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
1121
1302
  self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -1135,11 +1316,11 @@ class RefactModel(Model):
1135
1316
 
1136
1317
  # TODO: how to determine special FIM tokens automatically?
1137
1318
  special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
1138
- special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
1319
+ special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
1139
1320
  special_vocab._set_special_token("prefix", 1)
1140
1321
  special_vocab._set_special_token("suffix", 3)
1141
1322
  special_vocab._set_special_token("middle", 2)
1142
- special_vocab._set_special_token("fsep", 4) # is this correct?
1323
+ special_vocab.chat_template = None # do not add it twice
1143
1324
  special_vocab.add_to_gguf(self.gguf_writer)
1144
1325
 
1145
1326
  def set_gguf_parameters(self):
@@ -1151,7 +1332,6 @@ class RefactModel(Model):
1151
1332
 
1152
1333
  block_count = self.hparams["n_layer"]
1153
1334
 
1154
- self.gguf_writer.add_name("Refact")
1155
1335
  # refact uses Alibi. So this is from config.json which might be used by training.
1156
1336
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
1157
1337
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1199,14 +1379,13 @@ class StableLMModel(Model):
1199
1379
  if (self.dir_model / "tokenizer.json").is_file():
1200
1380
  self._set_vocab_gpt2()
1201
1381
  else:
1202
- # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
1382
+ # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
1203
1383
  self._set_vocab_qwen()
1204
1384
 
1205
1385
  def set_gguf_parameters(self):
1206
1386
  hparams = self.hparams
1207
1387
  block_count = hparams["num_hidden_layers"]
1208
1388
 
1209
- self.gguf_writer.add_name(self.dir_model.name)
1210
1389
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
1211
1390
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
1212
1391
  self.gguf_writer.add_block_count(block_count)
@@ -1268,8 +1447,8 @@ class StableLMModel(Model):
1268
1447
 
1269
1448
  return [(new_name, data_torch)]
1270
1449
 
1271
- def write_tensors(self):
1272
- super().write_tensors()
1450
+ def prepare_tensors(self):
1451
+ super().prepare_tensors()
1273
1452
 
1274
1453
  if self._q_norms is not None or self._k_norms is not None:
1275
1454
  # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
@@ -1281,85 +1460,6 @@ class StableLMModel(Model):
1281
1460
  if len(norms) > 0:
1282
1461
  raise ValueError(f"Unprocessed norms: {norms}")
1283
1462
 
1284
- def write_tensors(self):
1285
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1286
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1287
- n_head = self.hparams.get("num_attention_heads")
1288
- n_kv_head = self.hparams.get("num_key_value_heads")
1289
- q_norms = dict()
1290
- k_norms = dict()
1291
- for name, data_torch in self.get_tensors():
1292
- # we don't need these
1293
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
1294
- continue
1295
-
1296
- old_dtype = data_torch.dtype
1297
-
1298
- # convert any unsupported data types to float32
1299
- if data_torch.dtype not in (torch.float16, torch.float32):
1300
- data_torch = data_torch.to(torch.float32)
1301
-
1302
- data = data_torch.squeeze().numpy()
1303
- n_dims = len(data.shape)
1304
- if name.find("q_layernorm.norms") != -1:
1305
- q_norms[name] = data
1306
- if len(q_norms) >= (block_count * n_head):
1307
- self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
1308
- continue
1309
- if name.find("k_layernorm.norms") != -1:
1310
- k_norms[name] = data
1311
- if len(k_norms) >= (block_count * n_kv_head):
1312
- self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
1313
- continue
1314
-
1315
- # map tensor names
1316
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1317
- if new_name is None:
1318
- raise ValueError(f"Can not map tensor {name!r}")
1319
-
1320
- n_dims = len(data.shape)
1321
- data_dtype = data.dtype
1322
-
1323
- # if f32 desired, convert any float16 to float32
1324
- if self.ftype == 0 and data_dtype == np.float16:
1325
- data = data.astype(np.float32)
1326
-
1327
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
1328
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
1329
- data = data.astype(np.float32)
1330
-
1331
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1332
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
1333
- data = data.astype(np.float16)
1334
-
1335
- logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1336
-
1337
- self.gguf_writer.add_tensor(new_name, data)
1338
-
1339
- def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
1340
- for bid in range(block_count):
1341
- datas = []
1342
- for xid in range(n_head):
1343
- ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
1344
- datas.append(norms[ename])
1345
- del norms[ename]
1346
- data = np.stack(datas, axis=0)
1347
- data_dtype = data.dtype
1348
- merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
1349
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
1350
- if new_name is None:
1351
- raise ValueError(f"Can not map tensor {name!r}")
1352
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
1353
- data = data.astype(np.float32)
1354
-
1355
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1356
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
1357
- data = data.astype(np.float16)
1358
-
1359
- logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
1360
-
1361
- self.gguf_writer.add_tensor(new_name, data)
1362
-
1363
1463
 
1364
1464
  @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
1365
1465
  class LlamaModel(Model):
@@ -1367,7 +1467,7 @@ class LlamaModel(Model):
1367
1467
 
1368
1468
  def set_vocab(self):
1369
1469
  try:
1370
- self. _set_vocab_sentencepiece()
1470
+ self._set_vocab_sentencepiece()
1371
1471
  except FileNotFoundError:
1372
1472
  try:
1373
1473
  self._set_vocab_llama_hf()
@@ -1391,13 +1491,29 @@ class LlamaModel(Model):
1391
1491
  super().set_gguf_parameters()
1392
1492
  hparams = self.hparams
1393
1493
  self.gguf_writer.add_vocab_size(hparams["vocab_size"])
1394
- self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
1494
+
1495
+ if "head_dim" in hparams:
1496
+ rope_dim = hparams["head_dim"]
1497
+ else:
1498
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
1499
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
1395
1500
 
1396
1501
  if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
1397
1502
  if self.hparams["rope_scaling"].get("type") == "linear":
1398
1503
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1399
1504
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
1400
1505
 
1506
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
1507
+ if tokenizer_config_file.is_file():
1508
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
1509
+ tokenizer_config_json = json.load(f)
1510
+ if "add_prefix_space" in tokenizer_config_json:
1511
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
1512
+
1513
+ # Apply to granite small models only
1514
+ if self.hparams.get("vocab_size", 32000) == 49152:
1515
+ self.gguf_writer.add_add_bos_token(False)
1516
+
1401
1517
  @staticmethod
1402
1518
  def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
1403
1519
  if n_head_kv is not None and n_head != n_head_kv:
@@ -1412,9 +1528,9 @@ class LlamaModel(Model):
1412
1528
  n_head = self.hparams["num_attention_heads"]
1413
1529
  n_kv_head = self.hparams.get("num_key_value_heads")
1414
1530
 
1415
- if name.endswith("q_proj.weight"):
1531
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
1416
1532
  data_torch = LlamaModel.permute(data_torch, n_head, n_head)
1417
- if name.endswith("k_proj.weight"):
1533
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
1418
1534
  data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
1419
1535
 
1420
1536
  # process the experts separately
@@ -1453,8 +1569,35 @@ class LlamaModel(Model):
1453
1569
 
1454
1570
  return [(self.map_tensor_name(name), data_torch)]
1455
1571
 
1456
- def write_tensors(self):
1457
- super().write_tensors()
1572
+ def prepare_tensors(self):
1573
+ if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
1574
+ if rope_scaling.get("rope_type", '').lower() == "llama3":
1575
+ base = self.hparams.get("rope_theta", 10000.0)
1576
+ dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
1577
+ freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
1578
+ factor = rope_scaling.get("factor", 8.0)
1579
+ low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
1580
+ high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
1581
+ old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
1582
+
1583
+ low_freq_wavelen = old_context_len / low_freq_factor
1584
+ high_freq_wavelen = old_context_len / high_freq_factor
1585
+ assert low_freq_wavelen != high_freq_wavelen
1586
+
1587
+ rope_factors = []
1588
+ for freq in freqs:
1589
+ wavelen = 2 * math.pi / freq
1590
+ if wavelen < high_freq_wavelen:
1591
+ rope_factors.append(1)
1592
+ elif wavelen > low_freq_wavelen:
1593
+ rope_factors.append(factor)
1594
+ else:
1595
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
1596
+ rope_factors.append(1 / ((1 - smooth) / factor + smooth))
1597
+
1598
+ self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
1599
+
1600
+ super().prepare_tensors()
1458
1601
 
1459
1602
  if self._experts is not None:
1460
1603
  # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1463,6 +1606,48 @@ class LlamaModel(Model):
1463
1606
  raise ValueError(f"Unprocessed experts: {experts}")
1464
1607
 
1465
1608
 
1609
+ @Model.register("BitnetForCausalLM")
1610
+ class BitnetModel(Model):
1611
+ model_arch = gguf.MODEL_ARCH.BITNET
1612
+
1613
+ def set_vocab(self):
1614
+ self._set_vocab_sentencepiece()
1615
+
1616
+ def set_gguf_parameters(self):
1617
+ super().set_gguf_parameters()
1618
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1619
+ self.gguf_writer.add_rope_scaling_factor(1.0)
1620
+
1621
+ def weight_quant(self, weight):
1622
+ dtype = weight.dtype
1623
+ weight = weight.float()
1624
+ s = 1 / weight.abs().mean().clamp(min=1e-5)
1625
+ weight = (weight * s).round().clamp(-1, 1) / s
1626
+ scale = weight.abs().max().unsqueeze(0)
1627
+ weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
1628
+ weight = torch.sign(weight).type(dtype)
1629
+ return weight.type(dtype), scale.type(torch.float32)
1630
+
1631
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
1632
+ new_name = self.map_tensor_name(name)
1633
+
1634
+ if any(self.match_model_tensor_name(new_name, key, bid) for key in [
1635
+ gguf.MODEL_TENSOR.ATTN_Q,
1636
+ gguf.MODEL_TENSOR.ATTN_K,
1637
+ gguf.MODEL_TENSOR.ATTN_V,
1638
+ gguf.MODEL_TENSOR.ATTN_OUT,
1639
+ gguf.MODEL_TENSOR.FFN_UP,
1640
+ gguf.MODEL_TENSOR.FFN_DOWN,
1641
+ gguf.MODEL_TENSOR.FFN_GATE,
1642
+ ]):
1643
+ # transform weight into 1/0/-1 (in fp32)
1644
+ weight_torch, scale_torch = self.weight_quant(data_torch)
1645
+ yield (new_name, weight_torch)
1646
+ yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
1647
+ else:
1648
+ yield (new_name, data_torch)
1649
+
1650
+
1466
1651
  @Model.register("GrokForCausalLM")
1467
1652
  class GrokModel(Model):
1468
1653
  model_arch = gguf.MODEL_ARCH.GROK
@@ -1475,7 +1660,6 @@ class GrokModel(Model):
1475
1660
 
1476
1661
  def set_gguf_parameters(self):
1477
1662
  super().set_gguf_parameters()
1478
- self.gguf_writer.add_name("Grok")
1479
1663
 
1480
1664
  _experts: list[dict[str, Tensor]] | None = None
1481
1665
 
@@ -1524,7 +1708,6 @@ class DbrxModel(Model):
1524
1708
  def set_gguf_parameters(self):
1525
1709
  ffn_config = self.hparams["ffn_config"]
1526
1710
  attn_config = self.hparams["attn_config"]
1527
- self.gguf_writer.add_name(self.hparams["model_type"])
1528
1711
  self.gguf_writer.add_block_count(self.hparams["n_layers"])
1529
1712
 
1530
1713
  self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
@@ -1537,7 +1720,6 @@ class DbrxModel(Model):
1537
1720
  self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
1538
1721
 
1539
1722
  self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
1540
- self.gguf_writer.add_file_type(self.ftype)
1541
1723
 
1542
1724
  self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
1543
1725
  self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1594,7 +1776,6 @@ class MiniCPMModel(Model):
1594
1776
 
1595
1777
  def set_gguf_parameters(self):
1596
1778
  block_count = self.hparams["num_hidden_layers"]
1597
- self.gguf_writer.add_name("MiniCPM")
1598
1779
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1599
1780
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
1600
1781
  self.gguf_writer.add_block_count(block_count)
@@ -1610,7 +1791,7 @@ class MiniCPMModel(Model):
1610
1791
 
1611
1792
  def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
1612
1793
  if n_kv_head is not None and n_head != n_kv_head:
1613
- n_head //= n_kv_head
1794
+ n_head = n_kv_head
1614
1795
 
1615
1796
  return (
1616
1797
  weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -1664,7 +1845,6 @@ class QwenModel(Model):
1664
1845
  self._set_vocab_qwen()
1665
1846
 
1666
1847
  def set_gguf_parameters(self):
1667
- self.gguf_writer.add_name("Qwen")
1668
1848
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1669
1849
  self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
1670
1850
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1695,6 +1875,12 @@ class Qwen2MoeModel(Model):
1695
1875
  super().set_gguf_parameters()
1696
1876
  if (n_experts := self.hparams.get("num_experts")) is not None:
1697
1877
  self.gguf_writer.add_expert_count(n_experts)
1878
+ if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
1879
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
1880
+ logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
1881
+ if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
1882
+ self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
1883
+ logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
1698
1884
 
1699
1885
  _experts: list[dict[str, Tensor]] | None = None
1700
1886
 
@@ -1734,8 +1920,8 @@ class Qwen2MoeModel(Model):
1734
1920
 
1735
1921
  return [(self.map_tensor_name(name), data_torch)]
1736
1922
 
1737
- def write_tensors(self):
1738
- super().write_tensors()
1923
+ def prepare_tensors(self):
1924
+ super().prepare_tensors()
1739
1925
 
1740
1926
  if self._experts is not None:
1741
1927
  # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1749,7 +1935,6 @@ class GPT2Model(Model):
1749
1935
  model_arch = gguf.MODEL_ARCH.GPT2
1750
1936
 
1751
1937
  def set_gguf_parameters(self):
1752
- self.gguf_writer.add_name(self.dir_model.name)
1753
1938
  self.gguf_writer.add_block_count(self.hparams["n_layer"])
1754
1939
  self.gguf_writer.add_context_length(self.hparams["n_ctx"])
1755
1940
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1792,7 +1977,6 @@ class Phi2Model(Model):
1792
1977
  n_embd = self.find_hparam(["hidden_size", "n_embd"])
1793
1978
  n_head = self.find_hparam(["num_attention_heads", "n_head"])
1794
1979
 
1795
- self.gguf_writer.add_name("Phi2")
1796
1980
  self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
1797
1981
 
1798
1982
  self.gguf_writer.add_embedding_length(n_embd)
@@ -1825,7 +2009,7 @@ class Phi3MiniModel(Model):
1825
2009
 
1826
2010
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
1827
2011
  scores: list[float] = [-10000.0] * vocab_size
1828
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
2012
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
1829
2013
 
1830
2014
  for token_id in range(tokenizer.vocab_size()):
1831
2015
 
@@ -1854,7 +2038,7 @@ class Phi3MiniModel(Model):
1854
2038
 
1855
2039
  for key in added_tokens_json:
1856
2040
  token_id = added_tokens_json[key]
1857
- if (token_id >= vocab_size):
2041
+ if token_id >= vocab_size:
1858
2042
  logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
1859
2043
  continue
1860
2044
 
@@ -1870,8 +2054,9 @@ class Phi3MiniModel(Model):
1870
2054
  for token_id, foken_data in added_tokens_decoder.items():
1871
2055
  token_id = int(token_id)
1872
2056
  token = foken_data["content"].encode("utf-8")
1873
- if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
1874
- assert tokens[token_id] == token
2057
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2058
+ if tokens[token_id] != token:
2059
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
1875
2060
  tokens[token_id] = token
1876
2061
  scores[token_id] = -1000.0
1877
2062
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1886,8 +2071,9 @@ class Phi3MiniModel(Model):
1886
2071
  for foken_data in added_tokens:
1887
2072
  token_id = int(foken_data["id"])
1888
2073
  token = foken_data["content"].encode("utf-8")
1889
- if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
1890
- assert tokens[token_id] == token
2074
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2075
+ if tokens[token_id] != token:
2076
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
1891
2077
  tokens[token_id] = token
1892
2078
  scores[token_id] = -1000.0
1893
2079
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1914,7 +2100,6 @@ class Phi3MiniModel(Model):
1914
2100
  orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
1915
2101
  rope_dims = n_embd // n_head
1916
2102
 
1917
- self.gguf_writer.add_name("Phi3")
1918
2103
  self.gguf_writer.add_context_length(max_pos_embds)
1919
2104
  self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
1920
2105
  self.gguf_writer.add_embedding_length(n_embd)
@@ -1926,10 +2111,11 @@ class Phi3MiniModel(Model):
1926
2111
  self.gguf_writer.add_rope_dimension_count(rope_dims)
1927
2112
  self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
1928
2113
  self.gguf_writer.add_file_type(self.ftype)
2114
+ self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
1929
2115
 
1930
2116
  # write rope scaling for long context (128k) model
1931
2117
  rope_scaling = self.find_hparam(['rope_scaling'], True)
1932
- if (rope_scaling is None):
2118
+ if rope_scaling is None:
1933
2119
  return
1934
2120
 
1935
2121
  scale = max_pos_embds / orig_max_pos_embds
@@ -1938,7 +2124,7 @@ class Phi3MiniModel(Model):
1938
2124
  if len(rope_scaling_type) == 0:
1939
2125
  raise KeyError('Missing the required key rope_scaling.type')
1940
2126
 
1941
- if rope_scaling_type == 'su':
2127
+ if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
1942
2128
  attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
1943
2129
  elif rope_scaling_type == 'yarn':
1944
2130
  attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@@ -1971,7 +2157,6 @@ class PlamoModel(Model):
1971
2157
  hparams = self.hparams
1972
2158
  block_count = hparams["num_hidden_layers"]
1973
2159
 
1974
- self.gguf_writer.add_name("PLaMo")
1975
2160
  self.gguf_writer.add_context_length(4096) # not in config.json
1976
2161
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
1977
2162
  self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
@@ -2016,7 +2201,6 @@ class CodeShellModel(Model):
2016
2201
  def set_gguf_parameters(self):
2017
2202
  block_count = self.hparams["n_layer"]
2018
2203
 
2019
- self.gguf_writer.add_name("CodeShell")
2020
2204
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
2021
2205
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
2022
2206
  self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -2068,7 +2252,7 @@ class InternLM2Model(Model):
2068
2252
  logger.error(f'Error: Missing {tokenizer_path}')
2069
2253
  sys.exit(1)
2070
2254
 
2071
- sentencepiece_model = model.ModelProto()
2255
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
2072
2256
  sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
2073
2257
  add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
2074
2258
 
@@ -2096,6 +2280,9 @@ class InternLM2Model(Model):
2096
2280
  toktype = SentencePieceTokenTypes.UNUSED
2097
2281
  elif tokenizer.IsByte(token_id):
2098
2282
  toktype = SentencePieceTokenTypes.BYTE
2283
+ # take care of ununsed raw token
2284
+ if piece.startswith('[UNUSED'):
2285
+ toktype = SentencePieceTokenTypes.UNUSED
2099
2286
 
2100
2287
  tokens.append(text)
2101
2288
  scores.append(score)
@@ -2111,6 +2298,49 @@ class InternLM2Model(Model):
2111
2298
  scores.append(-1000.0)
2112
2299
  toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
2113
2300
 
2301
+ chat_eos_token = '<|im_end|>'
2302
+ chat_eos_token_id = None
2303
+
2304
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2305
+ if tokenizer_config_file.is_file():
2306
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2307
+ tokenizer_config_json = json.load(f)
2308
+ added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
2309
+ for token_id, foken_data in added_tokens_decoder.items():
2310
+ token_id = int(token_id)
2311
+ token = foken_data["content"]
2312
+ if token == chat_eos_token:
2313
+ chat_eos_token_id = token_id
2314
+ token = token.encode("utf-8")
2315
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2316
+ if tokens[token_id] != token:
2317
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
2318
+ tokens[token_id] = token
2319
+ scores[token_id] = -1000.0
2320
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2321
+ if foken_data.get("special"):
2322
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2323
+
2324
+ tokenizer_file = self.dir_model / 'tokenizer.json'
2325
+ if tokenizer_file.is_file():
2326
+ with open(tokenizer_file, "r", encoding="utf-8") as f:
2327
+ tokenizer_json = json.load(f)
2328
+ added_tokens = tokenizer_json.get("added_tokens", [])
2329
+ for foken_data in added_tokens:
2330
+ token_id = int(foken_data["id"])
2331
+ token = foken_data["content"]
2332
+ if token == chat_eos_token:
2333
+ chat_eos_token_id = token_id
2334
+ token = token.encode("utf-8")
2335
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2336
+ if tokens[token_id] != token:
2337
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
2338
+ tokens[token_id] = token
2339
+ scores[token_id] = -1000.0
2340
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2341
+ if foken_data.get("special"):
2342
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2343
+
2114
2344
  self.gguf_writer.add_tokenizer_model("llama")
2115
2345
  self.gguf_writer.add_tokenizer_pre("default")
2116
2346
  self.gguf_writer.add_token_list(tokens)
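Editor's note: the block added above resolves the chat EOS token by scanning "added_tokens_decoder" in tokenizer_config.json (and "added_tokens" in tokenizer.json) instead of guessing from the directory name. A minimal standalone sketch of that lookup, with an illustrative helper name and path, not code from the converter:

    # Hypothetical helper mirroring the added hunk: find the id of '<|im_end|>' from
    # tokenizer_config.json, returning None if the file or token is absent.
    import json
    from pathlib import Path

    def find_chat_eos_id(model_dir, eos_piece="<|im_end|>"):
        cfg = Path(model_dir) / "tokenizer_config.json"
        if not cfg.is_file():
            return None
        with open(cfg, "r", encoding="utf-8") as f:
            decoder = json.load(f).get("added_tokens_decoder", {})
        for token_id, entry in decoder.items():
            if entry.get("content") == eos_piece:
                return int(token_id)
        return None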
@@ -2120,37 +2350,17 @@ class InternLM2Model(Model):
2120
2350
 
2121
2351
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2122
2352
  old_eos = special_vocab.special_token_ids["eos"]
2123
- if "chat" in os.path.basename(self.dir_model.absolute()):
2353
+ if chat_eos_token_id is not None:
2124
2354
  # For the chat model, we replace the eos with '<|im_end|>'.
2125
2355
  # TODO: this is a hack, should be fixed
2126
2356
  # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
2127
- special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
2128
- logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
2129
- in chat mode so that the conversation can end normally.")
2357
+ special_vocab.special_token_ids["eos"] = chat_eos_token_id
2358
+ logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
2359
+ " in chat mode so that the conversation can end normally.")
2130
2360
 
2131
2361
  special_vocab.add_to_gguf(self.gguf_writer)
2132
2362
 
2133
- def _try_get_sft_eos(self, tokenizer):
2134
- unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
2135
- im_end_list = tokenizer.Encode('<|im_end|>')
2136
- eos_token = None
2137
- assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
2138
- if len(unused_145_list) == 1:
2139
- eos_token = unused_145_list[0]
2140
- if len(im_end_list) == 1:
2141
- eos_token = im_end_list[0]
2142
- assert eos_token
2143
- return eos_token
2144
-
2145
- def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
2146
- if n_head_kv is not None and n_head != n_head_kv:
2147
- n_head = n_head_kv
2148
- return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
2149
- .swapaxes(1, 2)
2150
- .reshape(weights.shape))
2151
-
2152
2363
  def set_gguf_parameters(self):
2153
- self.gguf_writer.add_name("InternLM2")
2154
2364
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
2155
2365
  self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
2156
2366
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -2160,30 +2370,30 @@ in chat mode so that the conversation can end normally.")
2160
2370
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
2161
2371
  self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
2162
2372
  self.gguf_writer.add_file_type(self.ftype)
2373
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
2374
+ if self.hparams["rope_scaling"].get("type") == "linear":
2375
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2376
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
2163
2377
 
2164
2378
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2165
2379
  num_heads = self.hparams["num_attention_heads"]
2166
2380
  num_kv_heads = self.hparams["num_key_value_heads"]
2167
- hidden_size = self.hparams["hidden_size"]
2381
+ n_embd = self.hparams["hidden_size"]
2168
2382
  q_per_kv = num_heads // num_kv_heads
2169
- head_dim = hidden_size // num_heads
2383
+ head_dim = n_embd // num_heads
2170
2384
  num_groups = num_heads // q_per_kv
2171
2385
 
2172
- qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
2173
-
2174
- if re.match(qkv_pattern, name):
2175
- bid = re.findall(qkv_pattern, name)[0]
2386
+ if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
2176
2387
  qkv = data_torch
2177
- # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
2178
- qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
2179
- q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
2388
+
2389
+ qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
2390
+ q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
2391
+
2180
2392
  # The model weights of q and k require an additional reshape.
2181
- # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
2182
- q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
2183
- # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
2184
- k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
2185
- # v = rearrange(v, " o g n i -> o (g n i)").T
2186
- v = v.reshape((v.shape[0], -1)).T
2393
+ q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
2394
+ k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
2395
+ v = v.reshape((-1, v.shape[-1]))
2396
+
2187
2397
  return [
2188
2398
  (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
2189
2399
  (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
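Editor's note: the rewritten wqkv handling above assumes the fused InternLM2 weight stores, per KV group, q_per_kv query heads followed by one key and one value head; LlamaModel.permute then reorders the q/k rows for GGUF's rotary layout. A toy shape check of that grouped layout (sizes are illustrative, not from any config):

    import numpy as np

    num_heads, num_kv_heads, head_dim = 8, 2, 4
    n_embd = num_heads * head_dim                 # 32
    q_per_kv = num_heads // num_kv_heads          # 4
    num_groups = num_heads // q_per_kv            # 2

    n_rows = (num_heads + 2 * num_kv_heads) * head_dim          # 48 fused rows
    wqkv = np.arange(n_rows * n_embd, dtype=np.float32).reshape((n_rows, n_embd))

    grouped = wqkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
    q, k, v = grouped[:, :q_per_kv], grouped[:, -2], grouped[:, -1]

    assert q.reshape((-1, n_embd)).shape == (num_heads * head_dim, n_embd)     # (32, 32)
    assert k.reshape((-1, n_embd)).shape == (num_kv_heads * head_dim, n_embd)  # (8, 32)
    assert v.reshape((-1, n_embd)).shape == (num_kv_heads * head_dim, n_embd)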
@@ -2310,13 +2520,55 @@ class GemmaModel(Model):
2310
2520
  special_vocab._set_special_token("middle", 68)
2311
2521
  special_vocab._set_special_token("fsep", 70)
2312
2522
  special_vocab._set_special_token("eot", 107)
2523
+ special_vocab.chat_template = None # do not add it twice
2313
2524
  special_vocab.add_to_gguf(self.gguf_writer)
2314
2525
 
2526
+ self.gguf_writer.add_add_space_prefix(False)
2527
+
2528
+ def set_gguf_parameters(self):
2529
+ hparams = self.hparams
2530
+ block_count = hparams["num_hidden_layers"]
2531
+
2532
+ self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
2533
+ self.gguf_writer.add_embedding_length(hparams["hidden_size"])
2534
+ self.gguf_writer.add_block_count(block_count)
2535
+ self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
2536
+ self.gguf_writer.add_head_count(hparams["num_attention_heads"])
2537
+ self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
2538
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
2539
+ self.gguf_writer.add_key_length(hparams["head_dim"])
2540
+ self.gguf_writer.add_value_length(hparams["head_dim"])
2541
+ self.gguf_writer.add_file_type(self.ftype)
2542
+
2543
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2544
+ del bid # unused
2545
+
2546
+ # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
2547
+ # To prevent errors, skip loading lm_head.weight.
2548
+ if name == "lm_head.weight":
2549
+ logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
2550
+ return []
2551
+
2552
+ # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
2553
+ if name.endswith("norm.weight"):
2554
+ data_torch = data_torch + 1
2555
+
2556
+ return [(self.map_tensor_name(name), data_torch)]
2557
+
2558
+
2559
+ @Model.register("Gemma2ForCausalLM")
2560
+ class Gemma2Model(Model):
2561
+ model_arch = gguf.MODEL_ARCH.GEMMA2
2562
+
2563
+ def set_vocab(self):
2564
+ self._set_vocab_sentencepiece()
2565
+
2566
+ self.gguf_writer.add_add_space_prefix(False)
2567
+
2315
2568
  def set_gguf_parameters(self):
2316
2569
  hparams = self.hparams
2317
2570
  block_count = hparams["num_hidden_layers"]
2318
2571
 
2319
- self.gguf_writer.add_name(self.dir_model.name)
2320
2572
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
2321
2573
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
2322
2574
  self.gguf_writer.add_block_count(block_count)
@@ -2327,6 +2579,13 @@ class GemmaModel(Model):
2327
2579
  self.gguf_writer.add_key_length(hparams["head_dim"])
2328
2580
  self.gguf_writer.add_value_length(hparams["head_dim"])
2329
2581
  self.gguf_writer.add_file_type(self.ftype)
2582
+ self.gguf_writer.add_attn_logit_softcapping(
2583
+ self.hparams["attn_logit_softcapping"]
2584
+ )
2585
+ self.gguf_writer.add_final_logit_softcapping(
2586
+ self.hparams["final_logit_softcapping"]
2587
+ )
2588
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
2330
2589
 
2331
2590
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2332
2591
  del bid # unused
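Editor's note: the two softcapping values written above are, as I understand Gemma 2, consumed as a smooth tanh cap on attention and final logits; this is a sketch of the usual formula, not code taken from this converter or from llama.cpp:

    import math

    def softcap(x, cap):
        # squashes x smoothly into (-cap, cap)
        return cap * math.tanh(x / cap)

    # e.g. with a cap of 50.0: softcap(10.0, 50.0) ~= 9.87, softcap(1000.0, 50.0) ~= 50.0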
@@ -2368,39 +2627,7 @@ class MambaModel(Model):
2368
2627
  self._set_vocab_sentencepiece()
2369
2628
  else:
2370
2629
  # Use the GPT-NeoX tokenizer when no tokenizer files are present
2371
- tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
2372
- logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
2373
- neox_reader = gguf.GGUFReader(tokenizer_path, "r")
2374
-
2375
- field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
2376
- self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
2377
-
2378
- field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
2379
- self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
2380
-
2381
- field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
2382
- assert field
2383
- self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
2384
-
2385
- field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
2386
- assert field
2387
- self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
2388
-
2389
- field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
2390
- assert field
2391
- self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
2392
-
2393
- field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
2394
- self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
2395
-
2396
- field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
2397
- self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
2398
-
2399
- field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
2400
- self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
2401
-
2402
- field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
2403
- self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
2630
+ self._set_vocab_builtin("gpt-neox", vocab_size)
2404
2631
 
2405
2632
  def set_gguf_parameters(self):
2406
2633
  d_model = self.find_hparam(["hidden_size", "d_model"])
@@ -2416,7 +2643,6 @@ class MambaModel(Model):
2416
2643
  # Fail early for models which don't have a block expansion factor of 2
2417
2644
  assert d_inner == 2 * d_model
2418
2645
 
2419
- self.gguf_writer.add_name(self.dir_model.name)
2420
2646
  self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
2421
2647
  self.gguf_writer.add_embedding_length(d_model)
2422
2648
  self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2523,18 +2749,20 @@ class JinaBertV2Model(BertModel):
2523
2749
 
2524
2750
  def get_tensors(self):
2525
2751
  for name, data in super().get_tensors():
2526
- if 'gated_layers' in name:
2752
+ if 'gated_layer' in name:
2527
2753
  d1 = data[:self.intermediate_size, :]
2528
2754
  name1 = name.replace('gated_layers', 'gated_layers_w')
2755
+ name1 = name1.replace('up_gated_layer', 'gated_layers_v')
2529
2756
  d2 = data[self.intermediate_size:, :]
2530
2757
  name2 = name.replace('gated_layers', 'gated_layers_v')
2758
+ name2 = name2.replace('up_gated_layer', 'gated_layers_w')
2531
2759
  yield name1, d1
2532
2760
  yield name2, d2
2533
2761
  continue
2534
2762
 
2535
2763
  yield name, data
2536
2764
 
2537
- def set_vocab(self, *args, **kwargs):
2765
+ def set_vocab(self):
2538
2766
  tokenizer_class = 'BertTokenizer'
2539
2767
  with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
2540
2768
  tokenizer_class = json.load(f)['tokenizer_class']
@@ -2550,19 +2778,94 @@ class JinaBertV2Model(BertModel):
2550
2778
  self.gguf_writer.add_add_eos_token(True)
2551
2779
 
2552
2780
 
2553
- @Model.register("ArcticForCausalLM")
2554
- class ArcticModel(Model):
2555
- model_arch = gguf.MODEL_ARCH.ARCTIC
2781
+ @Model.register("OpenELMForCausalLM")
2782
+ class OpenELMModel(Model):
2783
+ model_arch = gguf.MODEL_ARCH.OPENELM
2784
+
2785
+ @staticmethod
2786
+ def _make_divisible(v: float | int, divisor: int) -> int:
2787
+ # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
2788
+ new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
2789
+ # Make sure that round down does not go down by more than 10%.
2790
+ if new_v < 0.9 * v:
2791
+ new_v += divisor
2792
+ return new_v
2793
+
2794
+ def __init__(self, *args, **kwargs):
2795
+ super().__init__(*args, **kwargs)
2556
2796
 
2797
+ ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
2798
+ ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
2799
+ self._n_embd: int = self.hparams["model_dim"]
2800
+ self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
2801
+ self._num_query_heads: list[int] = self.hparams["num_query_heads"]
2802
+ self._ffn_dims: list[int] = [
2803
+ OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
2804
+ for multiplier in ffn_multipliers
2805
+ ]
2806
+ assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
2807
+ assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
2808
+
2809
+ # Uses the tokenizer from meta-llama/Llama-2-7b-hf
2557
2810
  def set_vocab(self):
2558
- # The reason for using a custom implementation here is that the
2559
- # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
2560
- # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
2561
- from sentencepiece import SentencePieceProcessor
2811
+ try:
2812
+ self._set_vocab_sentencepiece()
2813
+ except FileNotFoundError:
2814
+ self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
2562
2815
 
2563
- tokenizer_path = self.dir_model / 'tokenizer.model'
2816
+ def set_gguf_parameters(self):
2817
+ n_embd = self._n_embd
2818
+ head_dim = self.hparams["head_dim"]
2819
+ rot_pct = 1.0
2820
+ assert self.block_count == len(self._num_kv_heads)
2821
+ assert self.block_count == len(self._num_query_heads)
2822
+ assert self.block_count == len(self._ffn_dims)
2564
2823
 
2565
- if not tokenizer_path.is_file():
2824
+ self.gguf_writer.add_block_count(self.block_count)
2825
+ self.gguf_writer.add_context_length(self.hparams["max_context_length"])
2826
+ self.gguf_writer.add_embedding_length(n_embd)
2827
+ self.gguf_writer.add_feed_forward_length(self._ffn_dims)
2828
+ self.gguf_writer.add_head_count(self._num_query_heads)
2829
+ self.gguf_writer.add_head_count_kv(self._num_kv_heads)
2830
+ self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
2831
+ # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
2832
+ self.gguf_writer.add_layer_norm_rms_eps(1e-6)
2833
+ self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
2834
+ self.gguf_writer.add_key_length(head_dim)
2835
+ self.gguf_writer.add_value_length(head_dim)
2836
+ self.gguf_writer.add_file_type(self.ftype)
2837
+
2838
+ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
2839
+ if "n_layers" in keys:
2840
+ return self.hparams["num_transformer_layers"]
2841
+
2842
+ return super().find_hparam(keys, optional)
2843
+
2844
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2845
+
2846
+ # split ff
2847
+ if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
2848
+ ff_dim = self._ffn_dims[bid]
2849
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
2850
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
2851
+ return
2852
+
2853
+ yield (self.map_tensor_name(name), data_torch)
2854
+
2855
+
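Editor's note: a worked run of _make_divisible above, with placeholder numbers (multiplier 0.5, model_dim 1280, divisor 256 are illustrative, not read from a real OpenELM config):

    v, divisor = 0.5 * 1280, 256                                      # v = 640.0
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)   # int(768.0) // 256 * 256 = 768
    if new_v < 0.9 * v:                                               # 768 >= 576, no correction needed
        new_v += divisor
    assert new_v == 768                                               # the per-layer FFN width written to GGUF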
2856
+ @Model.register("ArcticForCausalLM")
2857
+ class ArcticModel(Model):
2858
+ model_arch = gguf.MODEL_ARCH.ARCTIC
2859
+
2860
+ def set_vocab(self):
2861
+ # The reason for using a custom implementation here is that the
2862
+ # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
2863
+ # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
2864
+ from sentencepiece import SentencePieceProcessor
2865
+
2866
+ tokenizer_path = self.dir_model / 'tokenizer.model'
2867
+
2868
+ if not tokenizer_path.is_file():
2566
2869
  logger.error(f'Error: Missing {tokenizer_path}')
2567
2870
  sys.exit(1)
2568
2871
 
@@ -2574,7 +2877,7 @@ class ArcticModel(Model):
2574
2877
 
2575
2878
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
2576
2879
  scores: list[float] = [-10000.0] * vocab_size
2577
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
2880
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
2578
2881
 
2579
2882
  for token_id in range(tokenizer.vocab_size()):
2580
2883
 
@@ -2607,7 +2910,7 @@ class ArcticModel(Model):
2607
2910
  added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
2608
2911
  for token_id, token_json in added_tokens_decoder.items():
2609
2912
  token_id = int(token_id)
2610
- if (token_id >= vocab_size):
2913
+ if token_id >= vocab_size:
2611
2914
  logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
2612
2915
  continue
2613
2916
 
@@ -2691,8 +2994,8 @@ class ArcticModel(Model):
2691
2994
 
2692
2995
  return [(self.map_tensor_name(name), data_torch)]
2693
2996
 
2694
- def write_tensors(self):
2695
- super().write_tensors()
2997
+ def prepare_tensors(self):
2998
+ super().prepare_tensors()
2696
2999
 
2697
3000
  if self._experts is not None:
2698
3001
  # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -2701,6 +3004,499 @@ class ArcticModel(Model):
2701
3004
  raise ValueError(f"Unprocessed experts: {experts}")
2702
3005
 
2703
3006
 
3007
+ @Model.register("DeepseekV2ForCausalLM")
3008
+ class DeepseekV2Model(Model):
3009
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK2
3010
+
3011
+ def set_vocab(self):
3012
+ self._set_vocab_gpt2()
3013
+
3014
+ def set_gguf_parameters(self):
3015
+ super().set_gguf_parameters()
3016
+ hparams = self.hparams
3017
+
3018
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
3019
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3020
+ if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
3021
+ self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
3022
+ self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
3023
+ self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
3024
+ self.gguf_writer.add_value_length(hparams["v_head_dim"])
3025
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
3026
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
3027
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
3028
+ self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
3029
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
3030
+
3031
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
3032
+ if self.hparams["rope_scaling"].get("type") == "yarn":
3033
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
3034
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
3035
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
3036
+ self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
3037
+
3038
+ _experts: list[dict[str, Tensor]] | None = None
3039
+
3040
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3041
+ # process the experts separately
3042
+ if name.find("mlp.experts") != -1:
3043
+ n_experts = self.hparams["n_routed_experts"]
3044
+ assert bid is not None
3045
+
3046
+ if self._experts is None:
3047
+ self._experts = [{} for _ in range(self.block_count)]
3048
+
3049
+ self._experts[bid][name] = data_torch
3050
+
3051
+ if len(self._experts[bid]) >= n_experts * 3:
3052
+ tensors: list[tuple[str, Tensor]] = []
3053
+
3054
+ # merge the experts into a single 3d tensor
3055
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
3056
+ datas: list[Tensor] = []
3057
+
3058
+ for xid in range(n_experts):
3059
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
3060
+ datas.append(self._experts[bid][ename])
3061
+ del self._experts[bid][ename]
3062
+
3063
+ data_torch = torch.stack(datas, dim=0)
3064
+
3065
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
3066
+
3067
+ new_name = self.map_tensor_name(merged_name)
3068
+
3069
+ tensors.append((new_name, data_torch))
3070
+ return tensors
3071
+ else:
3072
+ return []
3073
+
3074
+ return [(self.map_tensor_name(name), data_torch)]
3075
+
3076
+ def prepare_tensors(self):
3077
+ super().prepare_tensors()
3078
+
3079
+ if self._experts is not None:
3080
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
3081
+ experts = [k for d in self._experts for k in d.keys()]
3082
+ if len(experts) > 0:
3083
+ raise ValueError(f"Unprocessed experts: {experts}")
3084
+
3085
+
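Editor's note: the expert-merging loop above buffers per-layer expert weights and, once all n_experts * 3 tensors have arrived, emits one stacked 3-D tensor per projection. A toy illustration of just the stacking step (shapes are made up):

    import torch

    n_experts, d_out, d_in = 4, 6, 8
    per_expert = [torch.randn(d_out, d_in) for _ in range(n_experts)]
    merged = torch.stack(per_expert, dim=0)
    assert merged.shape == (n_experts, d_out, d_in)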
3086
+ @Model.register("T5WithLMHeadModel")
3087
+ @Model.register("T5ForConditionalGeneration")
3088
+ @Model.register("MT5ForConditionalGeneration")
3089
+ @Model.register("UMT5ForConditionalGeneration")
3090
+ class T5Model(Model):
3091
+ model_arch = gguf.MODEL_ARCH.T5
3092
+
3093
+ def __init__(self, *args, **kwargs):
3094
+ super().__init__(*args, **kwargs)
3095
+ self.shared_token_embeddings_found = False
3096
+
3097
+ def set_vocab(self):
3098
+ # to avoid TypeError: Descriptors cannot be created directly
3099
+ # exception when importing sentencepiece_model_pb2
3100
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
3101
+ from sentencepiece import SentencePieceProcessor
3102
+ from sentencepiece import sentencepiece_model_pb2 as model
3103
+
3104
+ tokenizer_path = self.dir_model / 'tokenizer.model'
3105
+
3106
+ # many older models use spiece.model tokenizer model filename
3107
+ if not tokenizer_path.is_file():
3108
+ tokenizer_path = self.dir_model / 'spiece.model'
3109
+
3110
+ if not tokenizer_path.is_file():
3111
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
3112
+
3113
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
3114
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
3115
+
3116
+ # some models like Pile-T5 family use BPE tokenizer instead of Unigram
3117
+ if sentencepiece_model.trainer_spec.model_type == 2: # BPE
3118
+ # ensure the tokenizer model file name is correct
3119
+ assert tokenizer_path.name == 'tokenizer.model'
3120
+ return self._set_vocab_sentencepiece()
3121
+ else:
3122
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
3123
+
3124
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
3125
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
3126
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
3127
+
3128
+ tokenizer = SentencePieceProcessor()
3129
+ tokenizer.LoadFromFile(str(tokenizer_path))
3130
+
3131
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
3132
+
3133
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
3134
+ scores: list[float] = [-10000.0] * vocab_size
3135
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
3136
+
3137
+ for token_id in range(tokenizer.vocab_size()):
3138
+ piece = tokenizer.IdToPiece(token_id)
3139
+ text = piece.encode("utf-8")
3140
+ score = tokenizer.GetScore(token_id)
3141
+
3142
+ toktype = SentencePieceTokenTypes.NORMAL
3143
+ if tokenizer.IsUnknown(token_id):
3144
+ toktype = SentencePieceTokenTypes.UNKNOWN
3145
+ elif tokenizer.IsControl(token_id):
3146
+ toktype = SentencePieceTokenTypes.CONTROL
3147
+ elif tokenizer.IsUnused(token_id):
3148
+ toktype = SentencePieceTokenTypes.UNUSED
3149
+ elif tokenizer.IsByte(token_id):
3150
+ toktype = SentencePieceTokenTypes.BYTE
3151
+
3152
+ tokens[token_id] = text
3153
+ scores[token_id] = score
3154
+ toktypes[token_id] = toktype
3155
+
3156
+ added_tokens_file = self.dir_model / 'added_tokens.json'
3157
+ if added_tokens_file.is_file():
3158
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
3159
+ added_tokens_json = json.load(f)
3160
+ for key in added_tokens_json:
3161
+ token_id = added_tokens_json[key]
3162
+ if token_id >= vocab_size:
3163
+ logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
3164
+ continue
3165
+
3166
+ tokens[token_id] = key.encode("utf-8")
3167
+ scores[token_id] = -1000.0
3168
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
3169
+
3170
+ if vocab_size > len(tokens):
3171
+ pad_count = vocab_size - len(tokens)
3172
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
3173
+ for i in range(1, pad_count + 1):
3174
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
3175
+ scores.append(-1000.0)
3176
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
3177
+
3178
+ self.gguf_writer.add_tokenizer_model("t5")
3179
+ self.gguf_writer.add_tokenizer_pre("default")
3180
+ self.gguf_writer.add_token_list(tokens)
3181
+ self.gguf_writer.add_token_scores(scores)
3182
+ self.gguf_writer.add_token_types(toktypes)
3183
+ self.gguf_writer.add_add_space_prefix(add_prefix)
3184
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
3185
+ if precompiled_charsmap:
3186
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
3187
+
3188
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3189
+ special_vocab.add_to_gguf(self.gguf_writer)
3190
+
3191
+ self.gguf_writer.add_add_bos_token(False)
3192
+ self.gguf_writer.add_add_eos_token(True)
3193
+
3194
+ def set_gguf_parameters(self):
3195
+ if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
3196
+ logger.warning("Couldn't find context length in config.json, assuming default value of 512")
3197
+ n_ctx = 512
3198
+ self.gguf_writer.add_context_length(n_ctx)
3199
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
3200
+ self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
3201
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
3202
+ self.gguf_writer.add_head_count(self.hparams["num_heads"])
3203
+ self.gguf_writer.add_key_length(self.hparams["d_kv"])
3204
+ self.gguf_writer.add_value_length(self.hparams["d_kv"])
3205
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
3206
+ self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
3207
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
3208
+ self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
3209
+ self.gguf_writer.add_file_type(self.ftype)
3210
+
3211
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3212
+ del bid # unused
3213
+
3214
+ # T5 based models contain shared token embedding tensors that may be stored as "encoder.embed_tokens.weight",
3215
+ # "decoder.embed_tokens.weight" or "shared.weight". In some models there are even multiple of them stored
3216
+ # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
3217
+ # and decoder and ignore the remaining ones.
3218
+ if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
3219
+ if not self.shared_token_embeddings_found:
3220
+ name = "shared.weight"
3221
+ self.shared_token_embeddings_found = True
3222
+ else:
3223
+ logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
3224
+ return []
3225
+
3226
+ return [(self.map_tensor_name(name), data_torch)]
3227
+
3228
+
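Editor's note: the Unigram-versus-BPE branch in T5Model.set_vocab above keys off the sentencepiece protobuf's trainer_spec.model_type (1 = Unigram, 2 = BPE). A minimal sketch of that probe, reusing the same protobuf workaround the hunk itself applies:

    import os
    os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"  # same workaround as above
    from sentencepiece import sentencepiece_model_pb2 as model

    def spm_model_type(path):
        proto = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
        with open(path, "rb") as f:
            proto.ParseFromString(f.read())
        return proto.trainer_spec.model_type   # 1 == UNIGRAM, 2 == BPE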
3229
+ @Model.register("JAISLMHeadModel")
3230
+ class JaisModel(Model):
3231
+ model_arch = gguf.MODEL_ARCH.JAIS
3232
+
3233
+ def __init__(self, *args, **kwargs):
3234
+ super().__init__(*args, **kwargs)
3235
+
3236
+ # SwiGLU activation
3237
+ assert self.hparams["activation_function"] == "swiglu"
3238
+ # ALiBi position embedding
3239
+ assert self.hparams["position_embedding_type"] == "alibi"
3240
+
3241
+ # Embeddings scale
3242
+ self.embeddings_scale = 1.0
3243
+ # note: For some JAIS flavors, output is tied to (same as) wte in original model
3244
+ self.output_is_wte = False
3245
+ if 'mup_embeddings_scale' in self.hparams:
3246
+ self.output_is_wte = True # Hack (?)
3247
+ self.embeddings_scale = self.hparams['mup_embeddings_scale']
3248
+ elif 'embeddings_scale' in self.hparams:
3249
+ self.embeddings_scale = self.hparams['embeddings_scale']
3250
+ else:
3251
+ assert False
3252
+
3253
+ self.width_scale = 1.0
3254
+ if 'mup_output_alpha' in self.hparams:
3255
+ assert 'mup_width_scale' in self.hparams
3256
+ self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
3257
+ elif 'width_scale' in self.hparams:
3258
+ self.width_scale = self.hparams['width_scale']
3259
+ else:
3260
+ assert False
3261
+
3262
+ self.max_alibi_bias = 8.0
3263
+
3264
+ def set_vocab(self):
3265
+ self._set_vocab_gpt2()
3266
+
3267
+ def set_gguf_parameters(self):
3268
+ self.gguf_writer.add_block_count(self.hparams["n_layer"])
3269
+ self.gguf_writer.add_context_length(self.hparams["n_positions"])
3270
+ self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
3271
+ self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
3272
+ self.gguf_writer.add_head_count(self.hparams["n_head"])
3273
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
3274
+ self.gguf_writer.add_file_type(self.ftype)
3275
+
3276
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3277
+ del bid # unused
3278
+
3279
+ tensors: list[tuple[str, Tensor]] = []
3280
+
3281
+ # we don't need these
3282
+ if name.endswith((".attn.bias")):
3283
+ return tensors
3284
+
3285
+ if name.endswith(("relative_pe.slopes")):
3286
+ # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
3287
+ # Some other models have max_alibi_bias spelled out explicitly in the hyperparams,
3288
+ # but Jais's PyTorch model simply precalculates the slope values and places them
3289
+ # in relative_pe.slopes
3290
+ n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
3291
+ first_val = float(data_torch[0].item())
3292
+ self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
3293
+
3294
+ return tensors
3295
+
3296
+ if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
3297
+ data_torch = data_torch.transpose(1, 0)
3298
+
3299
+ new_name = self.map_tensor_name(name)
3300
+
3301
+ if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
3302
+ tensors.append((new_name, data_torch * self.embeddings_scale))
3303
+ if self.output_is_wte:
3304
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
3305
+ elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
3306
+ assert not self.output_is_wte
3307
+ tensors.append((new_name, data_torch * self.width_scale))
3308
+ else:
3309
+ tensors.append((new_name, data_torch))
3310
+
3311
+ return tensors
3312
+
3313
+ def prepare_tensors(self):
3314
+ super().prepare_tensors()
3315
+ self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
3316
+
3317
+
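Editor's note: the slope-to-bias inversion in JaisModel above assumes the standard ALiBi slope pattern, where the first head's slope is 2 ** (-max_bias / n_head). A worked check with n_head = 8 as an illustrative value:

    import math

    n_head = 8
    n_head_closest_log2 = 2 ** math.floor(math.log2(n_head))   # 8
    first_slope = 2 ** (-8.0 / n_head_closest_log2)             # 0.5 when max_alibi_bias == 8
    recovered = -round(math.log2(first_slope) * n_head_closest_log2)
    assert recovered == 8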
3318
+ @Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
3319
+ class ChatGLMModel(Model):
3320
+ model_arch = gguf.MODEL_ARCH.CHATGLM
3321
+
3322
+ def set_vocab_chatglm3(self):
3323
+ dir_model = self.dir_model
3324
+ hparams = self.hparams
3325
+ tokens: list[bytes] = []
3326
+ toktypes: list[int] = []
3327
+ scores: list[float] = []
3328
+
3329
+ from transformers import AutoTokenizer
3330
+ tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
3331
+ vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
3332
+ assert max(tokenizer.get_vocab().values()) < vocab_size
3333
+ role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
3334
+ special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
3335
+ for token_id in range(vocab_size):
3336
+ piece = tokenizer._convert_id_to_token(token_id)
3337
+ if token_id == 0:
3338
+ piece = "<unk>"
3339
+ elif token_id == 1:
3340
+ piece = "<bos>"
3341
+ elif token_id == 2:
3342
+ piece = "<eos>"
3343
+
3344
+ text = piece.encode("utf-8")
3345
+ score = 0.0
3346
+ # Referencing the tokenizer Python implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
3347
+ # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
3348
+ if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
3349
+ score = tokenizer.tokenizer.sp_model.get_score(token_id)
3350
+
3351
+ if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
3352
+ if piece in special_tokens:
3353
+ toktype = SentencePieceTokenTypes.CONTROL
3354
+ elif len(piece) == 0:
3355
+ text = f"[PAD{token_id}]".encode("utf-8")
3356
+ toktype = SentencePieceTokenTypes.UNUSED
3357
+ else:
3358
+ toktype = SentencePieceTokenTypes.USER_DEFINED
3359
+ tokens.append(text)
3360
+ scores.append(score)
3361
+ toktypes.append(toktype)
3362
+ continue
3363
+
3364
+ toktype = SentencePieceTokenTypes.NORMAL
3365
+ if tokenizer.tokenizer.sp_model.is_unknown(token_id):
3366
+ toktype = SentencePieceTokenTypes.UNKNOWN
3367
+ elif tokenizer.tokenizer.sp_model.is_control(token_id):
3368
+ toktype = SentencePieceTokenTypes.CONTROL
3369
+ elif tokenizer.tokenizer.sp_model.is_unused(token_id):
3370
+ toktype = SentencePieceTokenTypes.UNUSED
3371
+ elif tokenizer.tokenizer.sp_model.is_byte(token_id):
3372
+ toktype = SentencePieceTokenTypes.BYTE
3373
+
3374
+ tokens.append(text)
3375
+ scores.append(score)
3376
+ toktypes.append(toktype)
3377
+
3378
+ self.gguf_writer.add_tokenizer_model("llama")
3379
+ # glm3 needs prefix and suffix formatted as:
3380
+ # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
3381
+ self.gguf_writer.add_tokenizer_pre("chatglm-spm")
3382
+ self.gguf_writer.add_token_list(tokens)
3383
+ self.gguf_writer.add_token_scores(scores)
3384
+ self.gguf_writer.add_token_types(toktypes)
3385
+
3386
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3387
+ special_vocab.add_to_gguf(self.gguf_writer)
3388
+
3389
+ @staticmethod
3390
+ def token_bytes_to_string(b):
3391
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
3392
+ byte_encoder = bytes_to_unicode()
3393
+ return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
3394
+
3395
+ @staticmethod
3396
+ def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
3397
+ parts = [bytes([b]) for b in token]
3398
+ while True:
3399
+ min_idx = None
3400
+ min_rank = None
3401
+ for i, pair in enumerate(zip(parts[:-1], parts[1:])):
3402
+ rank = mergeable_ranks.get(pair[0] + pair[1])
3403
+ if rank is not None and (min_rank is None or rank < min_rank):
3404
+ min_idx = i
3405
+ min_rank = rank
3406
+ if min_rank is None or (max_rank is not None and min_rank >= max_rank):
3407
+ break
3408
+ assert min_idx is not None
3409
+ parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
3410
+ return parts
3411
+
3412
+ def set_vocab(self):
3413
+ if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
3414
+ self.set_vocab_chatglm3()
3415
+ return
3416
+
3417
+ dir_model = self.dir_model
3418
+ hparams = self.hparams
3419
+ tokens: list[str] = []
3420
+ toktypes: list[int] = []
3421
+
3422
+ from transformers import AutoTokenizer
3423
+ tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
3424
+ vocab_size = hparams["padded_vocab_size"]
3425
+ assert max(tokenizer.get_vocab().values()) < vocab_size
3426
+
3427
+ tokpre = self.get_vocab_base_pre(tokenizer)
3428
+
3429
+ merges = []
3430
+ vocab = {}
3431
+ mergeable_ranks = tokenizer.mergeable_ranks
3432
+ for token, rank in mergeable_ranks.items():
3433
+ vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
3434
+ if len(token) == 1:
3435
+ continue
3436
+ merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
3437
+ assert len(merged) >= 2 and len(merged) <= 7
3438
+ merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
3439
+
3440
+ # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
3441
+ added_vocab = tokenizer.get_added_vocab()
3442
+ reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
3443
+
3444
+ for i in range(vocab_size):
3445
+ if i not in reverse_vocab:
3446
+ tokens.append(f"[PAD{i}]")
3447
+ toktypes.append(gguf.TokenType.UNUSED)
3448
+ elif reverse_vocab[i] in added_vocab:
3449
+ tokens.append(reverse_vocab[i])
3450
+ if tokenizer.added_tokens_decoder[i].special:
3451
+ toktypes.append(gguf.TokenType.CONTROL)
3452
+ else:
3453
+ toktypes.append(gguf.TokenType.USER_DEFINED)
3454
+ else:
3455
+ tokens.append(reverse_vocab[i])
3456
+ toktypes.append(gguf.TokenType.NORMAL)
3457
+
3458
+ self.gguf_writer.add_tokenizer_model("gpt2")
3459
+ self.gguf_writer.add_tokenizer_pre(tokpre)
3460
+ self.gguf_writer.add_token_list(tokens)
3461
+ self.gguf_writer.add_token_types(toktypes)
3462
+
3463
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
3464
+ special_vocab.merges = merges
3465
+ # only add special tokens when they were not already loaded from config.json
3466
+ special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
3467
+ special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
3468
+ # this one is usually not in config.json anyway
3469
+ special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
3470
+ special_vocab.add_to_gguf(self.gguf_writer)
3471
+
3472
+ def set_gguf_parameters(self):
3473
+ n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
3474
+ n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
3475
+ n_head_kv = self.hparams.get("multi_query_group_num", n_head)
3476
+ self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
3477
+ self.gguf_writer.add_embedding_length(n_embed)
3478
+ self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
3479
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
3480
+ self.gguf_writer.add_head_count(n_head)
3481
+ self.gguf_writer.add_head_count_kv(n_head_kv)
3482
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
3483
+ self.gguf_writer.add_file_type(self.ftype)
3484
+ self.gguf_writer.add_rope_dimension_count(64)
3485
+ self.gguf_writer.add_add_bos_token(False)
3486
+ rope_freq = 10000
3487
+ if "rope_ratio" in self.hparams:
3488
+ rope_freq = rope_freq * self.hparams["rope_ratio"]
3489
+ self.gguf_writer.add_rope_freq_base(rope_freq)
3490
+
3491
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3492
+ del bid # unused
3493
+
3494
+ if name.endswith(".rotary_pos_emb.inv_freq"):
3495
+ return []
3496
+
3497
+ name = name.removeprefix("transformer.")
3498
+ return [(self.map_tensor_name(name), data_torch)]
3499
+
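Editor's note: the bpe() helper in the ChatGLM block above reconstructs, for each multi-byte token, which two sub-tokens merge into it by re-running lowest-rank pair merges with the token's own rank as the cutoff. A toy run of that idea with a made-up rank table (this loop mirrors the helper, it is not the helper itself):

    ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}

    def last_merge(token, max_rank):
        parts = [bytes([b]) for b in token]
        while True:
            best = min(
                ((ranks.get(l + r), i) for i, (l, r) in enumerate(zip(parts[:-1], parts[1:]))
                 if ranks.get(l + r) is not None),
                default=(None, None),
            )
            rank, idx = best
            if rank is None or rank >= max_rank:
                break
            parts = parts[:idx] + [parts[idx] + parts[idx + 1]] + parts[idx + 2:]
        return parts

    assert last_merge(b"abc", ranks[b"abc"]) == [b"ab", b"c"]   # recorded as the merge "ab c"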
2704
3500
  ###### CONVERSION LOGIC ######
2705
3501
 
2706
3502
 
@@ -2717,19 +3513,46 @@ class LazyTorchTensor(gguf.LazyBase):
2717
3513
  torch.float32: np.float32,
2718
3514
  }
2719
3515
 
3516
+ # used for safetensors slices
3517
+ # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
3518
+ # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
3519
+ _dtype_str_map: dict[str, torch.dtype] = {
3520
+ "F64": torch.float64,
3521
+ "F32": torch.float32,
3522
+ "BF16": torch.bfloat16,
3523
+ "F16": torch.float16,
3524
+ # "U64": torch.uint64,
3525
+ "I64": torch.int64,
3526
+ # "U32": torch.uint32,
3527
+ "I32": torch.int32,
3528
+ # "U16": torch.uint16,
3529
+ "I16": torch.int16,
3530
+ "U8": torch.uint8,
3531
+ "I8": torch.int8,
3532
+ "BOOL": torch.bool,
3533
+ "F8_E4M3": torch.float8_e4m3fn,
3534
+ "F8_E5M2": torch.float8_e5m2,
3535
+ }
3536
+
2720
3537
  def numpy(self) -> gguf.LazyNumpyTensor:
2721
3538
  dtype = self._dtype_map[self.dtype]
2722
3539
  return gguf.LazyNumpyTensor(
2723
3540
  meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
2724
- lazy=self._lazy,
2725
3541
  args=(self,),
2726
- func=(lambda s: s[0].numpy())
3542
+ func=(lambda s: s.numpy())
2727
3543
  )
2728
3544
 
2729
3545
  @classmethod
2730
- def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
3546
+ def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
2731
3547
  return torch.empty(size=shape, dtype=dtype, device="meta")
2732
3548
 
3549
+ @classmethod
3550
+ def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
3551
+ dtype = cls._dtype_str_map[st_slice.get_dtype()]
3552
+ shape: tuple[int, ...] = tuple(st_slice.get_shape())
3553
+ lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
3554
+ return cast(torch.Tensor, lazy)
3555
+
2733
3556
  @classmethod
2734
3557
  def __torch_function__(cls, func, types, args=(), kwargs=None):
2735
3558
  del types # unused
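Editor's note: from_safetensors_slice above wraps a safetensors slice so dtype and shape are known up front while the actual read (s[:]) is deferred. A sketch of the caller side as I understand the safetensors API to be used here; safe_open/get_slice usage and the helper name are assumptions, not taken from this hunk:

    from safetensors import safe_open

    def lazy_tensor_from_file(path, tensor_name):
        f = safe_open(path, framework="pt")               # keep the handle alive while lazy
        st_slice = f.get_slice(tensor_name)               # metadata only: get_dtype()/get_shape()
        return LazyTorchTensor.from_safetensors_slice(st_slice)   # s[:] runs later, on demand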
@@ -2740,7 +3563,7 @@ class LazyTorchTensor(gguf.LazyBase):
2740
3563
  if func is torch.Tensor.numpy:
2741
3564
  return args[0].numpy()
2742
3565
 
2743
- return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
3566
+ return cls._wrap_fn(func)(*args, **kwargs)
2744
3567
 
2745
3568
 
2746
3569
  def parse_args() -> argparse.Namespace:
@@ -2750,10 +3573,6 @@ def parse_args() -> argparse.Namespace:
2750
3573
  "--vocab-only", action="store_true",
2751
3574
  help="extract only the vocab",
2752
3575
  )
2753
- parser.add_argument(
2754
- "--awq-path", type=Path, default=None,
2755
- help="Path to scale awq cache file",
2756
- )
2757
3576
  parser.add_argument(
2758
3577
  "--outfile", type=Path,
2759
3578
  help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -2786,30 +3605,58 @@ def parse_args() -> argparse.Namespace:
2786
3605
  "--verbose", action="store_true",
2787
3606
  help="increase output verbosity",
2788
3607
  )
3608
+ parser.add_argument(
3609
+ "--split-max-tensors", type=int, default=0,
3610
+ help="max tensors in each split",
3611
+ )
3612
+ parser.add_argument(
3613
+ "--split-max-size", type=str, default="0",
3614
+ help="max size per split N(M|G)",
3615
+ )
3616
+ parser.add_argument(
3617
+ "--dry-run", action="store_true",
3618
+ help="only print out a split plan and exit, without writing any new files",
3619
+ )
3620
+ parser.add_argument(
3621
+ "--no-tensor-first-split", action="store_true",
3622
+ help="do not add tensors to the first split (disabled by default)"
3623
+ )
3624
+ parser.add_argument(
3625
+ "--metadata", type=Path,
3626
+ help="Specify the path for an authorship metadata override file"
3627
+ )
2789
3628
 
2790
3629
  return parser.parse_args()
2791
3630
 
2792
3631
 
3632
+ def split_str_to_n_bytes(split_str: str) -> int:
3633
+ if split_str.endswith("K"):
3634
+ n = int(split_str[:-1]) * 1000
3635
+ elif split_str.endswith("M"):
3636
+ n = int(split_str[:-1]) * 1000 * 1000
3637
+ elif split_str.endswith("G"):
3638
+ n = int(split_str[:-1]) * 1000 * 1000 * 1000
3639
+ elif split_str.isnumeric():
3640
+ n = int(split_str)
3641
+ else:
3642
+ raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
3643
+
3644
+ if n < 0:
3645
+ raise ValueError(f"Invalid split size: {split_str}, must be positive")
3646
+
3647
+ return n
3648
+
3649
+
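Editor's note: worked values for split_str_to_n_bytes above; the suffixes are decimal multipliers (powers of 1000), matching the code, and the default string "0" leaves size-based splitting disabled:

    assert split_str_to_n_bytes("250M") == 250 * 1000 * 1000
    assert split_str_to_n_bytes("5G") == 5 * 1000 * 1000 * 1000
    assert split_str_to_n_bytes("0") == 0   # "0" means no size-based splitting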
2793
3650
  def main() -> None:
2794
3651
  args = parse_args()
2795
3652
 
2796
- logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
3653
+ if args.verbose:
3654
+ logging.basicConfig(level=logging.DEBUG)
3655
+ else:
3656
+ logging.basicConfig(level=logging.INFO)
2797
3657
 
2798
3658
  dir_model = args.model
2799
3659
 
2800
- if args.awq_path:
2801
- sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
2802
- from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
2803
- tmp_model_path = args.model / "weighted_model"
2804
- dir_model = tmp_model_path
2805
- if tmp_model_path.is_dir():
2806
- logger.info(f"{tmp_model_path} exists as a weighted model.")
2807
- else:
2808
- tmp_model_path.mkdir(parents=True, exist_ok=True)
2809
- logger.info("Saving new weighted model ...")
2810
- add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
2811
- logger.info(f"Saved weighted model at {tmp_model_path}.")
2812
-
2813
3660
  if not dir_model.is_dir():
2814
3661
  logger.error(f'Error: {args.model} is not a directory')
2815
3662
  sys.exit(1)
@@ -2822,36 +3669,47 @@ def main() -> None:
2822
3669
  "auto": gguf.LlamaFileType.GUESSED,
2823
3670
  }
2824
3671
 
3672
+ is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
3673
+ if args.use_temp_file and is_split:
3674
+ logger.error("Error: Cannot use temp file when splitting")
3675
+ sys.exit(1)
3676
+
2825
3677
  if args.outfile is not None:
2826
3678
  fname_out = args.outfile
2827
3679
  else:
2828
- # output in the same directory as the model by default
2829
- fname_out = dir_model / 'ggml-model-{ftype}.gguf'
3680
+ fname_out = dir_model
2830
3681
 
2831
3682
  logger.info(f"Loading model: {dir_model.name}")
2832
3683
 
2833
3684
  hparams = Model.load_hparams(dir_model)
2834
3685
 
2835
3686
  with torch.inference_mode():
2836
- model_class = Model.from_model_architecture(hparams["architectures"][0])
2837
- model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
3687
+ output_type = ftype_map[args.outtype]
3688
+ model_architecture = hparams["architectures"][0]
2838
3689
 
2839
- logger.info("Set model parameters")
2840
- model_instance.set_gguf_parameters()
2841
-
2842
- logger.info("Set model tokenizer")
2843
- model_instance.set_vocab()
3690
+ try:
3691
+ model_class = Model.from_model_architecture(model_architecture)
3692
+ except NotImplementedError:
3693
+ logger.error(f"Model {model_architecture} is not supported")
3694
+ sys.exit(1)
2844
3695
 
2845
- model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
3696
+ model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
3697
+ is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
3698
+ eager=args.no_lazy,
3699
+ metadata_override=args.metadata, model_name=args.model_name,
3700
+ split_max_tensors=args.split_max_tensors,
3701
+ split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
3702
+ small_first_shard=args.no_tensor_first_split)
2846
3703
 
2847
3704
  if args.vocab_only:
2848
- logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
3705
+ logger.info("Exporting model vocab...")
2849
3706
  model_instance.write_vocab()
3707
+ logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
2850
3708
  else:
2851
- logger.info(f"Exporting model to '{model_instance.fname_out}'")
3709
+ logger.info("Exporting model...")
2852
3710
  model_instance.write()
2853
-
2854
- logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
3711
+ out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
3712
+ logger.info(f"Model successfully exported to {out_path}")
2855
3713
 
2856
3714
 
2857
3715
  if __name__ == '__main__':