bigdl-core-cpp 2.5.0b20240725__py3-none-manylinux2010_x86_64.whl → 2.5.0b20240727__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. bigdl/cpp/convert-hf-to-gguf.py +1106 -320
  2. bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
  3. bigdl/cpp/gguf-py/gguf/constants.py +442 -173
  4. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  5. bigdl/cpp/gguf-py/gguf/gguf_reader.py +29 -8
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +472 -156
  7. bigdl/cpp/gguf-py/gguf/lazy.py +24 -49
  8. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +195 -23
  9. bigdl/cpp/libs/baby-llama +0 -0
  10. bigdl/cpp/libs/batched +0 -0
  11. bigdl/cpp/libs/batched-bench +0 -0
  12. bigdl/cpp/libs/benchmark +0 -0
  13. bigdl/cpp/libs/embedding +0 -0
  14. bigdl/cpp/libs/gguf +0 -0
  15. bigdl/cpp/libs/imatrix +0 -0
  16. bigdl/cpp/libs/llama-bench +0 -0
  17. bigdl/cpp/libs/llava-cli +0 -0
  18. bigdl/cpp/libs/lookahead +0 -0
  19. bigdl/cpp/libs/lookup +0 -0
  20. bigdl/cpp/libs/ls-sycl-device +0 -0
  21. bigdl/cpp/libs/main +0 -0
  22. bigdl/cpp/libs/ollama +0 -0
  23. bigdl/cpp/libs/perplexity +0 -0
  24. bigdl/cpp/libs/quantize +0 -0
  25. bigdl/cpp/libs/quantize-stats +0 -0
  26. bigdl/cpp/libs/save-load-state +0 -0
  27. bigdl/cpp/libs/server +0 -0
  28. bigdl/cpp/libs/speculative +0 -0
  29. bigdl/cpp/libs/tokenize +0 -0
  30. {bigdl_core_cpp-2.5.0b20240725.dist-info → bigdl_core_cpp-2.5.0b20240727.dist-info}/METADATA +1 -1
  31. bigdl_core_cpp-2.5.0b20240727.dist-info/RECORD +43 -0
  32. bigdl_core_cpp-2.5.0b20240725.dist-info/RECORD +0 -43
  33. {bigdl_core_cpp-2.5.0b20240725.data → bigdl_core_cpp-2.5.0b20240727.data}/scripts/init-llama-cpp +0 -0
  34. {bigdl_core_cpp-2.5.0b20240725.data → bigdl_core_cpp-2.5.0b20240727.data}/scripts/init-ollama +0 -0
  35. {bigdl_core_cpp-2.5.0b20240725.dist-info → bigdl_core_cpp-2.5.0b20240727.dist-info}/WHEEL +0 -0
  36. {bigdl_core_cpp-2.5.0b20240725.dist-info → bigdl_core_cpp-2.5.0b20240727.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
2
3
 
3
4
  from __future__ import annotations
4
5
 
@@ -12,7 +13,7 @@ import sys
12
13
  from enum import IntEnum
13
14
  from pathlib import Path
14
15
  from hashlib import sha256
15
- from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
16
+ from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
16
17
 
17
18
  import math
18
19
  import numpy as np
@@ -25,10 +26,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
25
26
  sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
26
27
  import gguf
27
28
 
28
- from convert import LlamaHfVocab
29
-
30
- logger = logging.getLogger("hf-to-gguf")
31
-
32
29
  logger = logging.getLogger("hf-to-gguf")
33
30
 
34
31
 
@@ -50,7 +47,8 @@ class Model:
50
47
  _model_classes: dict[str, type[Model]] = {}
51
48
 
52
49
  dir_model: Path
53
- ftype: int
50
+ ftype: gguf.LlamaFileType
51
+ fname_out: Path
54
52
  is_big_endian: bool
55
53
  endianess: gguf.GGUFEndian
56
54
  use_temp_file: bool
@@ -61,29 +59,41 @@ class Model:
61
59
  block_count: int
62
60
  tensor_map: gguf.TensorNameMap
63
61
  tensor_names: set[str] | None
64
- fname_out: Path
65
62
  gguf_writer: gguf.GGUFWriter
63
+ model_name: str | None
64
+ metadata_override: Path | None
65
+ dir_model_card: Path
66
66
 
67
67
  # subclasses should define this!
68
68
  model_arch: gguf.MODEL_ARCH
69
69
 
70
- def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
70
+ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
71
+ use_temp_file: bool = False, eager: bool = False,
72
+ metadata_override: Path | None = None, model_name: str | None = None,
73
+ split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
71
74
  if type(self) is Model:
72
75
  raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
76
+
73
77
  self.dir_model = dir_model
74
78
  self.ftype = ftype
79
+ self.fname_out = fname_out
75
80
  self.is_big_endian = is_big_endian
76
81
  self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
77
82
  self.use_temp_file = use_temp_file
78
83
  self.lazy = not eager
79
- self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
84
+ self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
80
85
  self.is_safetensors = len(self.part_names) > 0
81
86
  if not self.is_safetensors:
82
- self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
87
+ self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
83
88
  self.hparams = Model.load_hparams(self.dir_model)
84
- self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
89
+ self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
85
90
  self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
86
91
  self.tensor_names = None
92
+ self.metadata_override = metadata_override
93
+ self.model_name = model_name
94
+ self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
95
+
96
+ # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
87
97
  if self.ftype == gguf.LlamaFileType.GUESSED:
88
98
  # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
89
99
  _, first_tensor = next(self.get_tensors())
@@ -93,11 +103,10 @@ class Model:
93
103
  else:
94
104
  logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
95
105
  self.ftype = gguf.LlamaFileType.MOSTLY_BF16
96
- ftype_up: str = self.ftype.name.partition("_")[2].upper()
97
- ftype_lw: str = ftype_up.lower()
98
- # allow templating the file name with the output ftype, useful with the "auto" ftype
99
- self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
100
- self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
106
+
107
+ # Configure GGUF Writer
108
+ self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
109
+ split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
101
110
 
102
111
  @classmethod
103
112
  def __init_subclass__(cls):
@@ -147,9 +156,16 @@ class Model:
147
156
  tensor_names_from_parts.update(model_part.keys())
148
157
 
149
158
  for name in model_part.keys():
150
- data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
151
- if self.lazy:
152
- data = LazyTorchTensor.from_eager(data)
159
+ if self.is_safetensors:
160
+ if self.lazy:
161
+ data = model_part.get_slice(name)
162
+ data = LazyTorchTensor.from_safetensors_slice(data)
163
+ else:
164
+ data = model_part.get_tensor(name)
165
+ else:
166
+ data = model_part[name]
167
+ if self.lazy:
168
+ data = LazyTorchTensor.from_eager(data)
153
169
  yield name, data
154
170
 
155
171
  # only verify tensor name presence; it doesn't matter if they are not in the right files
@@ -185,7 +201,6 @@ class Model:
185
201
  return new_name
186
202
 
187
203
  def set_gguf_parameters(self):
188
- self.gguf_writer.add_name(self.dir_model.name)
189
204
  self.gguf_writer.add_block_count(self.block_count)
190
205
 
191
206
  if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -224,6 +239,10 @@ class Model:
224
239
  self.gguf_writer.add_expert_used_count(n_experts_used)
225
240
  logger.info(f"gguf: experts used count = {n_experts_used}")
226
241
 
242
+ if (head_dim := self.hparams.get("head_dim")) is not None:
243
+ self.gguf_writer.add_key_length(head_dim)
244
+ self.gguf_writer.add_value_length(head_dim)
245
+
227
246
  self.gguf_writer.add_file_type(self.ftype)
228
247
  logger.info(f"gguf: file type = {self.ftype}")
229
248
 
@@ -242,7 +261,7 @@ class Model:
242
261
 
243
262
  return False
244
263
 
245
- def write_tensors(self):
264
+ def prepare_tensors(self):
246
265
  max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
247
266
 
248
267
  for name, data_torch in self.get_tensors():
@@ -264,7 +283,7 @@ class Model:
264
283
  break
265
284
 
266
285
  for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
267
- data: np.ndarray = data # type hint
286
+ data: np.ndarray # type hint
268
287
  n_dims = len(data.shape)
269
288
  data_dtype = data.dtype
270
289
  data_qtype: gguf.GGMLQuantizationType | None = None
@@ -325,23 +344,80 @@ class Model:
325
344
 
326
345
  self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
327
346
 
347
+ def set_type(self):
348
+ self.gguf_writer.add_type(gguf.GGUFType.MODEL)
349
+
350
+ def prepare_metadata(self, vocab_only: bool):
351
+
352
+ total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
353
+
354
+ self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
355
+
356
+ # Fallback to model directory name if metadata name is still missing
357
+ if self.metadata.name is None:
358
+ self.metadata.name = self.dir_model.name
359
+
360
+ # Generate parameter weight class (useful for leader boards) if not yet determined
361
+ if self.metadata.size_label is None and total_params > 0:
362
+ self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
363
+
364
+ # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
365
+ output_type: str = self.ftype.name.partition("_")[2]
366
+
367
+ # Filename Output
368
+ if self.fname_out.is_dir():
369
+ # Generate default filename based on model specification and available metadata
370
+ if not vocab_only:
371
+ fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
372
+ else:
373
+ fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
374
+
375
+ # Use the default filename
376
+ self.fname_out = self.fname_out / f"{fname_default}.gguf"
377
+ else:
378
+ # Output path is a custom defined templated filename
379
+ # Note: `not is_dir()` is used because `.is_file()` will not detect
380
+ # file template strings as it doesn't actually exist as a file
381
+
382
+ # Process templated file name with the output ftype, useful with the "auto" ftype
383
+ self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
384
+
385
+ self.set_type()
386
+
387
+ logger.info("Set meta model")
388
+ self.metadata.set_gguf_meta_model(self.gguf_writer)
389
+
390
+ logger.info("Set model parameters")
391
+ self.set_gguf_parameters()
392
+
393
+ logger.info("Set model tokenizer")
394
+ self.set_vocab()
395
+
396
+ logger.info("Set model quantization version")
397
+ self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
398
+
328
399
  def write(self):
329
- self.write_tensors()
330
- self.gguf_writer.write_header_to_file()
400
+ self.prepare_tensors()
401
+ self.prepare_metadata(vocab_only=False)
402
+ self.gguf_writer.write_header_to_file(path=self.fname_out)
331
403
  self.gguf_writer.write_kv_data_to_file()
332
404
  self.gguf_writer.write_tensors_to_file(progress=True)
333
405
  self.gguf_writer.close()
334
406
 
335
407
  def write_vocab(self):
336
- self.gguf_writer.write_header_to_file()
408
+ if len(self.gguf_writer.tensors) != 1:
409
+ raise ValueError('Splitting the vocabulary is not supported')
410
+
411
+ self.prepare_metadata(vocab_only=True)
412
+ self.gguf_writer.write_header_to_file(path=self.fname_out)
337
413
  self.gguf_writer.write_kv_data_to_file()
338
414
  self.gguf_writer.close()
339
415
 
340
416
  @staticmethod
341
- def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
417
+ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
342
418
  part_names: list[str] = []
343
419
  for filename in os.listdir(dir_model):
344
- if filename.endswith(suffix):
420
+ if filename.startswith(prefix) and filename.endswith(suffix):
345
421
  part_names.append(filename)
346
422
 
347
423
  part_names.sort()
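
The prepare_metadata() additions above move output-filename handling out of __init__: when the output path is a directory a default name is built with gguf.naming_convention(), otherwise the given name is passed through gguf.fill_templated_filename(). A minimal sketch of the templating, assuming the new helper keeps the same str.format keys as the inline code it replaces (the model name below is made up for illustration):

    # Hypothetical example of the filename templating; mirrors the removed inline
    # `fname_out.name.format(...)` call, with "Mistral-7B" as a stand-in model name.
    ftype_up = "Q8_0"             # e.g. from gguf.LlamaFileType.MOSTLY_Q8_0.name.partition("_")[2]
    ftype_lw = ftype_up.lower()
    out_name = "Mistral-7B-{ftype}.gguf".format(
        ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
    print(out_name)               # Mistral-7B-q8_0.gguf
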
@@ -370,6 +446,29 @@ class Model:
370
446
  except KeyError:
371
447
  raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
372
448
 
449
+ def does_token_look_special(self, token: str | bytes) -> bool:
450
+ if isinstance(token, (bytes, bytearray)):
451
+ token_text = token.decode(encoding="utf-8")
452
+ elif isinstance(token, memoryview):
453
+ token_text = token.tobytes().decode(encoding="utf-8")
454
+ else:
455
+ token_text = token
456
+
457
+ # Some models mark some added tokens which ought to be control tokens as not special.
458
+ # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
459
+ seems_special = token_text in (
460
+ "<pad>", # deepseek-coder
461
+ "<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
462
+ )
463
+
464
+ seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
465
+ seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>")) # deepseek-coder
466
+
467
+ # TODO: should these be marked as UNUSED instead? (maybe not)
468
+ seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2}
469
+
470
+ return seems_special
471
+
373
472
  # used for GPT-2 BPE and WordPiece vocabs
374
473
  def get_vocab_base(self) -> tuple[list[str], list[int], str]:
375
474
  tokens: list[str] = []
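
The new does_token_look_special() heuristic above promotes added tokens that look like control markers even when the tokenizer config does not flag them as special. A quick spot-check of the same rules, reduced to a standalone function for illustration (the real method also accepts bytes/memoryview input and checks the full-width <｜ … ｜> variant used by deepseek-coder):

    # Standalone restatement of the heuristic, for illustration only.
    def looks_special(token_text: str) -> bool:
        if token_text in ("<pad>", "<mask>", "<2mass>", "[@BOS@]"):
            return True
        if token_text.startswith("<|") and token_text.endswith("|>"):
            return True
        if token_text.startswith("<unused") and token_text.endswith(">"):
            return True
        return False

    assert looks_special("<|im_end|>")
    assert looks_special("<unused42>")
    assert not looks_special("hello")
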
@@ -388,20 +487,22 @@ class Model:
388
487
  for i in range(vocab_size):
389
488
  if i not in reverse_vocab:
390
489
  tokens.append(f"[PAD{i}]")
391
- toktypes.append(gguf.TokenType.USER_DEFINED)
392
- elif reverse_vocab[i] in added_vocab:
393
- tokens.append(reverse_vocab[i])
394
- if tokenizer.added_tokens_decoder[i].special:
395
- toktypes.append(gguf.TokenType.CONTROL)
396
- else:
397
- toktypes.append(gguf.TokenType.USER_DEFINED)
490
+ toktypes.append(gguf.TokenType.UNUSED)
398
491
  else:
399
- tokens.append(reverse_vocab[i])
400
- toktypes.append(gguf.TokenType.NORMAL)
492
+ token: str = reverse_vocab[i]
493
+ if token in added_vocab:
494
+ if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
495
+ toktypes.append(gguf.TokenType.CONTROL)
496
+ else:
497
+ token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
498
+ toktypes.append(gguf.TokenType.USER_DEFINED)
499
+ else:
500
+ toktypes.append(gguf.TokenType.NORMAL)
501
+ tokens.append(token)
401
502
 
402
503
  return tokens, toktypes, tokpre
403
504
 
404
- # NOTE: this function is generated by convert-hf-to-gguf-update.py
505
+ # NOTE: this function is generated by convert_hf_to_gguf_update.py
405
506
  # do not modify it manually!
406
507
  # ref: https://github.com/ggerganov/llama.cpp/pull/6920
407
508
  # Marker: Start get_vocab_base_pre
@@ -421,7 +522,7 @@ class Model:
421
522
 
422
523
  res = None
423
524
 
424
- # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
525
+ # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
425
526
  # or pull the latest version of the model from Huggingface
426
527
  # don't edit the hashes manually!
427
528
  if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
@@ -478,15 +579,39 @@ class Model:
478
579
  if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
479
580
  # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
480
581
  res = "smaug-bpe"
582
+ if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
583
+ # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
584
+ res = "poro-chat"
585
+ if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
586
+ # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
587
+ res = "jina-v2-code"
588
+ if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
589
+ # ref: https://huggingface.co/THUDM/glm-4-9b-chat
590
+ res = "chatglm-bpe"
591
+ if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
592
+ # ref: https://huggingface.co/LumiOpen/Viking-7B
593
+ res = "viking"
594
+ if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
595
+ # ref: https://huggingface.co/core42/jais-13b
596
+ res = "jais"
597
+ if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
598
+ # ref: https://huggingface.co/WisdomShell/CodeShell-7B
599
+ res = "codeshell"
600
+ if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
601
+ # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
602
+ res = "tekken"
603
+ if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
604
+ # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
605
+ res = "smollm"
481
606
 
482
607
  if res is None:
483
608
  logger.warning("\n")
484
609
  logger.warning("**************************************************************************************")
485
610
  logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
486
611
  logger.warning("** There are 2 possible reasons for this:")
487
- logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
612
+ logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
488
613
  logger.warning("** - the pre-tokenization config has changed upstream")
489
- logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
614
+ logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
490
615
  logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
491
616
  logger.warning("**")
492
617
  logger.warning(f"** chkhsh: {chkhsh}")
@@ -541,7 +666,7 @@ class Model:
541
666
  for i in range(vocab_size):
542
667
  if i not in reverse_vocab:
543
668
  tokens.append(f"[PAD{i}]")
544
- toktypes.append(gguf.TokenType.USER_DEFINED)
669
+ toktypes.append(gguf.TokenType.UNUSED)
545
670
  elif reverse_vocab[i] in added_vocab:
546
671
  tokens.append(reverse_vocab[i])
547
672
  toktypes.append(gguf.TokenType.CONTROL)
@@ -564,15 +689,23 @@ class Model:
564
689
  special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
565
690
  special_vocab.add_to_gguf(self.gguf_writer)
566
691
 
567
- def _set_vocab_sentencepiece(self):
692
+ def _set_vocab_sentencepiece(self, add_to_gguf=True):
693
+ tokens, scores, toktypes = self._create_vocab_sentencepiece()
694
+
695
+ self.gguf_writer.add_tokenizer_model("llama")
696
+ self.gguf_writer.add_tokenizer_pre("default")
697
+ self.gguf_writer.add_token_list(tokens)
698
+ self.gguf_writer.add_token_scores(scores)
699
+ self.gguf_writer.add_token_types(toktypes)
700
+
701
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
702
+ special_vocab.add_to_gguf(self.gguf_writer)
703
+
704
+ def _create_vocab_sentencepiece(self):
568
705
  from sentencepiece import SentencePieceProcessor
569
706
 
570
707
  tokenizer_path = self.dir_model / 'tokenizer.model'
571
708
 
572
- tokens: list[bytes] = []
573
- scores: list[float] = []
574
- toktypes: list[int] = []
575
-
576
709
  if not tokenizer_path.is_file():
577
710
  raise FileNotFoundError(f"File not found: {tokenizer_path}")
578
711
 
@@ -583,7 +716,7 @@ class Model:
583
716
 
584
717
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
585
718
  scores: list[float] = [-10000.0] * vocab_size
586
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
719
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
587
720
 
588
721
  for token_id in range(tokenizer.vocab_size()):
589
722
  piece = tokenizer.IdToPiece(token_id)
@@ -610,7 +743,7 @@ class Model:
610
743
  added_tokens_json = json.load(f)
611
744
  for key in added_tokens_json:
612
745
  token_id = added_tokens_json[key]
613
- if (token_id >= vocab_size):
746
+ if token_id >= vocab_size:
614
747
  logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
615
748
  continue
616
749
 
@@ -618,6 +751,26 @@ class Model:
618
751
  scores[token_id] = -1000.0
619
752
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
620
753
 
754
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
755
+ if tokenizer_config_file.is_file():
756
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
757
+ tokenizer_config_json = json.load(f)
758
+ added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
759
+ for token_id, token_data in added_tokens_decoder.items():
760
+ token_id = int(token_id)
761
+ token: str = token_data["content"]
762
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
763
+ if tokens[token_id] != token.encode("utf-8"):
764
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
765
+ if token_data.get("special") or self.does_token_look_special(token):
766
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
767
+ else:
768
+ token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
769
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
770
+
771
+ scores[token_id] = -1000.0
772
+ tokens[token_id] = token.encode("utf-8")
773
+
621
774
  if vocab_size > len(tokens):
622
775
  pad_count = vocab_size - len(tokens)
623
776
  logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
@@ -626,17 +779,10 @@ class Model:
626
779
  scores.append(-1000.0)
627
780
  toktypes.append(SentencePieceTokenTypes.UNUSED)
628
781
 
629
- self.gguf_writer.add_tokenizer_model("llama")
630
- self.gguf_writer.add_tokenizer_pre("default")
631
- self.gguf_writer.add_token_list(tokens)
632
- self.gguf_writer.add_token_scores(scores)
633
- self.gguf_writer.add_token_types(toktypes)
634
-
635
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
636
- special_vocab.add_to_gguf(self.gguf_writer)
782
+ return tokens, scores, toktypes
637
783
 
638
784
  def _set_vocab_llama_hf(self):
639
- vocab = LlamaHfVocab(self.dir_model)
785
+ vocab = gguf.LlamaHfVocab(self.dir_model)
640
786
  tokens = []
641
787
  scores = []
642
788
  toktypes = []
@@ -657,6 +803,51 @@ class Model:
657
803
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
658
804
  special_vocab.add_to_gguf(self.gguf_writer)
659
805
 
806
+ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
807
+ tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
808
+ logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
809
+ vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
810
+
811
+ default_pre = "mpt" if model_name == "gpt-neox" else "default"
812
+
813
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
814
+ assert field # tokenizer model
815
+ self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
816
+
817
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
818
+ self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
819
+
820
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
821
+ assert field # token list
822
+ self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
823
+
824
+ if model_name == "llama-spm":
825
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
826
+ assert field # token scores
827
+ self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
828
+
829
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
830
+ assert field # token types
831
+ self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
832
+
833
+ if model_name != "llama-spm":
834
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
835
+ assert field # token merges
836
+ self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
837
+
838
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
839
+ self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
840
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
841
+ self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
842
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
843
+ self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
844
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
845
+ self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
846
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
847
+ self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
848
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
849
+ self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
850
+
660
851
 
661
852
  @Model.register("GPTNeoXForCausalLM")
662
853
  class GPTNeoXModel(Model):
@@ -665,7 +856,6 @@ class GPTNeoXModel(Model):
665
856
  def set_gguf_parameters(self):
666
857
  block_count = self.hparams["num_hidden_layers"]
667
858
 
668
- self.gguf_writer.add_name(self.dir_model.name)
669
859
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
670
860
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
671
861
  self.gguf_writer.add_block_count(block_count)
@@ -721,7 +911,6 @@ class BloomModel(Model):
721
911
  model_arch = gguf.MODEL_ARCH.BLOOM
722
912
 
723
913
  def set_gguf_parameters(self):
724
- self.gguf_writer.add_name("Bloom")
725
914
  n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
726
915
  n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
727
916
  self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
@@ -798,7 +987,6 @@ class MPTModel(Model):
798
987
 
799
988
  def set_gguf_parameters(self):
800
989
  block_count = self.hparams["n_layers"]
801
- self.gguf_writer.add_name(self.dir_model.name)
802
990
  self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
803
991
  self.gguf_writer.add_embedding_length(self.hparams["d_model"])
804
992
  self.gguf_writer.add_block_count(block_count)
@@ -837,7 +1025,6 @@ class OrionModel(Model):
837
1025
  block_count = self.hparams["num_hidden_layers"]
838
1026
  head_count = self.hparams["num_attention_heads"]
839
1027
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
840
- hf_repo = self.hparams.get("_name_or_path", "")
841
1028
 
842
1029
  ctx_length = 0
843
1030
  if "max_sequence_length" in self.hparams:
@@ -850,8 +1037,6 @@ class OrionModel(Model):
850
1037
  raise ValueError("gguf: can not find ctx length parameter.")
851
1038
 
852
1039
  self.gguf_writer.add_file_type(self.ftype)
853
- self.gguf_writer.add_name(self.dir_model.name)
854
- self.gguf_writer.add_source_hf_repo(hf_repo)
855
1040
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
856
1041
  self.gguf_writer.add_context_length(ctx_length)
857
1042
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -875,7 +1060,6 @@ class BaichuanModel(Model):
875
1060
  block_count = self.hparams["num_hidden_layers"]
876
1061
  head_count = self.hparams["num_attention_heads"]
877
1062
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
878
- hf_repo = self.hparams.get("_name_or_path", "")
879
1063
 
880
1064
  ctx_length = 0
881
1065
  if "max_sequence_length" in self.hparams:
@@ -887,8 +1071,6 @@ class BaichuanModel(Model):
887
1071
  else:
888
1072
  raise ValueError("gguf: can not find ctx length parameter.")
889
1073
 
890
- self.gguf_writer.add_name(self.dir_model.name)
891
- self.gguf_writer.add_source_hf_repo(hf_repo)
892
1074
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
893
1075
  self.gguf_writer.add_context_length(ctx_length)
894
1076
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -962,7 +1144,11 @@ class XverseModel(Model):
962
1144
  from transformers import AutoTokenizer
963
1145
  tokenizer = AutoTokenizer.from_pretrained(dir_model)
964
1146
  vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
965
- assert max(tokenizer.vocab.values()) < vocab_size
1147
+ # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
1148
+ # because vocab_size is the count of items, and indexes start at 0.
1149
+ max_vocab_index = max(tokenizer.get_vocab().values())
1150
+ if max_vocab_index >= vocab_size:
1151
+ raise ValueError("Vocabulary size exceeds expected maximum size.")
966
1152
 
967
1153
  reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
968
1154
  added_vocab = tokenizer.get_added_vocab()
@@ -998,7 +1184,6 @@ class XverseModel(Model):
998
1184
  block_count = self.hparams["num_hidden_layers"]
999
1185
  head_count = self.hparams["num_attention_heads"]
1000
1186
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
1001
- hf_repo = self.hparams.get("_name_or_path", "")
1002
1187
 
1003
1188
  ctx_length = 0
1004
1189
  if "max_sequence_length" in self.hparams:
@@ -1010,8 +1195,6 @@ class XverseModel(Model):
1010
1195
  else:
1011
1196
  raise ValueError("gguf: can not find ctx length parameter.")
1012
1197
 
1013
- self.gguf_writer.add_name(self.dir_model.name)
1014
- self.gguf_writer.add_source_hf_repo(hf_repo)
1015
1198
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
1016
1199
  self.gguf_writer.add_context_length(ctx_length)
1017
1200
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1070,7 +1253,6 @@ class FalconModel(Model):
1070
1253
  if n_head_kv is None:
1071
1254
  n_head_kv = self.hparams.get("n_head_kv", 1) # old name
1072
1255
 
1073
- self.gguf_writer.add_name("Falcon")
1074
1256
  self.gguf_writer.add_context_length(2048) # not in config.json
1075
1257
  self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
1076
1258
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1115,7 +1297,6 @@ class StarCoderModel(Model):
1115
1297
  def set_gguf_parameters(self):
1116
1298
  block_count = self.hparams["n_layer"]
1117
1299
 
1118
- self.gguf_writer.add_name("StarCoder")
1119
1300
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
1120
1301
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
1121
1302
  self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -1135,11 +1316,11 @@ class RefactModel(Model):
1135
1316
 
1136
1317
  # TODO: how to determine special FIM tokens automatically?
1137
1318
  special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
1138
- special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
1319
+ special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
1139
1320
  special_vocab._set_special_token("prefix", 1)
1140
1321
  special_vocab._set_special_token("suffix", 3)
1141
1322
  special_vocab._set_special_token("middle", 2)
1142
- special_vocab._set_special_token("fsep", 4) # is this correct?
1323
+ special_vocab.chat_template = None # do not add it twice
1143
1324
  special_vocab.add_to_gguf(self.gguf_writer)
1144
1325
 
1145
1326
  def set_gguf_parameters(self):
@@ -1151,7 +1332,6 @@ class RefactModel(Model):
1151
1332
 
1152
1333
  block_count = self.hparams["n_layer"]
1153
1334
 
1154
- self.gguf_writer.add_name("Refact")
1155
1335
  # refact uses Alibi. So this is from config.json which might be used by training.
1156
1336
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
1157
1337
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1199,14 +1379,13 @@ class StableLMModel(Model):
1199
1379
  if (self.dir_model / "tokenizer.json").is_file():
1200
1380
  self._set_vocab_gpt2()
1201
1381
  else:
1202
- # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
1382
+ # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
1203
1383
  self._set_vocab_qwen()
1204
1384
 
1205
1385
  def set_gguf_parameters(self):
1206
1386
  hparams = self.hparams
1207
1387
  block_count = hparams["num_hidden_layers"]
1208
1388
 
1209
- self.gguf_writer.add_name(self.dir_model.name)
1210
1389
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
1211
1390
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
1212
1391
  self.gguf_writer.add_block_count(block_count)
@@ -1268,8 +1447,8 @@ class StableLMModel(Model):
1268
1447
 
1269
1448
  return [(new_name, data_torch)]
1270
1449
 
1271
- def write_tensors(self):
1272
- super().write_tensors()
1450
+ def prepare_tensors(self):
1451
+ super().prepare_tensors()
1273
1452
 
1274
1453
  if self._q_norms is not None or self._k_norms is not None:
1275
1454
  # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
@@ -1281,85 +1460,6 @@ class StableLMModel(Model):
1281
1460
  if len(norms) > 0:
1282
1461
  raise ValueError(f"Unprocessed norms: {norms}")
1283
1462
 
1284
- def write_tensors(self):
1285
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1286
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1287
- n_head = self.hparams.get("num_attention_heads")
1288
- n_kv_head = self.hparams.get("num_key_value_heads")
1289
- q_norms = dict()
1290
- k_norms = dict()
1291
- for name, data_torch in self.get_tensors():
1292
- # we don't need these
1293
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
1294
- continue
1295
-
1296
- old_dtype = data_torch.dtype
1297
-
1298
- # convert any unsupported data types to float32
1299
- if data_torch.dtype not in (torch.float16, torch.float32):
1300
- data_torch = data_torch.to(torch.float32)
1301
-
1302
- data = data_torch.squeeze().numpy()
1303
- n_dims = len(data.shape)
1304
- if name.find("q_layernorm.norms") != -1:
1305
- q_norms[name] = data
1306
- if len(q_norms) >= (block_count * n_head):
1307
- self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
1308
- continue
1309
- if name.find("k_layernorm.norms") != -1:
1310
- k_norms[name] = data
1311
- if len(k_norms) >= (block_count * n_kv_head):
1312
- self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
1313
- continue
1314
-
1315
- # map tensor names
1316
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1317
- if new_name is None:
1318
- raise ValueError(f"Can not map tensor {name!r}")
1319
-
1320
- n_dims = len(data.shape)
1321
- data_dtype = data.dtype
1322
-
1323
- # if f32 desired, convert any float16 to float32
1324
- if self.ftype == 0 and data_dtype == np.float16:
1325
- data = data.astype(np.float32)
1326
-
1327
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
1328
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
1329
- data = data.astype(np.float32)
1330
-
1331
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1332
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
1333
- data = data.astype(np.float16)
1334
-
1335
- logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1336
-
1337
- self.gguf_writer.add_tensor(new_name, data)
1338
-
1339
- def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
1340
- for bid in range(block_count):
1341
- datas = []
1342
- for xid in range(n_head):
1343
- ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
1344
- datas.append(norms[ename])
1345
- del norms[ename]
1346
- data = np.stack(datas, axis=0)
1347
- data_dtype = data.dtype
1348
- merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
1349
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
1350
- if new_name is None:
1351
- raise ValueError(f"Can not map tensor {name!r}")
1352
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
1353
- data = data.astype(np.float32)
1354
-
1355
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1356
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
1357
- data = data.astype(np.float16)
1358
-
1359
- logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
1360
-
1361
- self.gguf_writer.add_tensor(new_name, data)
1362
-
1363
1463
 
1364
1464
  @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
1365
1465
  class LlamaModel(Model):
@@ -1367,7 +1467,7 @@ class LlamaModel(Model):
1367
1467
 
1368
1468
  def set_vocab(self):
1369
1469
  try:
1370
- self. _set_vocab_sentencepiece()
1470
+ self._set_vocab_sentencepiece()
1371
1471
  except FileNotFoundError:
1372
1472
  try:
1373
1473
  self._set_vocab_llama_hf()
@@ -1391,13 +1491,29 @@ class LlamaModel(Model):
1391
1491
  super().set_gguf_parameters()
1392
1492
  hparams = self.hparams
1393
1493
  self.gguf_writer.add_vocab_size(hparams["vocab_size"])
1394
- self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
1494
+
1495
+ if "head_dim" in hparams:
1496
+ rope_dim = hparams["head_dim"]
1497
+ else:
1498
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
1499
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
1395
1500
 
1396
1501
  if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
1397
1502
  if self.hparams["rope_scaling"].get("type") == "linear":
1398
1503
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1399
1504
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
1400
1505
 
1506
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
1507
+ if tokenizer_config_file.is_file():
1508
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
1509
+ tokenizer_config_json = json.load(f)
1510
+ if "add_prefix_space" in tokenizer_config_json:
1511
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
1512
+
1513
+ # Apply to granite small models only
1514
+ if self.hparams.get("vocab_size", 32000) == 49152:
1515
+ self.gguf_writer.add_add_bos_token(False)
1516
+
1401
1517
  @staticmethod
1402
1518
  def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
1403
1519
  if n_head_kv is not None and n_head != n_head_kv:
@@ -1412,9 +1528,9 @@ class LlamaModel(Model):
1412
1528
  n_head = self.hparams["num_attention_heads"]
1413
1529
  n_kv_head = self.hparams.get("num_key_value_heads")
1414
1530
 
1415
- if name.endswith("q_proj.weight"):
1531
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
1416
1532
  data_torch = LlamaModel.permute(data_torch, n_head, n_head)
1417
- if name.endswith("k_proj.weight"):
1533
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
1418
1534
  data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
1419
1535
 
1420
1536
  # process the experts separately
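
In the hunk above, q_proj and k_proj tensors (and, new in this version, their biases) are passed through LlamaModel.permute() so the rotary dimensions land in the layout llama.cpp expects. The reshape/swapaxes pattern is the same one spelled out for MiniCPM later in this diff; a toy NumPy version, with sizes chosen only for illustration and a simplified signature:

    import numpy as np

    # Illustrative re-implementation of the reshape/swapaxes permute
    # (single head, head_dim = 8, one input column).
    def permute(weights: np.ndarray, n_head: int) -> np.ndarray:
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                       .swapaxes(1, 2)
                       .reshape(weights.shape))

    w = np.arange(8).reshape(8, 1)
    print(permute(w, n_head=1).ravel())   # [0 4 1 5 2 6 3 7] -- the two halves interleave
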
@@ -1453,8 +1569,8 @@ class LlamaModel(Model):
1453
1569
 
1454
1570
  return [(self.map_tensor_name(name), data_torch)]
1455
1571
 
1456
- def write_tensors(self):
1457
- super().write_tensors()
1572
+ def prepare_tensors(self):
1573
+ super().prepare_tensors()
1458
1574
 
1459
1575
  if self._experts is not None:
1460
1576
  # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1463,6 +1579,48 @@ class LlamaModel(Model):
1463
1579
  raise ValueError(f"Unprocessed experts: {experts}")
1464
1580
 
1465
1581
 
1582
+ @Model.register("BitnetForCausalLM")
1583
+ class BitnetModel(Model):
1584
+ model_arch = gguf.MODEL_ARCH.BITNET
1585
+
1586
+ def set_vocab(self):
1587
+ self._set_vocab_sentencepiece()
1588
+
1589
+ def set_gguf_parameters(self):
1590
+ super().set_gguf_parameters()
1591
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1592
+ self.gguf_writer.add_rope_scaling_factor(1.0)
1593
+
1594
+ def weight_quant(self, weight):
1595
+ dtype = weight.dtype
1596
+ weight = weight.float()
1597
+ s = 1 / weight.abs().mean().clamp(min=1e-5)
1598
+ weight = (weight * s).round().clamp(-1, 1) / s
1599
+ scale = weight.abs().max().unsqueeze(0)
1600
+ weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
1601
+ weight = torch.sign(weight).type(dtype)
1602
+ return weight.type(dtype), scale.type(torch.float32)
1603
+
1604
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
1605
+ new_name = self.map_tensor_name(name)
1606
+
1607
+ if any(self.match_model_tensor_name(new_name, key, bid) for key in [
1608
+ gguf.MODEL_TENSOR.ATTN_Q,
1609
+ gguf.MODEL_TENSOR.ATTN_K,
1610
+ gguf.MODEL_TENSOR.ATTN_V,
1611
+ gguf.MODEL_TENSOR.ATTN_OUT,
1612
+ gguf.MODEL_TENSOR.FFN_UP,
1613
+ gguf.MODEL_TENSOR.FFN_DOWN,
1614
+ gguf.MODEL_TENSOR.FFN_GATE,
1615
+ ]):
1616
+ # transform weight into 1/0/-1 (in fp32)
1617
+ weight_torch, scale_torch = self.weight_quant(data_torch)
1618
+ yield (new_name, weight_torch)
1619
+ yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
1620
+ else:
1621
+ yield (new_name, data_torch)
1622
+
1623
+
1466
1624
  @Model.register("GrokForCausalLM")
1467
1625
  class GrokModel(Model):
1468
1626
  model_arch = gguf.MODEL_ARCH.GROK
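
The BitnetModel.weight_quant() helper added above is an absmean ternary quantization: weights are scaled by the inverse of their mean absolute value, rounded and clamped to {-1, 0, 1}, and exported as a sign tensor plus a single fp32 scale. A small NumPy restatement with made-up values (torch ops swapped for NumPy equivalents):

    import numpy as np

    # NumPy sketch of the same steps as BitnetModel.weight_quant above.
    def weight_quant(w: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        s = 1.0 / max(np.abs(w).mean(), 1e-5)
        q = np.clip(np.round(w * s), -1, 1) / s
        scale = np.array([np.abs(q).max()], dtype=np.float32)   # ~ mean(|w|)
        q = np.where(np.abs(q) < 1e-6, 0.0, q)
        return np.sign(q).astype(w.dtype), scale

    w = np.array([0.31, -0.02, -0.44, 0.12], dtype=np.float32)
    print(weight_quant(w))   # ternary [ 1.  0. -1.  1.], scale [0.2225]
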
@@ -1475,7 +1633,6 @@ class GrokModel(Model):
1475
1633
 
1476
1634
  def set_gguf_parameters(self):
1477
1635
  super().set_gguf_parameters()
1478
- self.gguf_writer.add_name("Grok")
1479
1636
 
1480
1637
  _experts: list[dict[str, Tensor]] | None = None
1481
1638
 
@@ -1524,7 +1681,6 @@ class DbrxModel(Model):
1524
1681
  def set_gguf_parameters(self):
1525
1682
  ffn_config = self.hparams["ffn_config"]
1526
1683
  attn_config = self.hparams["attn_config"]
1527
- self.gguf_writer.add_name(self.hparams["model_type"])
1528
1684
  self.gguf_writer.add_block_count(self.hparams["n_layers"])
1529
1685
 
1530
1686
  self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
@@ -1537,7 +1693,6 @@ class DbrxModel(Model):
1537
1693
  self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
1538
1694
 
1539
1695
  self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
1540
- self.gguf_writer.add_file_type(self.ftype)
1541
1696
 
1542
1697
  self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
1543
1698
  self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1594,7 +1749,6 @@ class MiniCPMModel(Model):
1594
1749
 
1595
1750
  def set_gguf_parameters(self):
1596
1751
  block_count = self.hparams["num_hidden_layers"]
1597
- self.gguf_writer.add_name("MiniCPM")
1598
1752
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1599
1753
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
1600
1754
  self.gguf_writer.add_block_count(block_count)
@@ -1612,9 +1766,11 @@ class MiniCPMModel(Model):
1612
1766
  if n_kv_head is not None and n_head != n_kv_head:
1613
1767
  n_head = n_kv_head
1614
1768
 
1615
- return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
1616
- .swapaxes(1, 2)
1617
- .reshape(weights.shape))
1769
+ return (
1770
+ weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
1771
+ .swapaxes(1, 2)
1772
+ .reshape(weights.shape)
1773
+ )
1618
1774
 
1619
1775
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
1620
1776
  del bid # unused
@@ -1662,7 +1818,6 @@ class QwenModel(Model):
1662
1818
  self._set_vocab_qwen()
1663
1819
 
1664
1820
  def set_gguf_parameters(self):
1665
- self.gguf_writer.add_name("Qwen")
1666
1821
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1667
1822
  self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
1668
1823
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1693,6 +1848,12 @@ class Qwen2MoeModel(Model):
1693
1848
  super().set_gguf_parameters()
1694
1849
  if (n_experts := self.hparams.get("num_experts")) is not None:
1695
1850
  self.gguf_writer.add_expert_count(n_experts)
1851
+ if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
1852
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
1853
+ logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
1854
+ if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
1855
+ self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
1856
+ logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
1696
1857
 
1697
1858
  _experts: list[dict[str, Tensor]] | None = None
1698
1859
 
@@ -1732,8 +1893,8 @@ class Qwen2MoeModel(Model):
1732
1893
 
1733
1894
  return [(self.map_tensor_name(name), data_torch)]
1734
1895
 
1735
- def write_tensors(self):
1736
- super().write_tensors()
1896
+ def prepare_tensors(self):
1897
+ super().prepare_tensors()
1737
1898
 
1738
1899
  if self._experts is not None:
1739
1900
  # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1747,7 +1908,6 @@ class GPT2Model(Model):
1747
1908
  model_arch = gguf.MODEL_ARCH.GPT2
1748
1909
 
1749
1910
  def set_gguf_parameters(self):
1750
- self.gguf_writer.add_name(self.dir_model.name)
1751
1911
  self.gguf_writer.add_block_count(self.hparams["n_layer"])
1752
1912
  self.gguf_writer.add_context_length(self.hparams["n_ctx"])
1753
1913
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1790,7 +1950,6 @@ class Phi2Model(Model):
1790
1950
  n_embd = self.find_hparam(["hidden_size", "n_embd"])
1791
1951
  n_head = self.find_hparam(["num_attention_heads", "n_head"])
1792
1952
 
1793
- self.gguf_writer.add_name("Phi2")
1794
1953
  self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
1795
1954
 
1796
1955
  self.gguf_writer.add_embedding_length(n_embd)
@@ -1823,7 +1982,7 @@ class Phi3MiniModel(Model):
1823
1982
 
1824
1983
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
1825
1984
  scores: list[float] = [-10000.0] * vocab_size
1826
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
1985
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
1827
1986
 
1828
1987
  for token_id in range(tokenizer.vocab_size()):
1829
1988
 
@@ -1852,7 +2011,7 @@ class Phi3MiniModel(Model):
1852
2011
 
1853
2012
  for key in added_tokens_json:
1854
2013
  token_id = added_tokens_json[key]
1855
- if (token_id >= vocab_size):
2014
+ if token_id >= vocab_size:
1856
2015
  logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
1857
2016
  continue
1858
2017
 
@@ -1868,8 +2027,9 @@ class Phi3MiniModel(Model):
1868
2027
  for token_id, foken_data in added_tokens_decoder.items():
1869
2028
  token_id = int(token_id)
1870
2029
  token = foken_data["content"].encode("utf-8")
1871
- if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
1872
- assert tokens[token_id] == token
2030
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2031
+ if tokens[token_id] != token:
2032
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
1873
2033
  tokens[token_id] = token
1874
2034
  scores[token_id] = -1000.0
1875
2035
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1884,8 +2044,9 @@ class Phi3MiniModel(Model):
1884
2044
  for foken_data in added_tokens:
1885
2045
  token_id = int(foken_data["id"])
1886
2046
  token = foken_data["content"].encode("utf-8")
1887
- if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
1888
- assert tokens[token_id] == token
2047
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2048
+ if tokens[token_id] != token:
2049
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
1889
2050
  tokens[token_id] = token
1890
2051
  scores[token_id] = -1000.0
1891
2052
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1912,7 +2073,6 @@ class Phi3MiniModel(Model):
1912
2073
  orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
1913
2074
  rope_dims = n_embd // n_head
1914
2075
 
1915
- self.gguf_writer.add_name("Phi3")
1916
2076
  self.gguf_writer.add_context_length(max_pos_embds)
1917
2077
  self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
1918
2078
  self.gguf_writer.add_embedding_length(n_embd)
@@ -1924,10 +2084,11 @@ class Phi3MiniModel(Model):
1924
2084
  self.gguf_writer.add_rope_dimension_count(rope_dims)
1925
2085
  self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
1926
2086
  self.gguf_writer.add_file_type(self.ftype)
2087
+ self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
1927
2088
 
1928
2089
  # write rope scaling for long context (128k) model
1929
2090
  rope_scaling = self.find_hparam(['rope_scaling'], True)
1930
- if (rope_scaling is None):
2091
+ if rope_scaling is None:
1931
2092
  return
1932
2093
 
1933
2094
  scale = max_pos_embds / orig_max_pos_embds
@@ -1936,7 +2097,7 @@ class Phi3MiniModel(Model):
1936
2097
  if len(rope_scaling_type) == 0:
1937
2098
  raise KeyError('Missing the required key rope_scaling.type')
1938
2099
 
1939
- if rope_scaling_type == 'su':
2100
+ if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
1940
2101
  attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
1941
2102
  elif rope_scaling_type == 'yarn':
1942
2103
  attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
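
For the 'su'/'longrope' and 'yarn' branches above, the attention factor depends only on the ratio between the extended and the original context length. A worked example with illustrative sizes, not taken from any particular config.json:

    import math

    max_pos_embds, orig_max_pos_embds = 131072, 4096      # hypothetical 128k long-context model
    scale = max_pos_embds / orig_max_pos_embds             # 32.0

    su_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds))  # ~= 1.19 ('su'/'longrope')
    yarn_factor = 0.1 * math.log(scale) + 1.0                                   # ~= 1.35 ('yarn')
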
@@ -1969,7 +2130,6 @@ class PlamoModel(Model):
1969
2130
  hparams = self.hparams
1970
2131
  block_count = hparams["num_hidden_layers"]
1971
2132
 
1972
- self.gguf_writer.add_name("PLaMo")
1973
2133
  self.gguf_writer.add_context_length(4096) # not in config.json
1974
2134
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
1975
2135
  self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
@@ -2014,7 +2174,6 @@ class CodeShellModel(Model):
2014
2174
  def set_gguf_parameters(self):
2015
2175
  block_count = self.hparams["n_layer"]
2016
2176
 
2017
- self.gguf_writer.add_name("CodeShell")
2018
2177
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
2019
2178
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
2020
2179
  self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -2066,7 +2225,7 @@ class InternLM2Model(Model):
2066
2225
  logger.error(f'Error: Missing {tokenizer_path}')
2067
2226
  sys.exit(1)
2068
2227
 
2069
- sentencepiece_model = model.ModelProto()
2228
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
2070
2229
  sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
2071
2230
  add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
2072
2231
 
@@ -2094,6 +2253,9 @@ class InternLM2Model(Model):
2094
2253
  toktype = SentencePieceTokenTypes.UNUSED
2095
2254
  elif tokenizer.IsByte(token_id):
2096
2255
  toktype = SentencePieceTokenTypes.BYTE
2256
+ # take care of ununsed raw token
2257
+ if piece.startswith('[UNUSED'):
2258
+ toktype = SentencePieceTokenTypes.UNUSED
2097
2259
 
2098
2260
  tokens.append(text)
2099
2261
  scores.append(score)
@@ -2109,6 +2271,49 @@ class InternLM2Model(Model):
2109
2271
  scores.append(-1000.0)
2110
2272
  toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
2111
2273
 
2274
+ chat_eos_token = '<|im_end|>'
2275
+ chat_eos_token_id = None
2276
+
2277
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2278
+ if tokenizer_config_file.is_file():
2279
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2280
+ tokenizer_config_json = json.load(f)
2281
+ added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
2282
+ for token_id, foken_data in added_tokens_decoder.items():
2283
+ token_id = int(token_id)
2284
+ token = foken_data["content"]
2285
+ if token == chat_eos_token:
2286
+ chat_eos_token_id = token_id
2287
+ token = token.encode("utf-8")
2288
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2289
+ if tokens[token_id] != token:
2290
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
2291
+ tokens[token_id] = token
2292
+ scores[token_id] = -1000.0
2293
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2294
+ if foken_data.get("special"):
2295
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2296
+
2297
+ tokenizer_file = self.dir_model / 'tokenizer.json'
2298
+ if tokenizer_file.is_file():
2299
+ with open(tokenizer_file, "r", encoding="utf-8") as f:
2300
+ tokenizer_json = json.load(f)
2301
+ added_tokens = tokenizer_json.get("added_tokens", [])
2302
+ for foken_data in added_tokens:
2303
+ token_id = int(foken_data["id"])
2304
+ token = foken_data["content"]
2305
+ if token == chat_eos_token:
2306
+ chat_eos_token_id = token_id
2307
+ token = token.encode("utf-8")
2308
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2309
+ if tokens[token_id] != token:
2310
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
2311
+ tokens[token_id] = token
2312
+ scores[token_id] = -1000.0
2313
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2314
+ if foken_data.get("special"):
2315
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2316
+
2112
2317
  self.gguf_writer.add_tokenizer_model("llama")
2113
2318
  self.gguf_writer.add_tokenizer_pre("default")
2114
2319
  self.gguf_writer.add_token_list(tokens)
@@ -2118,37 +2323,17 @@ class InternLM2Model(Model):
2118
2323
 
2119
2324
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2120
2325
  old_eos = special_vocab.special_token_ids["eos"]
2121
- if "chat" in os.path.basename(self.dir_model.absolute()):
2326
+ if chat_eos_token_id is not None:
2122
2327
  # For the chat model, we replace the eos with '<|im_end|>'.
2123
2328
  # TODO: this is a hack, should be fixed
2124
2329
  # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
2125
- special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
2126
- logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
2127
- in chat mode so that the conversation can end normally.")
2330
+ special_vocab.special_token_ids["eos"] = chat_eos_token_id
2331
+ logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
2332
+ " in chat mode so that the conversation can end normally.")
2128
2333
 
2129
2334
  special_vocab.add_to_gguf(self.gguf_writer)
2130
2335
 
2131
- def _try_get_sft_eos(self, tokenizer):
2132
- unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
2133
- im_end_list = tokenizer.Encode('<|im_end|>')
2134
- eos_token = None
2135
- assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
2136
- if len(unused_145_list) == 1:
2137
- eos_token = unused_145_list[0]
2138
- if len(im_end_list) == 1:
2139
- eos_token = im_end_list[0]
2140
- assert eos_token
2141
- return eos_token
2142
-
2143
- def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
2144
- if n_head_kv is not None and n_head != n_head_kv:
2145
- n_head = n_head_kv
2146
- return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
2147
- .swapaxes(1, 2)
2148
- .reshape(weights.shape))
2149
-
2150
2336
  def set_gguf_parameters(self):
2151
- self.gguf_writer.add_name("InternLM2")
2152
2337
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
2153
2338
  self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
2154
2339
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -2158,30 +2343,30 @@ in chat mode so that the conversation can end normally.")
2158
2343
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
2159
2344
  self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
2160
2345
  self.gguf_writer.add_file_type(self.ftype)
2346
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
2347
+ if self.hparams["rope_scaling"].get("type") == "linear":
2348
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2349
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
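# Illustrative note (not from the package source): the guard above expects config.json to carry
# a rope_scaling block shaped roughly like this (the factor value is made up):
example = {"rope_scaling": {"type": "linear", "factor": 2.0}}
assert example["rope_scaling"]["type"] == "linear" and "factor" in example["rope_scaling"]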
2161
2350
 
2162
2351
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2163
2352
  num_heads = self.hparams["num_attention_heads"]
2164
2353
  num_kv_heads = self.hparams["num_key_value_heads"]
2165
- hidden_size = self.hparams["hidden_size"]
2354
+ n_embd = self.hparams["hidden_size"]
2166
2355
  q_per_kv = num_heads // num_kv_heads
2167
- head_dim = hidden_size // num_heads
2356
+ head_dim = n_embd // num_heads
2168
2357
  num_groups = num_heads // q_per_kv
2169
2358
 
2170
- qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
2171
-
2172
- if re.match(qkv_pattern, name):
2173
- bid = re.findall(qkv_pattern, name)[0]
2359
+ if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
2174
2360
  qkv = data_torch
2175
- # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
2176
- qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
2177
- q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
2361
+
2362
+ qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
2363
+ q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
2364
+
2178
2365
  # The model weights of q and k require additional reshape.
2179
- # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
2180
- q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
2181
- # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
2182
- k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
2183
- # v = rearrange(v, " o g n i -> o (g n i)").T
2184
- v = v.reshape((v.shape[0], -1)).T
2366
+ q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
2367
+ k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
2368
+ v = v.reshape((-1, v.shape[-1]))
2369
+
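# Illustrative sketch (not from the package source): toy shapes for the new wqkv split above;
# the head counts and dimensions are made up, only the reshaping logic mirrors the code.
# LlamaModel.permute then reorders the q/k rows for llama.cpp's RoPE layout.
import torch

num_heads, num_kv_heads, head_dim = 8, 2, 4
n_embd = num_heads * head_dim           # 32
q_per_kv = num_heads // num_kv_heads    # 4
num_groups = num_heads // q_per_kv      # 2

wqkv = torch.randn(num_groups * (q_per_kv + 2) * head_dim, n_embd)
qkv = wqkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1]
assert q.reshape((-1, q.shape[-1])).shape == (num_heads * head_dim, n_embd)
assert k.reshape((-1, k.shape[-1])).shape == (num_kv_heads * head_dim, n_embd)
assert v.reshape((-1, v.shape[-1])).shape == (num_kv_heads * head_dim, n_embd)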
2185
2370
  return [
2186
2371
  (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
2187
2372
  (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
@@ -2308,13 +2493,15 @@ class GemmaModel(Model):
2308
2493
  special_vocab._set_special_token("middle", 68)
2309
2494
  special_vocab._set_special_token("fsep", 70)
2310
2495
  special_vocab._set_special_token("eot", 107)
2496
+ special_vocab.chat_template = None # do not add it twice
2311
2497
  special_vocab.add_to_gguf(self.gguf_writer)
2312
2498
 
2499
+ self.gguf_writer.add_add_space_prefix(False)
2500
+
2313
2501
  def set_gguf_parameters(self):
2314
2502
  hparams = self.hparams
2315
2503
  block_count = hparams["num_hidden_layers"]
2316
2504
 
2317
- self.gguf_writer.add_name(self.dir_model.name)
2318
2505
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
2319
2506
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
2320
2507
  self.gguf_writer.add_block_count(block_count)
@@ -2347,14 +2534,14 @@ class Gemma2Model(Model):
2347
2534
  model_arch = gguf.MODEL_ARCH.GEMMA2
2348
2535
 
2349
2536
  def set_vocab(self):
2350
- self._set_vocab_llama_hf()
2537
+ self._set_vocab_sentencepiece()
2538
+
2351
2539
  self.gguf_writer.add_add_space_prefix(False)
2352
2540
 
2353
2541
  def set_gguf_parameters(self):
2354
2542
  hparams = self.hparams
2355
2543
  block_count = hparams["num_hidden_layers"]
2356
2544
 
2357
- self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
2358
2545
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
2359
2546
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
2360
2547
  self.gguf_writer.add_block_count(block_count)
@@ -2374,7 +2561,7 @@ class Gemma2Model(Model):
2374
2561
  self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
2375
2562
 
2376
2563
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2377
- del bid # unusem
2564
+ del bid # unused
2378
2565
 
2379
2566
  # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
2380
2567
  # To prevent errors, skip loading lm_head.weight.
@@ -2413,39 +2600,7 @@ class MambaModel(Model):
2413
2600
  self._set_vocab_sentencepiece()
2414
2601
  else:
2415
2602
  # Use the GPT-NeoX tokenizer when no tokenizer files are present
2416
- tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
2417
- logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
2418
- neox_reader = gguf.GGUFReader(tokenizer_path, "r")
2419
-
2420
- field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
2421
- self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
2422
-
2423
- field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
2424
- self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
2425
-
2426
- field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
2427
- assert field
2428
- self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
2429
-
2430
- field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
2431
- assert field
2432
- self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
2433
-
2434
- field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
2435
- assert field
2436
- self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
2437
-
2438
- field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
2439
- self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
2440
-
2441
- field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
2442
- self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
2443
-
2444
- field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
2445
- self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
2446
-
2447
- field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
2448
- self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
2603
+ self._set_vocab_builtin("gpt-neox", vocab_size)
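# Illustrative sketch (not from the package source): the removed block above read the bundled
# GPT-NeoX vocab GGUF field by field; _set_vocab_builtin presumably centralizes that same
# pattern. The core of it, with an assumed path to the vocab file:
import gguf

reader = gguf.GGUFReader("models/ggml-vocab-gpt-neox.gguf", "r")
field = reader.get_field(gguf.Keys.Tokenizer.MODEL)
tokenizer_model = bytes(field.parts[-1]).decode("utf-8") if field else "gpt2"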
2449
2604
 
2450
2605
  def set_gguf_parameters(self):
2451
2606
  d_model = self.find_hparam(["hidden_size", "d_model"])
@@ -2461,7 +2616,6 @@ class MambaModel(Model):
2461
2616
  # Fail early for models which don't have a block expansion factor of 2
2462
2617
  assert d_inner == 2 * d_model
2463
2618
 
2464
- self.gguf_writer.add_name(self.dir_model.name)
2465
2619
  self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
2466
2620
  self.gguf_writer.add_embedding_length(d_model)
2467
2621
  self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2568,18 +2722,20 @@ class JinaBertV2Model(BertModel):
2568
2722
 
2569
2723
  def get_tensors(self):
2570
2724
  for name, data in super().get_tensors():
2571
- if 'gated_layers' in name:
2725
+ if 'gated_layer' in name:
2572
2726
  d1 = data[:self.intermediate_size, :]
2573
2727
  name1 = name.replace('gated_layers', 'gated_layers_w')
2728
+ name1 = name1.replace('up_gated_layer', 'gated_layers_v')
2574
2729
  d2 = data[self.intermediate_size:, :]
2575
2730
  name2 = name.replace('gated_layers', 'gated_layers_v')
2731
+ name2 = name2.replace('up_gated_layer', 'gated_layers_w')
2576
2732
  yield name1, d1
2577
2733
  yield name2, d2
2578
2734
  continue
2579
2735
 
2580
2736
  yield name, data
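# Illustrative sketch (not from the package source): the gated-layer handling above cuts a fused
# (2*intermediate, hidden) matrix into its two halves; the sizes here are toy values.
import numpy as np

intermediate_size, hidden = 8, 4
fused = np.arange(2 * intermediate_size * hidden).reshape(2 * intermediate_size, hidden)
d1 = fused[:intermediate_size, :]   # first half  -> gated_layers_w / gated_layers_v tensor
d2 = fused[intermediate_size:, :]   # second half -> the other gated tensor
assert d1.shape == d2.shape == (intermediate_size, hidden)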
2581
2737
 
2582
- def set_vocab(self, *args, **kwargs):
2738
+ def set_vocab(self):
2583
2739
  tokenizer_class = 'BertTokenizer'
2584
2740
  with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
2585
2741
  tokenizer_class = json.load(f)['tokenizer_class']
@@ -2595,6 +2751,81 @@ class JinaBertV2Model(BertModel):
2595
2751
  self.gguf_writer.add_add_eos_token(True)
2596
2752
 
2597
2753
 
2754
+ @Model.register("OpenELMForCausalLM")
2755
+ class OpenELMModel(Model):
2756
+ model_arch = gguf.MODEL_ARCH.OPENELM
2757
+
2758
+ @staticmethod
2759
+ def _make_divisible(v: float | int, divisor: int) -> int:
2760
+ # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
2761
+ new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
2762
+ # Make sure that round down does not go down by more than 10%.
2763
+ if new_v < 0.9 * v:
2764
+ new_v += divisor
2765
+ return new_v
2766
+
2767
+ def __init__(self, *args, **kwargs):
2768
+ super().__init__(*args, **kwargs)
2769
+
2770
+ ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
2771
+ ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
2772
+ self._n_embd: int = self.hparams["model_dim"]
2773
+ self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
2774
+ self._num_query_heads: list[int] = self.hparams["num_query_heads"]
2775
+ self._ffn_dims: list[int] = [
2776
+ OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
2777
+ for multiplier in ffn_multipliers
2778
+ ]
2779
+ assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
2780
+ assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
2781
+
2782
+ # Uses the tokenizer from meta-llama/Llama-2-7b-hf
2783
+ def set_vocab(self):
2784
+ try:
2785
+ self._set_vocab_sentencepiece()
2786
+ except FileNotFoundError:
2787
+ self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
2788
+
2789
+ def set_gguf_parameters(self):
2790
+ n_embd = self._n_embd
2791
+ head_dim = self.hparams["head_dim"]
2792
+ rot_pct = 1.0
2793
+ assert self.block_count == len(self._num_kv_heads)
2794
+ assert self.block_count == len(self._num_query_heads)
2795
+ assert self.block_count == len(self._ffn_dims)
2796
+
2797
+ self.gguf_writer.add_block_count(self.block_count)
2798
+ self.gguf_writer.add_context_length(self.hparams["max_context_length"])
2799
+ self.gguf_writer.add_embedding_length(n_embd)
2800
+ self.gguf_writer.add_feed_forward_length(self._ffn_dims)
2801
+ self.gguf_writer.add_head_count(self._num_query_heads)
2802
+ self.gguf_writer.add_head_count_kv(self._num_kv_heads)
2803
+ self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
2804
+ # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
2805
+ self.gguf_writer.add_layer_norm_rms_eps(1e-6)
2806
+ self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
2807
+ self.gguf_writer.add_key_length(head_dim)
2808
+ self.gguf_writer.add_value_length(head_dim)
2809
+ self.gguf_writer.add_file_type(self.ftype)
2810
+
2811
+ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
2812
+ if "n_layers" in keys:
2813
+ return self.hparams["num_transformer_layers"]
2814
+
2815
+ return super().find_hparam(keys, optional)
2816
+
2817
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2818
+
2819
+ # split ff
2820
+ if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
2821
+ ff_dim = self._ffn_dims[bid]
2822
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
2823
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
2824
+ return
2825
+
2826
+ yield (self.map_tensor_name(name), data_torch)
2827
+
2828
+
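# Worked example (not from the package source): how _make_divisible above turns the per-layer
# ffn_multipliers into concrete FFN widths; model_dim and the multipliers are illustrative,
# not taken from a real OpenELM config.
def make_divisible(v, divisor):
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:  # never round down by more than 10%
        new_v += divisor
    return new_v

model_dim, ffn_dim_divisor = 1280, 256
print([make_divisible(m * model_dim, ffn_dim_divisor) for m in (0.5, 1.0, 2.77)])
# -> [768, 1280, 3584]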
2598
2829
  @Model.register("ArcticForCausalLM")
2599
2830
  class ArcticModel(Model):
2600
2831
  model_arch = gguf.MODEL_ARCH.ARCTIC
@@ -2619,7 +2850,7 @@ class ArcticModel(Model):
2619
2850
 
2620
2851
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
2621
2852
  scores: list[float] = [-10000.0] * vocab_size
2622
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
2853
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
2623
2854
 
2624
2855
  for token_id in range(tokenizer.vocab_size()):
2625
2856
 
@@ -2652,7 +2883,7 @@ class ArcticModel(Model):
2652
2883
  added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
2653
2884
  for token_id, token_json in added_tokens_decoder.items():
2654
2885
  token_id = int(token_id)
2655
- if (token_id >= vocab_size):
2886
+ if token_id >= vocab_size:
2656
2887
  logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
2657
2888
  continue
2658
2889
 
@@ -2736,8 +2967,8 @@ class ArcticModel(Model):
2736
2967
 
2737
2968
  return [(self.map_tensor_name(name), data_torch)]
2738
2969
 
2739
- def write_tensors(self):
2740
- super().write_tensors()
2970
+ def prepare_tensors(self):
2971
+ super().prepare_tensors()
2741
2972
 
2742
2973
  if self._experts is not None:
2743
2974
  # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -2746,35 +2977,555 @@ class ArcticModel(Model):
2746
2977
  raise ValueError(f"Unprocessed experts: {experts}")
2747
2978
 
2748
2979
 
2749
- ###### CONVERSION LOGIC ######
2980
+ @Model.register("DeepseekV2ForCausalLM")
2981
+ class DeepseekV2Model(Model):
2982
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK2
2750
2983
 
2984
+ def set_vocab(self):
2985
+ self._set_vocab_gpt2()
2751
2986
 
2752
- # tree of lazy tensors
2753
- class LazyTorchTensor(gguf.LazyBase):
2754
- _tensor_type = torch.Tensor
2755
- # to keep the type-checker happy
2756
- dtype: torch.dtype
2757
- shape: torch.Size
2987
+ def set_gguf_parameters(self):
2988
+ super().set_gguf_parameters()
2989
+ hparams = self.hparams
2758
2990
 
2759
- # only used when converting a torch.Tensor to a np.ndarray
2760
- _dtype_map: dict[torch.dtype, type] = {
2761
- torch.float16: np.float16,
2762
- torch.float32: np.float32,
2763
- }
2991
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
2992
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
2993
+ if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
2994
+ self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
2995
+ self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
2996
+ self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
2997
+ self.gguf_writer.add_value_length(hparams["v_head_dim"])
2998
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
2999
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
3000
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
3001
+ self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
3002
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
3003
+
3004
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
3005
+ if self.hparams["rope_scaling"].get("type") == "yarn":
3006
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
3007
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
3008
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
3009
+ self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
3010
+
3011
+ _experts: list[dict[str, Tensor]] | None = None
3012
+
3013
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3014
+ # process the experts separately
3015
+ if name.find("mlp.experts") != -1:
3016
+ n_experts = self.hparams["n_routed_experts"]
3017
+ assert bid is not None
3018
+
3019
+ if self._experts is None:
3020
+ self._experts = [{} for _ in range(self.block_count)]
3021
+
3022
+ self._experts[bid][name] = data_torch
3023
+
3024
+ if len(self._experts[bid]) >= n_experts * 3:
3025
+ tensors: list[tuple[str, Tensor]] = []
3026
+
3027
+ # merge the experts into a single 3d tensor
3028
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
3029
+ datas: list[Tensor] = []
3030
+
3031
+ for xid in range(n_experts):
3032
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
3033
+ datas.append(self._experts[bid][ename])
3034
+ del self._experts[bid][ename]
3035
+
3036
+ data_torch = torch.stack(datas, dim=0)
3037
+
3038
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
3039
+
3040
+ new_name = self.map_tensor_name(merged_name)
3041
+
3042
+ tensors.append((new_name, data_torch))
3043
+ return tensors
3044
+ else:
3045
+ return []
3046
+
3047
+ return [(self.map_tensor_name(name), data_torch)]
3048
+
3049
+ def prepare_tensors(self):
3050
+ super().prepare_tensors()
3051
+
3052
+ if self._experts is not None:
3053
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
3054
+ experts = [k for d in self._experts for k in d.keys()]
3055
+ if len(experts) > 0:
3056
+ raise ValueError(f"Unprocessed experts: {experts}")
3057
+
3058
+
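# Illustrative sketch (not from the package source): the expert-merging step above stacks the
# per-expert weights of each projection into one 3D tensor; counts and shapes are toy values.
import torch

n_experts, n_ff, n_embd = 4, 8, 6
experts = {f"model.layers.0.mlp.experts.{i}.gate_proj.weight": torch.randn(n_ff, n_embd)
           for i in range(n_experts)}
stacked = torch.stack([experts[f"model.layers.0.mlp.experts.{i}.gate_proj.weight"]
                       for i in range(n_experts)], dim=0)
assert stacked.shape == (n_experts, n_ff, n_embd)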
3059
+ @Model.register("T5WithLMHeadModel")
3060
+ @Model.register("T5ForConditionalGeneration")
3061
+ @Model.register("MT5ForConditionalGeneration")
3062
+ @Model.register("UMT5ForConditionalGeneration")
3063
+ class T5Model(Model):
3064
+ model_arch = gguf.MODEL_ARCH.T5
3065
+
3066
+ def __init__(self, *args, **kwargs):
3067
+ super().__init__(*args, **kwargs)
3068
+ self.shared_token_embeddings_found = False
3069
+
3070
+ def set_vocab(self):
3071
+ # to avoid TypeError: Descriptors cannot be created directly
3072
+ # exception when importing sentencepiece_model_pb2
3073
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
3074
+ from sentencepiece import SentencePieceProcessor
3075
+ from sentencepiece import sentencepiece_model_pb2 as model
3076
+
3077
+ tokenizer_path = self.dir_model / 'tokenizer.model'
3078
+
3079
+ # many older models use spiece.model tokenizer model filename
3080
+ if not tokenizer_path.is_file():
3081
+ tokenizer_path = self.dir_model / 'spiece.model'
3082
+
3083
+ if not tokenizer_path.is_file():
3084
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
3085
+
3086
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
3087
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
3088
+
3089
+ # some models like Pile-T5 family use BPE tokenizer instead of Unigram
3090
+ if sentencepiece_model.trainer_spec.model_type == 2: # BPE
3091
+ # assure the tokenizer model file name is correct
3092
+ assert tokenizer_path.name == 'tokenizer.model'
3093
+ return self._set_vocab_sentencepiece()
3094
+ else:
3095
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
3096
+
3097
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
3098
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
3099
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
3100
+
3101
+ tokenizer = SentencePieceProcessor()
3102
+ tokenizer.LoadFromFile(str(tokenizer_path))
3103
+
3104
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
3105
+
3106
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
3107
+ scores: list[float] = [-10000.0] * vocab_size
3108
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
3109
+
3110
+ for token_id in range(tokenizer.vocab_size()):
3111
+ piece = tokenizer.IdToPiece(token_id)
3112
+ text = piece.encode("utf-8")
3113
+ score = tokenizer.GetScore(token_id)
3114
+
3115
+ toktype = SentencePieceTokenTypes.NORMAL
3116
+ if tokenizer.IsUnknown(token_id):
3117
+ toktype = SentencePieceTokenTypes.UNKNOWN
3118
+ elif tokenizer.IsControl(token_id):
3119
+ toktype = SentencePieceTokenTypes.CONTROL
3120
+ elif tokenizer.IsUnused(token_id):
3121
+ toktype = SentencePieceTokenTypes.UNUSED
3122
+ elif tokenizer.IsByte(token_id):
3123
+ toktype = SentencePieceTokenTypes.BYTE
3124
+
3125
+ tokens[token_id] = text
3126
+ scores[token_id] = score
3127
+ toktypes[token_id] = toktype
3128
+
3129
+ added_tokens_file = self.dir_model / 'added_tokens.json'
3130
+ if added_tokens_file.is_file():
3131
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
3132
+ added_tokens_json = json.load(f)
3133
+ for key in added_tokens_json:
3134
+ token_id = added_tokens_json[key]
3135
+ if token_id >= vocab_size:
3136
+ logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
3137
+ continue
3138
+
3139
+ tokens[token_id] = key.encode("utf-8")
3140
+ scores[token_id] = -1000.0
3141
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
3142
+
3143
+ if vocab_size > len(tokens):
3144
+ pad_count = vocab_size - len(tokens)
3145
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
3146
+ for i in range(1, pad_count + 1):
3147
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
3148
+ scores.append(-1000.0)
3149
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
3150
+
3151
+ self.gguf_writer.add_tokenizer_model("t5")
3152
+ self.gguf_writer.add_tokenizer_pre("default")
3153
+ self.gguf_writer.add_token_list(tokens)
3154
+ self.gguf_writer.add_token_scores(scores)
3155
+ self.gguf_writer.add_token_types(toktypes)
3156
+ self.gguf_writer.add_add_space_prefix(add_prefix)
3157
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
3158
+ if precompiled_charsmap:
3159
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
3160
+
3161
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3162
+ special_vocab.add_to_gguf(self.gguf_writer)
3163
+
3164
+ self.gguf_writer.add_add_bos_token(False)
3165
+ self.gguf_writer.add_add_eos_token(True)
3166
+
3167
+ def set_gguf_parameters(self):
3168
+ if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
3169
+ logger.warning("Couldn't find context length in config.json, assuming default value of 512")
3170
+ n_ctx = 512
3171
+ self.gguf_writer.add_context_length(n_ctx)
3172
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
3173
+ self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
3174
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
3175
+ self.gguf_writer.add_head_count(self.hparams["num_heads"])
3176
+ self.gguf_writer.add_key_length(self.hparams["d_kv"])
3177
+ self.gguf_writer.add_value_length(self.hparams["d_kv"])
3178
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
3179
+ self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
3180
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
3181
+ self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
3182
+ self.gguf_writer.add_file_type(self.ftype)
3183
+
3184
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3185
+ del bid # unused
3186
+
3187
+ # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
3188
+ # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
3189
+ # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
3190
+ # and decoder and ignore the remaining ones.
3191
+ if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
3192
+ if not self.shared_token_embeddings_found:
3193
+ name = "shared.weight"
3194
+ self.shared_token_embeddings_found = True
3195
+ else:
3196
+ logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
3197
+ return []
3198
+
3199
+ return [(self.map_tensor_name(name), data_torch)]
3200
+
3201
+
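# Illustrative sketch (not from the package source): the shared-embedding rule described in the
# comment above keeps only the first of the three candidate tensors, under the canonical name
# "shared.weight"; the tensor-name list here is made up for the demo.
names = ["encoder.embed_tokens.weight", "shared.weight", "decoder.embed_tokens.weight",
         "encoder.block.0.layer.0.SelfAttention.q.weight"]
shared_seen, kept = False, []
for name in names:
    if name in ("decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"):
        if shared_seen:
            continue
        shared_seen, name = True, "shared.weight"
    kept.append(name)
assert kept == ["shared.weight", "encoder.block.0.layer.0.SelfAttention.q.weight"]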
3202
+ @Model.register("JAISLMHeadModel")
3203
+ class JaisModel(Model):
3204
+ model_arch = gguf.MODEL_ARCH.JAIS
3205
+
3206
+ def __init__(self, *args, **kwargs):
3207
+ super().__init__(*args, **kwargs)
3208
+
3209
+ # SwiGLU activation
3210
+ assert self.hparams["activation_function"] == "swiglu"
3211
+ # ALiBi position embedding
3212
+ assert self.hparams["position_embedding_type"] == "alibi"
3213
+
3214
+ # Embeddings scale
3215
+ self.embeddings_scale = 1.0
3216
+ # note: For some JAIS flavors, output is tied to (same as) wte in original model
3217
+ self.output_is_wte = False
3218
+ if 'mup_embeddings_scale' in self.hparams:
3219
+ self.output_is_wte = True # Hack (?)
3220
+ self.embeddings_scale = self.hparams['mup_embeddings_scale']
3221
+ elif 'embeddings_scale' in self.hparams:
3222
+ self.embeddings_scale = self.hparams['embeddings_scale']
3223
+ else:
3224
+ assert False
3225
+
3226
+ self.width_scale = 1.0
3227
+ if 'mup_output_alpha' in self.hparams:
3228
+ assert 'mup_width_scale' in self.hparams
3229
+ self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
3230
+ elif 'width_scale' in self.hparams:
3231
+ self.width_scale = self.hparams['width_scale']
3232
+ else:
3233
+ assert False
3234
+
3235
+ self.max_alibi_bias = 8.0
3236
+
3237
+ def set_vocab(self):
3238
+ self._set_vocab_gpt2()
3239
+
3240
+ def set_gguf_parameters(self):
3241
+ self.gguf_writer.add_block_count(self.hparams["n_layer"])
3242
+ self.gguf_writer.add_context_length(self.hparams["n_positions"])
3243
+ self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
3244
+ self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
3245
+ self.gguf_writer.add_head_count(self.hparams["n_head"])
3246
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
3247
+ self.gguf_writer.add_file_type(self.ftype)
3248
+
3249
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3250
+ del bid # unused
3251
+
3252
+ tensors: list[tuple[str, Tensor]] = []
3253
+
3254
+ # we don't need these
3255
+ if name.endswith((".attn.bias")):
3256
+ return tensors
3257
+
3258
+ if name.endswith(("relative_pe.slopes")):
3259
+ # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
3260
+ # Some other models have max_alibi_bias spelled out explicitly in the hyperparams,
3261
+ # but Jais's PyTorch model simply precalculates the slope values and places them
3262
+ # in relative_pes.slopes
3263
+ n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
3264
+ first_val = float(data_torch[0].item())
3265
+ self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
3266
+
3267
+ return tensors
3268
+
3269
+ if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
3270
+ data_torch = data_torch.transpose(1, 0)
3271
+
3272
+ new_name = self.map_tensor_name(name)
3273
+
3274
+ if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
3275
+ tensors.append((new_name, data_torch * self.embeddings_scale))
3276
+ if self.output_is_wte:
3277
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
3278
+ elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
3279
+ assert not self.output_is_wte
3280
+ tensors.append((new_name, data_torch * self.width_scale))
3281
+ else:
3282
+ tensors.append((new_name, data_torch))
3283
+
3284
+ return tensors
3285
+
3286
+ def prepare_tensors(self):
3287
+ super().prepare_tensors()
3288
+ self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
3289
+
3290
+
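# Worked example (not from the package source): recovering max_alibi_bias from the first
# precomputed slope, as the Jais handling above does; n_head and the bias value are illustrative.
import math

n_head = 20
n_head_closest_log2 = 2 ** math.floor(math.log2(n_head))   # 16
first_slope = 2 ** (-8.0 / n_head_closest_log2)             # slope of the first head for max bias 8
assert -round(math.log2(first_slope) * n_head_closest_log2) == 8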
3291
+ @Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
3292
+ class ChatGLMModel(Model):
3293
+ model_arch = gguf.MODEL_ARCH.CHATGLM
3294
+
3295
+ def set_vocab_chatglm3(self):
3296
+ dir_model = self.dir_model
3297
+ hparams = self.hparams
3298
+ tokens: list[bytes] = []
3299
+ toktypes: list[int] = []
3300
+ scores: list[float] = []
3301
+
3302
+ from transformers import AutoTokenizer
3303
+ tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
3304
+ vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
3305
+ assert max(tokenizer.get_vocab().values()) < vocab_size
3306
+ role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
3307
+ special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
3308
+ for token_id in range(vocab_size):
3309
+ piece = tokenizer._convert_id_to_token(token_id)
3310
+ if token_id == 0:
3311
+ piece = "<unk>"
3312
+ elif token_id == 1:
3313
+ piece = "<bos>"
3314
+ elif token_id == 2:
3315
+ piece = "<eos>"
3316
+
3317
+ text = piece.encode("utf-8")
3318
+ score = 0.0
3319
+ # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
3320
+ # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
3321
+ if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
3322
+ score = tokenizer.tokenizer.sp_model.get_score(token_id)
3323
+
3324
+ if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
3325
+ if piece in special_tokens:
3326
+ toktype = SentencePieceTokenTypes.CONTROL
3327
+ elif len(piece) == 0:
3328
+ text = f"[PAD{token_id}]".encode("utf-8")
3329
+ toktype = SentencePieceTokenTypes.UNUSED
3330
+ else:
3331
+ toktype = SentencePieceTokenTypes.USER_DEFINED
3332
+ tokens.append(text)
3333
+ scores.append(score)
3334
+ toktypes.append(toktype)
3335
+ continue
3336
+
3337
+ toktype = SentencePieceTokenTypes.NORMAL
3338
+ if tokenizer.tokenizer.sp_model.is_unknown(token_id):
3339
+ toktype = SentencePieceTokenTypes.UNKNOWN
3340
+ elif tokenizer.tokenizer.sp_model.is_control(token_id):
3341
+ toktype = SentencePieceTokenTypes.CONTROL
3342
+ elif tokenizer.tokenizer.sp_model.is_unused(token_id):
3343
+ toktype = SentencePieceTokenTypes.UNUSED
3344
+ elif tokenizer.tokenizer.sp_model.is_byte(token_id):
3345
+ toktype = SentencePieceTokenTypes.BYTE
3346
+
3347
+ tokens.append(text)
3348
+ scores.append(score)
3349
+ toktypes.append(toktype)
3350
+
3351
+ self.gguf_writer.add_tokenizer_model("llama")
3352
+ # glm3 needs prefix and suffix formatted as:
3353
+ # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
3354
+ self.gguf_writer.add_tokenizer_pre("chatglm-spm")
3355
+ self.gguf_writer.add_token_list(tokens)
3356
+ self.gguf_writer.add_token_scores(scores)
3357
+ self.gguf_writer.add_token_types(toktypes)
3358
+
3359
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3360
+ special_vocab.add_to_gguf(self.gguf_writer)
3361
+
3362
+ @staticmethod
3363
+ def token_bytes_to_string(b):
3364
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
3365
+ byte_encoder = bytes_to_unicode()
3366
+ return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
3367
+
3368
+ @staticmethod
3369
+ def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
3370
+ parts = [bytes([b]) for b in token]
3371
+ while True:
3372
+ min_idx = None
3373
+ min_rank = None
3374
+ for i, pair in enumerate(zip(parts[:-1], parts[1:])):
3375
+ rank = mergeable_ranks.get(pair[0] + pair[1])
3376
+ if rank is not None and (min_rank is None or rank < min_rank):
3377
+ min_idx = i
3378
+ min_rank = rank
3379
+ if min_rank is None or (max_rank is not None and min_rank >= max_rank):
3380
+ break
3381
+ assert min_idx is not None
3382
+ parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
3383
+ return parts
3384
+
3385
+ def set_vocab(self):
3386
+ if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
3387
+ self.set_vocab_chatglm3()
3388
+ return
3389
+
3390
+ dir_model = self.dir_model
3391
+ hparams = self.hparams
3392
+ tokens: list[str] = []
3393
+ toktypes: list[int] = []
3394
+
3395
+ from transformers import AutoTokenizer
3396
+ tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
3397
+ vocab_size = hparams["padded_vocab_size"]
3398
+ assert max(tokenizer.get_vocab().values()) < vocab_size
3399
+
3400
+ tokpre = self.get_vocab_base_pre(tokenizer)
3401
+
3402
+ merges = []
3403
+ vocab = {}
3404
+ mergeable_ranks = tokenizer.mergeable_ranks
3405
+ for token, rank in mergeable_ranks.items():
3406
+ vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
3407
+ if len(token) == 1:
3408
+ continue
3409
+ merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
3410
+ assert len(merged) >= 2 and len(merged) <= 7
3411
+ merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
3412
+
3413
+ # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
3414
+ added_vocab = tokenizer.get_added_vocab()
3415
+ reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
3416
+
3417
+ for i in range(vocab_size):
3418
+ if i not in reverse_vocab:
3419
+ tokens.append(f"[PAD{i}]")
3420
+ toktypes.append(gguf.TokenType.UNUSED)
3421
+ elif reverse_vocab[i] in added_vocab:
3422
+ tokens.append(reverse_vocab[i])
3423
+ if tokenizer.added_tokens_decoder[i].special:
3424
+ toktypes.append(gguf.TokenType.CONTROL)
3425
+ else:
3426
+ toktypes.append(gguf.TokenType.USER_DEFINED)
3427
+ else:
3428
+ tokens.append(reverse_vocab[i])
3429
+ toktypes.append(gguf.TokenType.NORMAL)
3430
+
3431
+ self.gguf_writer.add_tokenizer_model("gpt2")
3432
+ self.gguf_writer.add_tokenizer_pre(tokpre)
3433
+ self.gguf_writer.add_token_list(tokens)
3434
+ self.gguf_writer.add_token_types(toktypes)
3435
+
3436
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
3437
+ special_vocab.merges = merges
3438
+ # only add special tokens when they were not already loaded from config.json
3439
+ special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
3440
+ special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
3441
+ # this one is usually not in config.json anyway
3442
+ special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
3443
+ special_vocab.add_to_gguf(self.gguf_writer)
3444
+
3445
+ def set_gguf_parameters(self):
3446
+ n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
3447
+ n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
3448
+ n_head_kv = self.hparams.get("multi_query_group_num", n_head)
3449
+ self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
3450
+ self.gguf_writer.add_embedding_length(n_embed)
3451
+ self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
3452
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
3453
+ self.gguf_writer.add_head_count(n_head)
3454
+ self.gguf_writer.add_head_count_kv(n_head_kv)
3455
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
3456
+ self.gguf_writer.add_file_type(self.ftype)
3457
+ self.gguf_writer.add_rope_dimension_count(64)
3458
+ self.gguf_writer.add_add_bos_token(False)
3459
+ rope_freq = 10000
3460
+ if "rope_ratio" in self.hparams:
3461
+ rope_freq = rope_freq * self.hparams["rope_ratio"]
3462
+ self.gguf_writer.add_rope_freq_base(rope_freq)
3463
+
3464
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3465
+ del bid # unused
3466
+
3467
+ if name.endswith(".rotary_pos_emb.inv_freq"):
3468
+ return []
3469
+
3470
+ name = name.removeprefix("transformer.")
3471
+ return [(self.map_tensor_name(name), data_torch)]
3472
+
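# Illustrative sketch (not from the package source): a worked use of ChatGLMModel.bpe() with a
# tiny made-up ranks table; calling it with max_rank equal to the token's own rank reconstructs
# the pieces whose merge creates that token, which is what set_vocab joins into the merges list.
tiny_ranks = {b"h": 0, b"e": 1, b"l": 2, b"o": 3, b"he": 4, b"ll": 5, b"hell": 6, b"hello": 7}
assert ChatGLMModel.bpe(tiny_ranks, b"hello", max_rank=7) == [b"hell", b"o"]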
3473
+ ###### CONVERSION LOGIC ######
3474
+
3475
+
3476
+ # tree of lazy tensors
3477
+ class LazyTorchTensor(gguf.LazyBase):
3478
+ _tensor_type = torch.Tensor
3479
+ # to keep the type-checker happy
3480
+ dtype: torch.dtype
3481
+ shape: torch.Size
3482
+
3483
+ # only used when converting a torch.Tensor to a np.ndarray
3484
+ _dtype_map: dict[torch.dtype, type] = {
3485
+ torch.float16: np.float16,
3486
+ torch.float32: np.float32,
3487
+ }
3488
+
3489
+ # used for safetensors slices
3490
+ # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
3491
+ # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
3492
+ _dtype_str_map: dict[str, torch.dtype] = {
3493
+ "F64": torch.float64,
3494
+ "F32": torch.float32,
3495
+ "BF16": torch.bfloat16,
3496
+ "F16": torch.float16,
3497
+ # "U64": torch.uint64,
3498
+ "I64": torch.int64,
3499
+ # "U32": torch.uint32,
3500
+ "I32": torch.int32,
3501
+ # "U16": torch.uint16,
3502
+ "I16": torch.int16,
3503
+ "U8": torch.uint8,
3504
+ "I8": torch.int8,
3505
+ "BOOL": torch.bool,
3506
+ "F8_E4M3": torch.float8_e4m3fn,
3507
+ "F8_E5M2": torch.float8_e5m2,
3508
+ }
2764
3509
 
2765
3510
  def numpy(self) -> gguf.LazyNumpyTensor:
2766
3511
  dtype = self._dtype_map[self.dtype]
2767
3512
  return gguf.LazyNumpyTensor(
2768
3513
  meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
2769
- lazy=self._lazy,
2770
3514
  args=(self,),
2771
- func=(lambda s: s[0].numpy())
3515
+ func=(lambda s: s.numpy())
2772
3516
  )
2773
3517
 
2774
3518
  @classmethod
2775
- def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
3519
+ def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
2776
3520
  return torch.empty(size=shape, dtype=dtype, device="meta")
2777
3521
 
3522
+ @classmethod
3523
+ def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
3524
+ dtype = cls._dtype_str_map[st_slice.get_dtype()]
3525
+ shape: tuple[int, ...] = tuple(st_slice.get_shape())
3526
+ lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
3527
+ return cast(torch.Tensor, lazy)
3528
+
2778
3529
  @classmethod
2779
3530
  def __torch_function__(cls, func, types, args=(), kwargs=None):
2780
3531
  del types # unused
@@ -2785,7 +3536,7 @@ class LazyTorchTensor(gguf.LazyBase):
2785
3536
  if func is torch.Tensor.numpy:
2786
3537
  return args[0].numpy()
2787
3538
 
2788
- return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
3539
+ return cls._wrap_fn(func)(*args, **kwargs)
2789
3540
 
2790
3541
 
2791
3542
  def parse_args() -> argparse.Namespace:
@@ -2795,10 +3546,6 @@ def parse_args() -> argparse.Namespace:
2795
3546
  "--vocab-only", action="store_true",
2796
3547
  help="extract only the vocab",
2797
3548
  )
2798
- parser.add_argument(
2799
- "--awq-path", type=Path, default=None,
2800
- help="Path to scale awq cache file",
2801
- )
2802
3549
  parser.add_argument(
2803
3550
  "--outfile", type=Path,
2804
3551
  help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -2831,30 +3578,58 @@ def parse_args() -> argparse.Namespace:
2831
3578
  "--verbose", action="store_true",
2832
3579
  help="increase output verbosity",
2833
3580
  )
3581
+ parser.add_argument(
3582
+ "--split-max-tensors", type=int, default=0,
3583
+ help="max tensors in each split",
3584
+ )
3585
+ parser.add_argument(
3586
+ "--split-max-size", type=str, default="0",
3587
+ help="max size per split N(M|G)",
3588
+ )
3589
+ parser.add_argument(
3590
+ "--dry-run", action="store_true",
3591
+ help="only print out a split plan and exit, without writing any new files",
3592
+ )
3593
+ parser.add_argument(
3594
+ "--no-tensor-first-split", action="store_true",
3595
+ help="do not add tensors to the first split (disabled by default)"
3596
+ )
3597
+ parser.add_argument(
3598
+ "--metadata", type=Path,
3599
+ help="Specify the path for an authorship metadata override file"
3600
+ )
2834
3601
 
2835
3602
  return parser.parse_args()
2836
3603
 
2837
3604
 
3605
+ def split_str_to_n_bytes(split_str: str) -> int:
3606
+ if split_str.endswith("K"):
3607
+ n = int(split_str[:-1]) * 1000
3608
+ elif split_str.endswith("M"):
3609
+ n = int(split_str[:-1]) * 1000 * 1000
3610
+ elif split_str.endswith("G"):
3611
+ n = int(split_str[:-1]) * 1000 * 1000 * 1000
3612
+ elif split_str.isnumeric():
3613
+ n = int(split_str)
3614
+ else:
3615
+ raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
3616
+
3617
+ if n < 0:
3618
+ raise ValueError(f"Invalid split size: {split_str}, must be positive")
3619
+
3620
+ return n
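# Illustrative check (not from the package source): the parser above uses decimal units, so
# K/M/G mean 1000-based multiples rather than 1024-based ones.
assert split_str_to_n_bytes("250") == 250
assert split_str_to_n_bytes("50K") == 50_000
assert split_str_to_n_bytes("4G") == 4_000_000_000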
3621
+
3622
+
2838
3623
  def main() -> None:
2839
3624
  args = parse_args()
2840
3625
 
2841
- logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
3626
+ if args.verbose:
3627
+ logging.basicConfig(level=logging.DEBUG)
3628
+ else:
3629
+ logging.basicConfig(level=logging.INFO)
2842
3630
 
2843
3631
  dir_model = args.model
2844
3632
 
2845
- if args.awq_path:
2846
- sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
2847
- from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
2848
- tmp_model_path = args.model / "weighted_model"
2849
- dir_model = tmp_model_path
2850
- if tmp_model_path.is_dir():
2851
- logger.info(f"{tmp_model_path} exists as a weighted model.")
2852
- else:
2853
- tmp_model_path.mkdir(parents=True, exist_ok=True)
2854
- logger.info("Saving new weighted model ...")
2855
- add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
2856
- logger.info(f"Saved weighted model at {tmp_model_path}.")
2857
-
2858
3633
  if not dir_model.is_dir():
2859
3634
  logger.error(f'Error: {args.model} is not a directory')
2860
3635
  sys.exit(1)
@@ -2867,37 +3642,48 @@ def main() -> None:
2867
3642
  "auto": gguf.LlamaFileType.GUESSED,
2868
3643
  }
2869
3644
 
3645
+ is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
3646
+ if args.use_temp_file and is_split:
3647
+ logger.error("Error: Cannot use temp file when splitting")
3648
+ sys.exit(1)
3649
+
2870
3650
  if args.outfile is not None:
2871
3651
  fname_out = args.outfile
2872
3652
  else:
2873
- # output in the same directory as the model by default
2874
- fname_out = dir_model / 'ggml-model-{ftype}.gguf'
3653
+ fname_out = dir_model
2875
3654
 
2876
3655
  logger.info(f"Loading model: {dir_model.name}")
2877
3656
 
2878
3657
  hparams = Model.load_hparams(dir_model)
2879
3658
 
2880
3659
  with torch.inference_mode():
2881
- model_class = Model.from_model_architecture(hparams["architectures"][0])
2882
- model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
3660
+ output_type = ftype_map[args.outtype]
3661
+ model_architecture = hparams["architectures"][0]
2883
3662
 
2884
- logger.info("Set model parameters")
2885
- model_instance.set_gguf_parameters()
2886
-
2887
- logger.info("Set model tokenizer")
2888
- model_instance.set_vocab()
3663
+ try:
3664
+ model_class = Model.from_model_architecture(model_architecture)
3665
+ except NotImplementedError:
3666
+ logger.error(f"Model {model_architecture} is not supported")
3667
+ sys.exit(1)
2889
3668
 
2890
- model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
3669
+ model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
3670
+ is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
3671
+ eager=args.no_lazy,
3672
+ metadata_override=args.metadata, model_name=args.model_name,
3673
+ split_max_tensors=args.split_max_tensors,
3674
+ split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
3675
+ small_first_shard=args.no_tensor_first_split)
2891
3676
 
2892
3677
  if args.vocab_only:
2893
- logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
3678
+ logger.info("Exporting model vocab...")
2894
3679
  model_instance.write_vocab()
3680
+ logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
2895
3681
  else:
2896
- logger.info(f"Exporting model to '{model_instance.fname_out}'")
3682
+ logger.info("Exporting model...")
2897
3683
  model_instance.write()
2898
-
2899
- logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
3684
+ out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
3685
+ logger.info(f"Model successfully exported to {out_path}")
2900
3686
 
2901
3687
 
2902
3688
  if __name__ == '__main__':
2903
- main()
3689
+ main()
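# Illustrative usage (not from the package source; the model path is made up): the new split
# options added above can be exercised with something like
#   python convert-hf-to-gguf.py /path/to/hf-model --outtype auto --split-max-size 5G --dry-run
# which only prints the planned shards, while --split-max-tensors caps the tensor count per shard.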