bigdl-core-cpp 2.1.0b2__py3-none-win_amd64.whl → 2.1.0b20240820.post1__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. bigdl/cpp/convert-hf-to-gguf.py +1174 -314
  2. bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
  3. bigdl/cpp/gguf-py/gguf/constants.py +463 -167
  4. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  5. bigdl/cpp/gguf-py/gguf/gguf_reader.py +29 -8
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +475 -156
  7. bigdl/cpp/gguf-py/gguf/lazy.py +24 -49
  8. bigdl/cpp/gguf-py/gguf/metadata.py +503 -0
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +209 -23
  10. bigdl/cpp/gguf-py/gguf/utility.py +69 -0
  11. bigdl/cpp/libs/baby-llama.exe +0 -0
  12. bigdl/cpp/libs/batched-bench.exe +0 -0
  13. bigdl/cpp/libs/batched.exe +0 -0
  14. bigdl/cpp/libs/beam-search.exe +0 -0
  15. bigdl/cpp/libs/benchmark.exe +0 -0
  16. bigdl/cpp/libs/common.lib +0 -0
  17. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  18. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  19. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  20. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  21. bigdl/cpp/libs/embedding.exe +0 -0
  22. bigdl/cpp/libs/export-lora.exe +0 -0
  23. bigdl/cpp/libs/finetune.exe +0 -0
  24. bigdl/cpp/libs/ggml_shared.dll +0 -0
  25. bigdl/cpp/libs/gguf.exe +0 -0
  26. bigdl/cpp/libs/gritlm.exe +0 -0
  27. bigdl/cpp/libs/imatrix.exe +0 -0
  28. bigdl/cpp/libs/infill.exe +0 -0
  29. bigdl/cpp/libs/llama-bench.exe +0 -0
  30. bigdl/cpp/libs/llama.dll +0 -0
  31. bigdl/cpp/libs/llava-cli.exe +0 -0
  32. bigdl/cpp/libs/llava_shared.dll +0 -0
  33. bigdl/cpp/libs/lookahead.exe +0 -0
  34. bigdl/cpp/libs/lookup.exe +0 -0
  35. bigdl/cpp/libs/ls-sycl-device.exe +0 -0
  36. bigdl/cpp/libs/main.exe +0 -0
  37. bigdl/cpp/libs/ollama.exe +0 -0
  38. bigdl/cpp/libs/parallel.exe +0 -0
  39. bigdl/cpp/libs/passkey.exe +0 -0
  40. bigdl/cpp/libs/perplexity.exe +0 -0
  41. bigdl/cpp/libs/q8dot.exe +0 -0
  42. bigdl/cpp/libs/quantize-stats.exe +0 -0
  43. bigdl/cpp/libs/quantize.exe +0 -0
  44. bigdl/cpp/libs/save-load-state.exe +0 -0
  45. bigdl/cpp/libs/server.exe +0 -0
  46. bigdl/cpp/libs/simple.exe +0 -0
  47. bigdl/cpp/libs/speculative.exe +0 -0
  48. bigdl/cpp/libs/tokenize.exe +0 -0
  49. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  50. bigdl/cpp/libs/vdot.exe +0 -0
  51. {bigdl_core_cpp-2.1.0b2.dist-info → bigdl_core_cpp-2.1.0b20240820.post1.dist-info}/METADATA +8 -8
  52. bigdl_core_cpp-2.1.0b20240820.post1.dist-info/RECORD +63 -0
  53. {bigdl_core_cpp-2.1.0b2.dist-info → bigdl_core_cpp-2.1.0b20240820.post1.dist-info}/WHEEL +1 -1
  54. bigdl_core_cpp-2.1.0b2.dist-info/RECORD +0 -61
  55. {bigdl_core_cpp-2.1.0b2.data → bigdl_core_cpp-2.1.0b20240820.post1.data}/scripts/init-llama-cpp.bat +0 -0
  56. {bigdl_core_cpp-2.1.0b2.data → bigdl_core_cpp-2.1.0b20240820.post1.data}/scripts/init-llama-cpp.ps1 +0 -0
  57. {bigdl_core_cpp-2.1.0b2.data → bigdl_core_cpp-2.1.0b20240820.post1.data}/scripts/init-ollama.bat +0 -0
  58. {bigdl_core_cpp-2.1.0b2.dist-info → bigdl_core_cpp-2.1.0b20240820.post1.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
  #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-

  from __future__ import annotations

@@ -12,7 +13,7 @@ import sys
  from enum import IntEnum
  from pathlib import Path
  from hashlib import sha256
- from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+ from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast

  import math
  import numpy as np
@@ -25,10 +26,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
  sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
  import gguf

- from convert import LlamaHfVocab
-
- logger = logging.getLogger("hf-to-gguf")
-
  logger = logging.getLogger("hf-to-gguf")


@@ -50,7 +47,8 @@ class Model:
  _model_classes: dict[str, type[Model]] = {}

  dir_model: Path
- ftype: int
+ ftype: gguf.LlamaFileType
+ fname_out: Path
  is_big_endian: bool
  endianess: gguf.GGUFEndian
  use_temp_file: bool
@@ -61,29 +59,41 @@ class Model:
  block_count: int
  tensor_map: gguf.TensorNameMap
  tensor_names: set[str] | None
- fname_out: Path
  gguf_writer: gguf.GGUFWriter
+ model_name: str | None
+ metadata_override: Path | None
+ dir_model_card: Path

  # subclasses should define this!
  model_arch: gguf.MODEL_ARCH

- def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
+ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
+ use_temp_file: bool = False, eager: bool = False,
+ metadata_override: Path | None = None, model_name: str | None = None,
+ split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
  if type(self) is Model:
  raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
+
  self.dir_model = dir_model
  self.ftype = ftype
+ self.fname_out = fname_out
  self.is_big_endian = is_big_endian
  self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
  self.use_temp_file = use_temp_file
  self.lazy = not eager
- self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+ self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
  self.is_safetensors = len(self.part_names) > 0
  if not self.is_safetensors:
- self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+ self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
  self.hparams = Model.load_hparams(self.dir_model)
- self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+ self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
  self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
  self.tensor_names = None
+ self.metadata_override = metadata_override
+ self.model_name = model_name
+ self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
+
+ # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
  if self.ftype == gguf.LlamaFileType.GUESSED:
  # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
  _, first_tensor = next(self.get_tensors())
@@ -93,11 +103,10 @@ class Model:
  else:
  logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
  self.ftype = gguf.LlamaFileType.MOSTLY_BF16
- ftype_up: str = self.ftype.name.partition("_")[2].upper()
- ftype_lw: str = ftype_up.lower()
- # allow templating the file name with the output ftype, useful with the "auto" ftype
- self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
- self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+
+ # Configure GGUF Writer
+ self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+ split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

  @classmethod
  def __init_subclass__(cls):
@@ -147,9 +156,16 @@ class Model:
  tensor_names_from_parts.update(model_part.keys())

  for name in model_part.keys():
- data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
- if self.lazy:
- data = LazyTorchTensor.from_eager(data)
+ if self.is_safetensors:
+ if self.lazy:
+ data = model_part.get_slice(name)
+ data = LazyTorchTensor.from_safetensors_slice(data)
+ else:
+ data = model_part.get_tensor(name)
+ else:
+ data = model_part[name]
+ if self.lazy:
+ data = LazyTorchTensor.from_eager(data)
  yield name, data

  # only verify tensor name presence; it doesn't matter if they are not in the right files
@@ -185,7 +201,6 @@ class Model:
  return new_name

  def set_gguf_parameters(self):
- self.gguf_writer.add_name(self.dir_model.name)
  self.gguf_writer.add_block_count(self.block_count)

  if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -224,6 +239,10 @@ class Model:
  self.gguf_writer.add_expert_used_count(n_experts_used)
  logger.info(f"gguf: experts used count = {n_experts_used}")

+ if (head_dim := self.hparams.get("head_dim")) is not None:
+ self.gguf_writer.add_key_length(head_dim)
+ self.gguf_writer.add_value_length(head_dim)
+
  self.gguf_writer.add_file_type(self.ftype)
  logger.info(f"gguf: file type = {self.ftype}")

@@ -242,7 +261,7 @@ class Model:

  return False

- def write_tensors(self):
+ def prepare_tensors(self):
  max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

  for name, data_torch in self.get_tensors():
@@ -264,7 +283,7 @@ class Model:
  break

  for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
- data: np.ndarray = data # type hint
+ data: np.ndarray # type hint
  n_dims = len(data.shape)
  data_dtype = data.dtype
  data_qtype: gguf.GGMLQuantizationType | None = None
@@ -325,23 +344,80 @@ class Model:
  self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)


+ def set_type(self):
+ self.gguf_writer.add_type(gguf.GGUFType.MODEL)
+
+ def prepare_metadata(self, vocab_only: bool):
+
+ total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
+
+ self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
+
+ # Fallback to model directory name if metadata name is still missing
+ if self.metadata.name is None:
+ self.metadata.name = self.dir_model.name
+
+ # Generate parameter weight class (useful for leader boards) if not yet determined
+ if self.metadata.size_label is None and total_params > 0:
+ self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
+
+ # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
+ output_type: str = self.ftype.name.partition("_")[2]
+
+ # Filename Output
+ if self.fname_out.is_dir():
+ # Generate default filename based on model specification and available metadata
+ if not vocab_only:
+ fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
+ else:
+ fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
+
+ # Use the default filename
+ self.fname_out = self.fname_out / f"{fname_default}.gguf"
+ else:
+ # Output path is a custom defined templated filename
+ # Note: `not is_dir()` is used because `.is_file()` will not detect
+ # file template strings as it doesn't actually exist as a file
+
+ # Process templated file name with the output ftype, useful with the "auto" ftype
+ self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+
+ self.set_type()
+
+ logger.info("Set meta model")
+ self.metadata.set_gguf_meta_model(self.gguf_writer)
+
+ logger.info("Set model parameters")
+ self.set_gguf_parameters()
+
+ logger.info("Set model tokenizer")
+ self.set_vocab()
+
+ logger.info("Set model quantization version")
+ self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
  def write(self):
- self.write_tensors()
- self.gguf_writer.write_header_to_file()
+ self.prepare_tensors()
+ self.prepare_metadata(vocab_only=False)
+ self.gguf_writer.write_header_to_file(path=self.fname_out)
  self.gguf_writer.write_kv_data_to_file()
  self.gguf_writer.write_tensors_to_file(progress=True)
  self.gguf_writer.close()

  def write_vocab(self):
- self.gguf_writer.write_header_to_file()
+ if len(self.gguf_writer.tensors) != 1:
+ raise ValueError('Splitting the vocabulary is not supported')
+
+ self.prepare_metadata(vocab_only=True)
+ self.gguf_writer.write_header_to_file(path=self.fname_out)
  self.gguf_writer.write_kv_data_to_file()
  self.gguf_writer.close()

  @staticmethod
- def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
  part_names: list[str] = []
  for filename in os.listdir(dir_model):
- if filename.endswith(suffix):
+ if filename.startswith(prefix) and filename.endswith(suffix):
  part_names.append(filename)

  part_names.sort()
@@ -370,6 +446,29 @@ class Model:
  except KeyError:
  raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+ def does_token_look_special(self, token: str | bytes) -> bool:
+ if isinstance(token, (bytes, bytearray)):
+ token_text = token.decode(encoding="utf-8")
+ elif isinstance(token, memoryview):
+ token_text = token.tobytes().decode(encoding="utf-8")
+ else:
+ token_text = token
+
+ # Some models mark some added tokens which ought to be control tokens as not special.
+ # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+ seems_special = token_text in (
+ "<pad>", # deepseek-coder
+ "<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
+ )
+
+ seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+ seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>")) # deepseek-coder
+
+ # TODO: should these be marked as UNUSED instead? (maybe not)
+ seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2}
+
+ return seems_special
+
  # used for GPT-2 BPE and WordPiece vocabs
  def get_vocab_base(self) -> tuple[list[str], list[int], str]:
  tokens: list[str] = []
@@ -388,20 +487,22 @@ class Model:
  for i in range(vocab_size):
  if i not in reverse_vocab:
  tokens.append(f"[PAD{i}]")
- toktypes.append(gguf.TokenType.USER_DEFINED)
- elif reverse_vocab[i] in added_vocab:
- tokens.append(reverse_vocab[i])
- if tokenizer.added_tokens_decoder[i].special:
- toktypes.append(gguf.TokenType.CONTROL)
- else:
- toktypes.append(gguf.TokenType.USER_DEFINED)
+ toktypes.append(gguf.TokenType.UNUSED)
  else:
- tokens.append(reverse_vocab[i])
- toktypes.append(gguf.TokenType.NORMAL)
+ token: str = reverse_vocab[i]
+ if token in added_vocab:
+ if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+ toktypes.append(gguf.TokenType.CONTROL)
+ else:
+ token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
+ toktypes.append(gguf.TokenType.USER_DEFINED)
+ else:
+ toktypes.append(gguf.TokenType.NORMAL)
+ tokens.append(token)

  return tokens, toktypes, tokpre

- # NOTE: this function is generated by convert-hf-to-gguf-update.py
+ # NOTE: this function is generated by convert_hf_to_gguf_update.py
  # do not modify it manually!
  # ref: https://github.com/ggerganov/llama.cpp/pull/6920
  # Marker: Start get_vocab_base_pre
@@ -421,7 +522,7 @@ class Model:

  res = None

- # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+ # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
  # or pull the latest version of the model from Huggingface
  # don't edit the hashes manually!
  if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
@@ -478,15 +579,39 @@ class Model:
  if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
  # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
  res = "smaug-bpe"
+ if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
+ # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
+ res = "poro-chat"
+ if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+ # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+ res = "jina-v2-code"
+ if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+ # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+ res = "chatglm-bpe"
+ if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+ # ref: https://huggingface.co/LumiOpen/Viking-7B
+ res = "viking"
+ if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
+ # ref: https://huggingface.co/core42/jais-13b
+ res = "jais"
+ if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+ # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+ res = "codeshell"
+ if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+ # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+ res = "tekken"
+ if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+ # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+ res = "smollm"

  if res is None:
  logger.warning("\n")
  logger.warning("**************************************************************************************")
  logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
  logger.warning("** There are 2 possible reasons for this:")
- logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
+ logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
  logger.warning("** - the pre-tokenization config has changed upstream")
- logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+ logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
  logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
  logger.warning("**")
  logger.warning(f"** chkhsh: {chkhsh}")
@@ -541,7 +666,7 @@ class Model:
  for i in range(vocab_size):
  if i not in reverse_vocab:
  tokens.append(f"[PAD{i}]")
- toktypes.append(gguf.TokenType.USER_DEFINED)
+ toktypes.append(gguf.TokenType.UNUSED)
  elif reverse_vocab[i] in added_vocab:
  tokens.append(reverse_vocab[i])
  toktypes.append(gguf.TokenType.CONTROL)
@@ -564,15 +689,23 @@ class Model:
  special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
  special_vocab.add_to_gguf(self.gguf_writer)

- def _set_vocab_sentencepiece(self):
+ def _set_vocab_sentencepiece(self, add_to_gguf=True):
+ tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+ self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ def _create_vocab_sentencepiece(self):
  from sentencepiece import SentencePieceProcessor

  tokenizer_path = self.dir_model / 'tokenizer.model'

- tokens: list[bytes] = []
- scores: list[float] = []
- toktypes: list[int] = []
-
  if not tokenizer_path.is_file():
  raise FileNotFoundError(f"File not found: {tokenizer_path}")

@@ -583,7 +716,7 @@ class Model:

  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
  scores: list[float] = [-10000.0] * vocab_size
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

  for token_id in range(tokenizer.vocab_size()):
  piece = tokenizer.IdToPiece(token_id)
@@ -610,7 +743,7 @@ class Model:
  added_tokens_json = json.load(f)
  for key in added_tokens_json:
  token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
  logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
  continue

@@ -618,6 +751,26 @@ class Model:
  scores[token_id] = -1000.0
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+ for token_id, token_data in added_tokens_decoder.items():
+ token_id = int(token_id)
+ token: str = token_data["content"]
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+ if tokens[token_id] != token.encode("utf-8"):
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
+ if token_data.get("special") or self.does_token_look_special(token):
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+ else:
+ token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+ scores[token_id] = -1000.0
+ tokens[token_id] = token.encode("utf-8")
+
  if vocab_size > len(tokens):
  pad_count = vocab_size - len(tokens)
  logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
@@ -626,17 +779,10 @@ class Model:
  scores.append(-1000.0)
  toktypes.append(SentencePieceTokenTypes.UNUSED)

- self.gguf_writer.add_tokenizer_model("llama")
- self.gguf_writer.add_tokenizer_pre("default")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_scores(scores)
- self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
- special_vocab.add_to_gguf(self.gguf_writer)
+ return tokens, scores, toktypes

  def _set_vocab_llama_hf(self):
- vocab = LlamaHfVocab(self.dir_model)
+ vocab = gguf.LlamaHfVocab(self.dir_model)
  tokens = []
  scores = []
  toktypes = []
@@ -657,6 +803,51 @@ class Model:
657
803
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
658
804
  special_vocab.add_to_gguf(self.gguf_writer)
659
805
 
806
+ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
807
+ tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
808
+ logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
809
+ vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
810
+
811
+ default_pre = "mpt" if model_name == "gpt-neox" else "default"
812
+
813
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
814
+ assert field # tokenizer model
815
+ self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
816
+
817
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
818
+ self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
819
+
820
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
821
+ assert field # token list
822
+ self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
823
+
824
+ if model_name == "llama-spm":
825
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
826
+ assert field # token scores
827
+ self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
828
+
829
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
830
+ assert field # token types
831
+ self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
832
+
833
+ if model_name != "llama-spm":
834
+ field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
835
+ assert field # token merges
836
+ self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
837
+
838
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
839
+ self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
840
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
841
+ self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
842
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
843
+ self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
844
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
845
+ self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
846
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
847
+ self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
848
+ if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
849
+ self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
850
+
660
851
 
661
852
  @Model.register("GPTNeoXForCausalLM")
662
853
  class GPTNeoXModel(Model):
@@ -665,7 +856,6 @@ class GPTNeoXModel(Model):
665
856
  def set_gguf_parameters(self):
666
857
  block_count = self.hparams["num_hidden_layers"]
667
858
 
668
- self.gguf_writer.add_name(self.dir_model.name)
669
859
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
670
860
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
671
861
  self.gguf_writer.add_block_count(block_count)
@@ -721,7 +911,6 @@ class BloomModel(Model):
721
911
  model_arch = gguf.MODEL_ARCH.BLOOM
722
912
 
723
913
  def set_gguf_parameters(self):
724
- self.gguf_writer.add_name("Bloom")
725
914
  n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
726
915
  n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
727
916
  self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
@@ -798,7 +987,6 @@ class MPTModel(Model):
798
987
 
799
988
  def set_gguf_parameters(self):
800
989
  block_count = self.hparams["n_layers"]
801
- self.gguf_writer.add_name(self.dir_model.name)
802
990
  self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
803
991
  self.gguf_writer.add_embedding_length(self.hparams["d_model"])
804
992
  self.gguf_writer.add_block_count(block_count)
@@ -837,7 +1025,6 @@ class OrionModel(Model):
837
1025
  block_count = self.hparams["num_hidden_layers"]
838
1026
  head_count = self.hparams["num_attention_heads"]
839
1027
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
840
- hf_repo = self.hparams.get("_name_or_path", "")
841
1028
 
842
1029
  ctx_length = 0
843
1030
  if "max_sequence_length" in self.hparams:
@@ -850,8 +1037,6 @@ class OrionModel(Model):
850
1037
  raise ValueError("gguf: can not find ctx length parameter.")
851
1038
 
852
1039
  self.gguf_writer.add_file_type(self.ftype)
853
- self.gguf_writer.add_name(self.dir_model.name)
854
- self.gguf_writer.add_source_hf_repo(hf_repo)
855
1040
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
856
1041
  self.gguf_writer.add_context_length(ctx_length)
857
1042
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -875,7 +1060,6 @@ class BaichuanModel(Model):
875
1060
  block_count = self.hparams["num_hidden_layers"]
876
1061
  head_count = self.hparams["num_attention_heads"]
877
1062
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
878
- hf_repo = self.hparams.get("_name_or_path", "")
879
1063
 
880
1064
  ctx_length = 0
881
1065
  if "max_sequence_length" in self.hparams:
@@ -887,8 +1071,6 @@ class BaichuanModel(Model):
887
1071
  else:
888
1072
  raise ValueError("gguf: can not find ctx length parameter.")
889
1073
 
890
- self.gguf_writer.add_name(self.dir_model.name)
891
- self.gguf_writer.add_source_hf_repo(hf_repo)
892
1074
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
893
1075
  self.gguf_writer.add_context_length(ctx_length)
894
1076
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -962,7 +1144,11 @@ class XverseModel(Model):
962
1144
  from transformers import AutoTokenizer
963
1145
  tokenizer = AutoTokenizer.from_pretrained(dir_model)
964
1146
  vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
965
- assert max(tokenizer.vocab.values()) < vocab_size
1147
+ # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
1148
+ # because vocab_size is the count of items, and indexes start at 0.
1149
+ max_vocab_index = max(tokenizer.get_vocab().values())
1150
+ if max_vocab_index >= vocab_size:
1151
+ raise ValueError("Vocabulary size exceeds expected maximum size.")
966
1152
 
967
1153
  reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
968
1154
  added_vocab = tokenizer.get_added_vocab()
@@ -998,7 +1184,6 @@ class XverseModel(Model):
998
1184
  block_count = self.hparams["num_hidden_layers"]
999
1185
  head_count = self.hparams["num_attention_heads"]
1000
1186
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
1001
- hf_repo = self.hparams.get("_name_or_path", "")
1002
1187
 
1003
1188
  ctx_length = 0
1004
1189
  if "max_sequence_length" in self.hparams:
@@ -1010,8 +1195,6 @@ class XverseModel(Model):
1010
1195
  else:
1011
1196
  raise ValueError("gguf: can not find ctx length parameter.")
1012
1197
 
1013
- self.gguf_writer.add_name(self.dir_model.name)
1014
- self.gguf_writer.add_source_hf_repo(hf_repo)
1015
1198
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
1016
1199
  self.gguf_writer.add_context_length(ctx_length)
1017
1200
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1070,7 +1253,6 @@ class FalconModel(Model):
1070
1253
  if n_head_kv is None:
1071
1254
  n_head_kv = self.hparams.get("n_head_kv", 1) # old name
1072
1255
 
1073
- self.gguf_writer.add_name("Falcon")
1074
1256
  self.gguf_writer.add_context_length(2048) # not in config.json
1075
1257
  self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
1076
1258
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1115,7 +1297,6 @@ class StarCoderModel(Model):
1115
1297
  def set_gguf_parameters(self):
1116
1298
  block_count = self.hparams["n_layer"]
1117
1299
 
1118
- self.gguf_writer.add_name("StarCoder")
1119
1300
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
1120
1301
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
1121
1302
  self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -1135,11 +1316,11 @@ class RefactModel(Model):
1135
1316
 
1136
1317
  # TODO: how to determine special FIM tokens automatically?
1137
1318
  special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
1138
- special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
1319
+ special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
1139
1320
  special_vocab._set_special_token("prefix", 1)
1140
1321
  special_vocab._set_special_token("suffix", 3)
1141
1322
  special_vocab._set_special_token("middle", 2)
1142
- special_vocab._set_special_token("fsep", 4) # is this correct?
1323
+ special_vocab.chat_template = None # do not add it twice
1143
1324
  special_vocab.add_to_gguf(self.gguf_writer)
1144
1325
 
1145
1326
  def set_gguf_parameters(self):
@@ -1151,7 +1332,6 @@ class RefactModel(Model):
1151
1332
 
1152
1333
  block_count = self.hparams["n_layer"]
1153
1334
 
1154
- self.gguf_writer.add_name("Refact")
1155
1335
  # refact uses Alibi. So this is from config.json which might be used by training.
1156
1336
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
1157
1337
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1199,14 +1379,13 @@ class StableLMModel(Model):
1199
1379
  if (self.dir_model / "tokenizer.json").is_file():
1200
1380
  self._set_vocab_gpt2()
1201
1381
  else:
1202
- # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
1382
+ # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
1203
1383
  self._set_vocab_qwen()
1204
1384
 
1205
1385
  def set_gguf_parameters(self):
1206
1386
  hparams = self.hparams
1207
1387
  block_count = hparams["num_hidden_layers"]
1208
1388
 
1209
- self.gguf_writer.add_name(self.dir_model.name)
1210
1389
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
1211
1390
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
1212
1391
  self.gguf_writer.add_block_count(block_count)
@@ -1268,8 +1447,8 @@ class StableLMModel(Model):
1268
1447
 
1269
1448
  return [(new_name, data_torch)]
1270
1449
 
1271
- def write_tensors(self):
1272
- super().write_tensors()
1450
+ def prepare_tensors(self):
1451
+ super().prepare_tensors()
1273
1452
 
1274
1453
  if self._q_norms is not None or self._k_norms is not None:
1275
1454
  # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
@@ -1281,85 +1460,6 @@ class StableLMModel(Model):
1281
1460
  if len(norms) > 0:
1282
1461
  raise ValueError(f"Unprocessed norms: {norms}")
1283
1462
 
1284
- def write_tensors(self):
1285
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1286
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1287
- n_head = self.hparams.get("num_attention_heads")
1288
- n_kv_head = self.hparams.get("num_key_value_heads")
1289
- q_norms = dict()
1290
- k_norms = dict()
1291
- for name, data_torch in self.get_tensors():
1292
- # we don't need these
1293
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
1294
- continue
1295
-
1296
- old_dtype = data_torch.dtype
1297
-
1298
- # convert any unsupported data types to float32
1299
- if data_torch.dtype not in (torch.float16, torch.float32):
1300
- data_torch = data_torch.to(torch.float32)
1301
-
1302
- data = data_torch.squeeze().numpy()
1303
- n_dims = len(data.shape)
1304
- if name.find("q_layernorm.norms") != -1:
1305
- q_norms[name] = data
1306
- if len(q_norms) >= (block_count * n_head):
1307
- self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
1308
- continue
1309
- if name.find("k_layernorm.norms") != -1:
1310
- k_norms[name] = data
1311
- if len(k_norms) >= (block_count * n_kv_head):
1312
- self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
1313
- continue
1314
-
1315
- # map tensor names
1316
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1317
- if new_name is None:
1318
- raise ValueError(f"Can not map tensor {name!r}")
1319
-
1320
- n_dims = len(data.shape)
1321
- data_dtype = data.dtype
1322
-
1323
- # if f32 desired, convert any float16 to float32
1324
- if self.ftype == 0 and data_dtype == np.float16:
1325
- data = data.astype(np.float32)
1326
-
1327
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
1328
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
1329
- data = data.astype(np.float32)
1330
-
1331
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1332
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
1333
- data = data.astype(np.float16)
1334
-
1335
- logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1336
-
1337
- self.gguf_writer.add_tensor(new_name, data)
1338
-
1339
- def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
1340
- for bid in range(block_count):
1341
- datas = []
1342
- for xid in range(n_head):
1343
- ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
1344
- datas.append(norms[ename])
1345
- del norms[ename]
1346
- data = np.stack(datas, axis=0)
1347
- data_dtype = data.dtype
1348
- merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
1349
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
1350
- if new_name is None:
1351
- raise ValueError(f"Can not map tensor {name!r}")
1352
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
1353
- data = data.astype(np.float32)
1354
-
1355
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1356
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
1357
- data = data.astype(np.float16)
1358
-
1359
- logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
1360
-
1361
- self.gguf_writer.add_tensor(new_name, data)
1362
-
1363
1463
 
1364
1464
  @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
1365
1465
  class LlamaModel(Model):
@@ -1367,7 +1467,7 @@ class LlamaModel(Model):
1367
1467
 
1368
1468
  def set_vocab(self):
1369
1469
  try:
1370
- self. _set_vocab_sentencepiece()
1470
+ self._set_vocab_sentencepiece()
1371
1471
  except FileNotFoundError:
1372
1472
  try:
1373
1473
  self._set_vocab_llama_hf()
@@ -1391,13 +1491,29 @@ class LlamaModel(Model):
1391
1491
  super().set_gguf_parameters()
1392
1492
  hparams = self.hparams
1393
1493
  self.gguf_writer.add_vocab_size(hparams["vocab_size"])
1394
- self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
1494
+
1495
+ if "head_dim" in hparams:
1496
+ rope_dim = hparams["head_dim"]
1497
+ else:
1498
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
1499
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
1395
1500
 
1396
1501
  if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
1397
1502
  if self.hparams["rope_scaling"].get("type") == "linear":
1398
1503
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1399
1504
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
1400
1505
 
1506
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
1507
+ if tokenizer_config_file.is_file():
1508
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
1509
+ tokenizer_config_json = json.load(f)
1510
+ if "add_prefix_space" in tokenizer_config_json:
1511
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
1512
+
1513
+ # Apply to granite small models only
1514
+ if self.hparams.get("vocab_size", 32000) == 49152:
1515
+ self.gguf_writer.add_add_bos_token(False)
1516
+
1401
1517
  @staticmethod
1402
1518
  def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
1403
1519
  if n_head_kv is not None and n_head != n_head_kv:
@@ -1412,9 +1528,9 @@ class LlamaModel(Model):
1412
1528
  n_head = self.hparams["num_attention_heads"]
1413
1529
  n_kv_head = self.hparams.get("num_key_value_heads")
1414
1530
 
1415
- if name.endswith("q_proj.weight"):
1531
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
1416
1532
  data_torch = LlamaModel.permute(data_torch, n_head, n_head)
1417
- if name.endswith("k_proj.weight"):
1533
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
1418
1534
  data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
1419
1535
 
1420
1536
  # process the experts separately
@@ -1453,8 +1569,35 @@ class LlamaModel(Model):
1453
1569
 
1454
1570
  return [(self.map_tensor_name(name), data_torch)]
1455
1571
 
1456
- def write_tensors(self):
1457
- super().write_tensors()
1572
+ def prepare_tensors(self):
1573
+ if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
1574
+ if rope_scaling.get("rope_type", '').lower() == "llama3":
1575
+ base = self.hparams.get("rope_theta", 10000.0)
1576
+ dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
1577
+ freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
1578
+ factor = rope_scaling.get("factor", 8.0)
1579
+ low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
1580
+ high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
1581
+ old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
1582
+
1583
+ low_freq_wavelen = old_context_len / low_freq_factor
1584
+ high_freq_wavelen = old_context_len / high_freq_factor
1585
+ assert low_freq_wavelen != high_freq_wavelen
1586
+
1587
+ rope_factors = []
1588
+ for freq in freqs:
1589
+ wavelen = 2 * math.pi / freq
1590
+ if wavelen < high_freq_wavelen:
1591
+ rope_factors.append(1)
1592
+ elif wavelen > low_freq_wavelen:
1593
+ rope_factors.append(factor)
1594
+ else:
1595
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
1596
+ rope_factors.append(1 / ((1 - smooth) / factor + smooth))
1597
+
1598
+ self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
1599
+
1600
+ super().prepare_tensors()
1458
1601
 
1459
1602
  if self._experts is not None:
1460
1603
  # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1463,6 +1606,48 @@ class LlamaModel(Model):
1463
1606
  raise ValueError(f"Unprocessed experts: {experts}")
1464
1607
 
1465
1608
 
1609
+ @Model.register("BitnetForCausalLM")
1610
+ class BitnetModel(Model):
1611
+ model_arch = gguf.MODEL_ARCH.BITNET
1612
+
1613
+ def set_vocab(self):
1614
+ self._set_vocab_sentencepiece()
1615
+
1616
+ def set_gguf_parameters(self):
1617
+ super().set_gguf_parameters()
1618
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1619
+ self.gguf_writer.add_rope_scaling_factor(1.0)
1620
+
1621
+ def weight_quant(self, weight):
1622
+ dtype = weight.dtype
1623
+ weight = weight.float()
1624
+ s = 1 / weight.abs().mean().clamp(min=1e-5)
1625
+ weight = (weight * s).round().clamp(-1, 1) / s
1626
+ scale = weight.abs().max().unsqueeze(0)
1627
+ weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
1628
+ weight = torch.sign(weight).type(dtype)
1629
+ return weight.type(dtype), scale.type(torch.float32)
1630
+
1631
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
1632
+ new_name = self.map_tensor_name(name)
1633
+
1634
+ if any(self.match_model_tensor_name(new_name, key, bid) for key in [
1635
+ gguf.MODEL_TENSOR.ATTN_Q,
1636
+ gguf.MODEL_TENSOR.ATTN_K,
1637
+ gguf.MODEL_TENSOR.ATTN_V,
1638
+ gguf.MODEL_TENSOR.ATTN_OUT,
1639
+ gguf.MODEL_TENSOR.FFN_UP,
1640
+ gguf.MODEL_TENSOR.FFN_DOWN,
1641
+ gguf.MODEL_TENSOR.FFN_GATE,
1642
+ ]):
1643
+ # transform weight into 1/0/-1 (in fp32)
1644
+ weight_torch, scale_torch = self.weight_quant(data_torch)
1645
+ yield (new_name, weight_torch)
1646
+ yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
1647
+ else:
1648
+ yield (new_name, data_torch)
1649
+
1650
+
1466
1651
  @Model.register("GrokForCausalLM")
1467
1652
  class GrokModel(Model):
1468
1653
  model_arch = gguf.MODEL_ARCH.GROK
@@ -1475,7 +1660,6 @@ class GrokModel(Model):
1475
1660
 
1476
1661
  def set_gguf_parameters(self):
1477
1662
  super().set_gguf_parameters()
1478
- self.gguf_writer.add_name("Grok")
1479
1663
 
1480
1664
  _experts: list[dict[str, Tensor]] | None = None
1481
1665
 
@@ -1524,7 +1708,6 @@ class DbrxModel(Model):
1524
1708
  def set_gguf_parameters(self):
1525
1709
  ffn_config = self.hparams["ffn_config"]
1526
1710
  attn_config = self.hparams["attn_config"]
1527
- self.gguf_writer.add_name(self.hparams["model_type"])
1528
1711
  self.gguf_writer.add_block_count(self.hparams["n_layers"])
1529
1712
 
1530
1713
  self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
@@ -1537,7 +1720,6 @@ class DbrxModel(Model):
1537
1720
  self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
1538
1721
 
1539
1722
  self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
1540
- self.gguf_writer.add_file_type(self.ftype)
1541
1723
 
1542
1724
  self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
1543
1725
  self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1594,7 +1776,6 @@ class MiniCPMModel(Model):
1594
1776
 
1595
1777
  def set_gguf_parameters(self):
1596
1778
  block_count = self.hparams["num_hidden_layers"]
1597
- self.gguf_writer.add_name("MiniCPM")
1598
1779
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1599
1780
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
1600
1781
  self.gguf_writer.add_block_count(block_count)
@@ -1612,9 +1793,11 @@ class MiniCPMModel(Model):
1612
1793
  if n_kv_head is not None and n_head != n_kv_head:
1613
1794
  n_head = n_kv_head
1614
1795
 
1615
- return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
1616
- .swapaxes(1, 2)
1617
- .reshape(weights.shape))
1796
+ return (
1797
+ weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
1798
+ .swapaxes(1, 2)
1799
+ .reshape(weights.shape)
1800
+ )
1618
1801
 
1619
1802
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
1620
1803
  del bid # unused
@@ -1662,7 +1845,6 @@ class QwenModel(Model):
1662
1845
  self._set_vocab_qwen()
1663
1846
 
1664
1847
  def set_gguf_parameters(self):
1665
- self.gguf_writer.add_name("Qwen")
1666
1848
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1667
1849
  self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
1668
1850
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1693,6 +1875,12 @@ class Qwen2MoeModel(Model):
1693
1875
  super().set_gguf_parameters()
1694
1876
  if (n_experts := self.hparams.get("num_experts")) is not None:
1695
1877
  self.gguf_writer.add_expert_count(n_experts)
1878
+ if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
1879
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
1880
+ logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
1881
+ if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
1882
+ self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
1883
+ logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
1696
1884
 
1697
1885
  _experts: list[dict[str, Tensor]] | None = None
1698
1886
 
@@ -1732,8 +1920,8 @@ class Qwen2MoeModel(Model):
1732
1920
 
1733
1921
  return [(self.map_tensor_name(name), data_torch)]
1734
1922
 
1735
- def write_tensors(self):
1736
- super().write_tensors()
1923
+ def prepare_tensors(self):
1924
+ super().prepare_tensors()
1737
1925
 
1738
1926
  if self._experts is not None:
1739
1927
  # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -1747,7 +1935,6 @@ class GPT2Model(Model):
1747
1935
  model_arch = gguf.MODEL_ARCH.GPT2
1748
1936
 
1749
1937
  def set_gguf_parameters(self):
1750
- self.gguf_writer.add_name(self.dir_model.name)
1751
1938
  self.gguf_writer.add_block_count(self.hparams["n_layer"])
1752
1939
  self.gguf_writer.add_context_length(self.hparams["n_ctx"])
1753
1940
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1790,7 +1977,6 @@ class Phi2Model(Model):
1790
1977
  n_embd = self.find_hparam(["hidden_size", "n_embd"])
1791
1978
  n_head = self.find_hparam(["num_attention_heads", "n_head"])
1792
1979
 
1793
- self.gguf_writer.add_name("Phi2")
1794
1980
  self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
1795
1981
 
1796
1982
  self.gguf_writer.add_embedding_length(n_embd)
@@ -1823,7 +2009,7 @@ class Phi3MiniModel(Model):
1823
2009
 
1824
2010
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
1825
2011
  scores: list[float] = [-10000.0] * vocab_size
1826
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
2012
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
1827
2013
 
1828
2014
  for token_id in range(tokenizer.vocab_size()):
1829
2015
 
@@ -1852,7 +2038,7 @@ class Phi3MiniModel(Model):
1852
2038
 
1853
2039
  for key in added_tokens_json:
1854
2040
  token_id = added_tokens_json[key]
1855
- if (token_id >= vocab_size):
2041
+ if token_id >= vocab_size:
1856
2042
  logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
1857
2043
  continue
1858
2044
 
@@ -1868,8 +2054,9 @@ class Phi3MiniModel(Model):
1868
2054
  for token_id, foken_data in added_tokens_decoder.items():
1869
2055
  token_id = int(token_id)
1870
2056
  token = foken_data["content"].encode("utf-8")
1871
- if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
1872
- assert tokens[token_id] == token
2057
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2058
+ if tokens[token_id] != token:
2059
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
1873
2060
  tokens[token_id] = token
1874
2061
  scores[token_id] = -1000.0
1875
2062
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1884,8 +2071,9 @@ class Phi3MiniModel(Model):
1884
2071
  for foken_data in added_tokens:
1885
2072
  token_id = int(foken_data["id"])
1886
2073
  token = foken_data["content"].encode("utf-8")
1887
- if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
1888
- assert tokens[token_id] == token
2074
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2075
+ if tokens[token_id] != token:
2076
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
1889
2077
  tokens[token_id] = token
1890
2078
  scores[token_id] = -1000.0
1891
2079
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1912,7 +2100,6 @@ class Phi3MiniModel(Model):
1912
2100
  orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
1913
2101
  rope_dims = n_embd // n_head
1914
2102
 
1915
- self.gguf_writer.add_name("Phi3")
1916
2103
  self.gguf_writer.add_context_length(max_pos_embds)
1917
2104
  self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
1918
2105
  self.gguf_writer.add_embedding_length(n_embd)
@@ -1924,10 +2111,11 @@ class Phi3MiniModel(Model):
1924
2111
  self.gguf_writer.add_rope_dimension_count(rope_dims)
1925
2112
  self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
1926
2113
  self.gguf_writer.add_file_type(self.ftype)
2114
+ self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
1927
2115
 
1928
2116
  # write rope scaling for long context (128k) model
1929
2117
  rope_scaling = self.find_hparam(['rope_scaling'], True)
1930
- if (rope_scaling is None):
2118
+ if rope_scaling is None:
1931
2119
  return
1932
2120
 
1933
2121
  scale = max_pos_embds / orig_max_pos_embds
@@ -1936,7 +2124,7 @@ class Phi3MiniModel(Model):
1936
2124
  if len(rope_scaling_type) == 0:
1937
2125
  raise KeyError('Missing the required key rope_scaling.type')
1938
2126
 
1939
- if rope_scaling_type == 'su':
2127
+ if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
1940
2128
  attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
1941
2129
  elif rope_scaling_type == 'yarn':
1942
2130
  attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@@ -1969,7 +2157,6 @@ class PlamoModel(Model):
1969
2157
  hparams = self.hparams
1970
2158
  block_count = hparams["num_hidden_layers"]
1971
2159
 
1972
- self.gguf_writer.add_name("PLaMo")
1973
2160
  self.gguf_writer.add_context_length(4096) # not in config.json
1974
2161
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
1975
2162
  self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
@@ -2014,7 +2201,6 @@ class CodeShellModel(Model):
2014
2201
  def set_gguf_parameters(self):
2015
2202
  block_count = self.hparams["n_layer"]
2016
2203
 
2017
- self.gguf_writer.add_name("CodeShell")
2018
2204
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
2019
2205
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
2020
2206
  self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -2066,7 +2252,7 @@ class InternLM2Model(Model):
2066
2252
  logger.error(f'Error: Missing {tokenizer_path}')
2067
2253
  sys.exit(1)
2068
2254
 
2069
- sentencepiece_model = model.ModelProto()
2255
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
2070
2256
  sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
2071
2257
  add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
2072
2258
 
@@ -2094,6 +2280,9 @@ class InternLM2Model(Model):
2094
2280
  toktype = SentencePieceTokenTypes.UNUSED
2095
2281
  elif tokenizer.IsByte(token_id):
2096
2282
  toktype = SentencePieceTokenTypes.BYTE
2283
+ # take care of ununsed raw token
2284
+ if piece.startswith('[UNUSED'):
2285
+ toktype = SentencePieceTokenTypes.UNUSED
2097
2286
 
2098
2287
  tokens.append(text)
2099
2288
  scores.append(score)
@@ -2109,6 +2298,49 @@ class InternLM2Model(Model):
2109
2298
  scores.append(-1000.0)
2110
2299
  toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
2111
2300
 
2301
+ chat_eos_token = '<|im_end|>'
2302
+ chat_eos_token_id = None
2303
+
2304
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2305
+ if tokenizer_config_file.is_file():
2306
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2307
+ tokenizer_config_json = json.load(f)
2308
+ added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
2309
+ for token_id, foken_data in added_tokens_decoder.items():
2310
+ token_id = int(token_id)
2311
+ token = foken_data["content"]
2312
+ if token == chat_eos_token:
2313
+ chat_eos_token_id = token_id
2314
+ token = token.encode("utf-8")
2315
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2316
+ if tokens[token_id] != token:
2317
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
2318
+ tokens[token_id] = token
2319
+ scores[token_id] = -1000.0
2320
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2321
+ if foken_data.get("special"):
2322
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2323
+
2324
+ tokenizer_file = self.dir_model / 'tokenizer.json'
2325
+ if tokenizer_file.is_file():
2326
+ with open(tokenizer_file, "r", encoding="utf-8") as f:
2327
+ tokenizer_json = json.load(f)
2328
+ added_tokens = tokenizer_json.get("added_tokens", [])
2329
+ for foken_data in added_tokens:
2330
+ token_id = int(foken_data["id"])
2331
+ token = foken_data["content"]
2332
+ if token == chat_eos_token:
2333
+ chat_eos_token_id = token_id
2334
+ token = token.encode("utf-8")
2335
+ if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
2336
+ if tokens[token_id] != token:
2337
+ logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
2338
+ tokens[token_id] = token
2339
+ scores[token_id] = -1000.0
2340
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
2341
+ if foken_data.get("special"):
2342
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
2343
+
2112
2344
  self.gguf_writer.add_tokenizer_model("llama")
2113
2345
  self.gguf_writer.add_tokenizer_pre("default")
2114
2346
  self.gguf_writer.add_token_list(tokens)
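Note: the block added above derives `chat_eos_token_id` from the added-token metadata instead of guessing from the directory name. A minimal sketch of the `tokenizer_config.json` shape it consumes (the id 92542 and the file contents are made up for illustration):

```python
# Minimal sketch of the tokenizer_config.json structure consumed above
# (the token id 92542 is a hypothetical example, not taken from a real model).
import json

tokenizer_config_json = json.loads("""
{
  "added_tokens_decoder": {
    "92542": { "content": "<|im_end|>", "special": true }
  }
}
""")

chat_eos_token_id = None
for token_id, token_data in tokenizer_config_json.get("added_tokens_decoder", {}).items():
    if token_data["content"] == "<|im_end|>":
        chat_eos_token_id = int(token_id)

print(chat_eos_token_id)  # 92542
```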
@@ -2118,37 +2350,17 @@ class InternLM2Model(Model):
2118
2350
 
2119
2351
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
2120
2352
  old_eos = special_vocab.special_token_ids["eos"]
2121
- if "chat" in os.path.basename(self.dir_model.absolute()):
2353
+ if chat_eos_token_id is not None:
2122
2354
  # For the chat model, we replace the eos with '<|im_end|>'.
2123
2355
  # TODO: this is a hack, should be fixed
2124
2356
  # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
2125
- special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
2126
- logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
2127
- in chat mode so that the conversation can end normally.")
2357
+ special_vocab.special_token_ids["eos"] = chat_eos_token_id
2358
+ logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
2359
+ " in chat mode so that the conversation can end normally.")
2128
2360
 
2129
2361
  special_vocab.add_to_gguf(self.gguf_writer)
2130
2362
 
2131
- def _try_get_sft_eos(self, tokenizer):
2132
- unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
2133
- im_end_list = tokenizer.Encode('<|im_end|>')
2134
- eos_token = None
2135
- assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
2136
- if len(unused_145_list) == 1:
2137
- eos_token = unused_145_list[0]
2138
- if len(im_end_list) == 1:
2139
- eos_token = im_end_list[0]
2140
- assert eos_token
2141
- return eos_token
2142
-
2143
- def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
2144
- if n_head_kv is not None and n_head != n_head_kv:
2145
- n_head = n_head_kv
2146
- return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
2147
- .swapaxes(1, 2)
2148
- .reshape(weights.shape))
2149
-
2150
2363
  def set_gguf_parameters(self):
2151
- self.gguf_writer.add_name("InternLM2")
2152
2364
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
2153
2365
  self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
2154
2366
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -2158,30 +2370,30 @@ in chat mode so that the conversation can end normally.")
2158
2370
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
2159
2371
  self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
2160
2372
  self.gguf_writer.add_file_type(self.ftype)
2373
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
2374
+ if self.hparams["rope_scaling"].get("type") == "linear":
2375
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
2376
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
2161
2377
 
2162
2378
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2163
2379
  num_heads = self.hparams["num_attention_heads"]
2164
2380
  num_kv_heads = self.hparams["num_key_value_heads"]
2165
- hidden_size = self.hparams["hidden_size"]
2381
+ n_embd = self.hparams["hidden_size"]
2166
2382
  q_per_kv = num_heads // num_kv_heads
2167
- head_dim = hidden_size // num_heads
2383
+ head_dim = n_embd // num_heads
2168
2384
  num_groups = num_heads // q_per_kv
2169
2385
 
2170
- qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
2171
-
2172
- if re.match(qkv_pattern, name):
2173
- bid = re.findall(qkv_pattern, name)[0]
2386
+ if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
2174
2387
  qkv = data_torch
2175
- # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
2176
- qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
2177
- q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
2388
+
2389
+ qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
2390
+ q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
2391
+
2178
2392
  # The model weights of q and k require additional reshape.
2179
- # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
2180
- q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
2181
- # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
2182
- k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
2183
- # v = rearrange(v, " o g n i -> o (g n i)").T
2184
- v = v.reshape((v.shape[0], -1)).T
2393
+ q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
2394
+ k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
2395
+ v = v.reshape((-1, v.shape[-1]))
2396
+
2185
2397
  return [
2186
2398
  (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
2187
2399
  (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
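Note: the rewritten `modify_tensors` above splits the fused `wqkv` weight by reshaping instead of regex-matching the tensor name. A shape-only sketch with tiny, purely illustrative dimensions (the subsequent `LlamaModel.permute` step is omitted):

```python
# Shape-only sketch of the fused wqkv split above (tiny illustrative dimensions).
import torch

num_heads, num_kv_heads, head_dim = 8, 2, 4
n_embd = num_heads * head_dim            # 32
q_per_kv = num_heads // num_kv_heads     # 4
num_groups = num_heads // q_per_kv       # 2

wqkv = torch.zeros((num_groups * (q_per_kv + 2) * head_dim, n_embd))

qkv = wqkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1]

print(q.reshape((-1, q.shape[-1])).shape)  # torch.Size([32, 32]) -> ATTN_Q (before permute)
print(k.reshape((-1, k.shape[-1])).shape)  # torch.Size([8, 32])  -> ATTN_K (before permute)
print(v.reshape((-1, v.shape[-1])).shape)  # torch.Size([8, 32])  -> ATTN_V
```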
@@ -2308,13 +2520,55 @@ class GemmaModel(Model):
2308
2520
  special_vocab._set_special_token("middle", 68)
2309
2521
  special_vocab._set_special_token("fsep", 70)
2310
2522
  special_vocab._set_special_token("eot", 107)
2523
+ special_vocab.chat_template = None # do not add it twice
2311
2524
  special_vocab.add_to_gguf(self.gguf_writer)
2312
2525
 
2526
+ self.gguf_writer.add_add_space_prefix(False)
2527
+
2528
+ def set_gguf_parameters(self):
2529
+ hparams = self.hparams
2530
+ block_count = hparams["num_hidden_layers"]
2531
+
2532
+ self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
2533
+ self.gguf_writer.add_embedding_length(hparams["hidden_size"])
2534
+ self.gguf_writer.add_block_count(block_count)
2535
+ self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
2536
+ self.gguf_writer.add_head_count(hparams["num_attention_heads"])
2537
+ self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
2538
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
2539
+ self.gguf_writer.add_key_length(hparams["head_dim"])
2540
+ self.gguf_writer.add_value_length(hparams["head_dim"])
2541
+ self.gguf_writer.add_file_type(self.ftype)
2542
+
2543
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2544
+ del bid # unused
2545
+
2546
+ # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
2547
+ # To prevent errors, skip loading lm_head.weight.
2548
+ if name == "lm_head.weight":
2549
+ logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
2550
+ return []
2551
+
2552
+ # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
2553
+ if name.endswith("norm.weight"):
2554
+ data_torch = data_torch + 1
2555
+
2556
+ return [(self.map_tensor_name(name), data_torch)]
2557
+
2558
+
2559
+ @Model.register("Gemma2ForCausalLM")
2560
+ class Gemma2Model(Model):
2561
+ model_arch = gguf.MODEL_ARCH.GEMMA2
2562
+
2563
+ def set_vocab(self):
2564
+ self._set_vocab_sentencepiece()
2565
+
2566
+ self.gguf_writer.add_add_space_prefix(False)
2567
+
2313
2568
  def set_gguf_parameters(self):
2314
2569
  hparams = self.hparams
2315
2570
  block_count = hparams["num_hidden_layers"]
2316
2571
 
2317
- self.gguf_writer.add_name(self.dir_model.name)
2318
2572
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
2319
2573
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
2320
2574
  self.gguf_writer.add_block_count(block_count)
@@ -2325,6 +2579,13 @@ class GemmaModel(Model):
2325
2579
  self.gguf_writer.add_key_length(hparams["head_dim"])
2326
2580
  self.gguf_writer.add_value_length(hparams["head_dim"])
2327
2581
  self.gguf_writer.add_file_type(self.ftype)
2582
+ self.gguf_writer.add_attn_logit_softcapping(
2583
+ self.hparams["attn_logit_softcapping"]
2584
+ )
2585
+ self.gguf_writer.add_final_logit_softcapping(
2586
+ self.hparams["final_logit_softcapping"]
2587
+ )
2588
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
2328
2589
 
2329
2590
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2330
2591
  del bid # unused
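Note: the new `Gemma2Model.set_gguf_parameters` above only records the two softcapping constants in the GGUF metadata; the capping itself is applied at inference time. As commonly described for Gemma-2 it is a tanh squashing of the logits; the sketch below and its cap values (50.0 / 30.0 are the figures usually quoted for Gemma-2) are assumptions, not read from this diff:

```python
# Minimal sketch of tanh soft-capping as commonly described for Gemma-2.
# The cap values (50.0 for attention logits, 30.0 for final logits) are assumptions.
import math

def softcap(x: float, cap: float) -> float:
    # squashes x smoothly into the open range (-cap, cap)
    return cap * math.tanh(x / cap)

print(softcap(100.0, 30.0))   # ~29.92, large logits saturate near the cap
print(softcap(5.0, 30.0))     # ~4.95, small logits are nearly unchanged
```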
@@ -2366,39 +2627,7 @@ class MambaModel(Model):
2366
2627
  self._set_vocab_sentencepiece()
2367
2628
  else:
2368
2629
  # Use the GPT-NeoX tokenizer when no tokenizer files are present
2369
- tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
2370
- logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
2371
- neox_reader = gguf.GGUFReader(tokenizer_path, "r")
2372
-
2373
- field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
2374
- self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
2375
-
2376
- field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
2377
- self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
2378
-
2379
- field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
2380
- assert field
2381
- self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
2382
-
2383
- field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
2384
- assert field
2385
- self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
2386
-
2387
- field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
2388
- assert field
2389
- self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
2390
-
2391
- field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
2392
- self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
2393
-
2394
- field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
2395
- self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
2396
-
2397
- field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
2398
- self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
2399
-
2400
- field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
2401
- self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)
2630
+ self._set_vocab_builtin("gpt-neox", vocab_size)
2402
2631
 
2403
2632
  def set_gguf_parameters(self):
2404
2633
  d_model = self.find_hparam(["hidden_size", "d_model"])
@@ -2414,7 +2643,6 @@ class MambaModel(Model):
2414
2643
  # Fail early for models which don't have a block expansion factor of 2
2415
2644
  assert d_inner == 2 * d_model
2416
2645
 
2417
- self.gguf_writer.add_name(self.dir_model.name)
2418
2646
  self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
2419
2647
  self.gguf_writer.add_embedding_length(d_model)
2420
2648
  self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2521,18 +2749,20 @@ class JinaBertV2Model(BertModel):
2521
2749
 
2522
2750
  def get_tensors(self):
2523
2751
  for name, data in super().get_tensors():
2524
- if 'gated_layers' in name:
2752
+ if 'gated_layer' in name:
2525
2753
  d1 = data[:self.intermediate_size, :]
2526
2754
  name1 = name.replace('gated_layers', 'gated_layers_w')
2755
+ name1 = name1.replace('up_gated_layer', 'gated_layers_v')
2527
2756
  d2 = data[self.intermediate_size:, :]
2528
2757
  name2 = name.replace('gated_layers', 'gated_layers_v')
2758
+ name2 = name2.replace('up_gated_layer', 'gated_layers_w')
2529
2759
  yield name1, d1
2530
2760
  yield name2, d2
2531
2761
  continue
2532
2762
 
2533
2763
  yield name, data
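Note: the `gated_layer` handling above now also matches `up_gated_layer` tensors and swaps the target names accordingly. A tiny sketch of the underlying split, with purely illustrative sizes:

```python
# Tiny sketch of the gated-layer split above: a fused (2*I, H) weight is cut
# into two halves along dim 0 (sizes here are illustrative).
import torch

intermediate_size, hidden = 6, 4    # hypothetical sizes
data = torch.arange(2 * intermediate_size * hidden, dtype=torch.float32)
data = data.reshape(2 * intermediate_size, hidden)

d1 = data[:intermediate_size, :]    # renamed to ...gated_layers_w in the 'gated_layers' case
d2 = data[intermediate_size:, :]    # renamed to ...gated_layers_v in the 'gated_layers' case

print(d1.shape, d2.shape)           # torch.Size([6, 4]) torch.Size([6, 4])
```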
2534
2764
 
2535
- def set_vocab(self, *args, **kwargs):
2765
+ def set_vocab(self):
2536
2766
  tokenizer_class = 'BertTokenizer'
2537
2767
  with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
2538
2768
  tokenizer_class = json.load(f)['tokenizer_class']
@@ -2548,17 +2778,92 @@ class JinaBertV2Model(BertModel):
2548
2778
  self.gguf_writer.add_add_eos_token(True)
2549
2779
 
2550
2780
 
2551
- @Model.register("ArcticForCausalLM")
2552
- class ArcticModel(Model):
2553
- model_arch = gguf.MODEL_ARCH.ARCTIC
2554
-
2555
- def set_vocab(self):
2556
- # The reason for using a custom implementation here is that the
2557
- # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
2558
- # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
2559
- from sentencepiece import SentencePieceProcessor
2781
+ @Model.register("OpenELMForCausalLM")
2782
+ class OpenELMModel(Model):
2783
+ model_arch = gguf.MODEL_ARCH.OPENELM
2560
2784
 
2561
- tokenizer_path = self.dir_model / 'tokenizer.model'
2785
+ @staticmethod
2786
+ def _make_divisible(v: float | int, divisor: int) -> int:
2787
+ # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
2788
+ new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
2789
+ # Make sure that round down does not go down by more than 10%.
2790
+ if new_v < 0.9 * v:
2791
+ new_v += divisor
2792
+ return new_v
2793
+
2794
+ def __init__(self, *args, **kwargs):
2795
+ super().__init__(*args, **kwargs)
2796
+
2797
+ ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
2798
+ ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
2799
+ self._n_embd: int = self.hparams["model_dim"]
2800
+ self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
2801
+ self._num_query_heads: list[int] = self.hparams["num_query_heads"]
2802
+ self._ffn_dims: list[int] = [
2803
+ OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
2804
+ for multiplier in ffn_multipliers
2805
+ ]
2806
+ assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
2807
+ assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
2808
+
2809
+ # Uses the tokenizer from meta-llama/Llama-2-7b-hf
2810
+ def set_vocab(self):
2811
+ try:
2812
+ self._set_vocab_sentencepiece()
2813
+ except FileNotFoundError:
2814
+ self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
2815
+
2816
+ def set_gguf_parameters(self):
2817
+ n_embd = self._n_embd
2818
+ head_dim = self.hparams["head_dim"]
2819
+ rot_pct = 1.0
2820
+ assert self.block_count == len(self._num_kv_heads)
2821
+ assert self.block_count == len(self._num_query_heads)
2822
+ assert self.block_count == len(self._ffn_dims)
2823
+
2824
+ self.gguf_writer.add_block_count(self.block_count)
2825
+ self.gguf_writer.add_context_length(self.hparams["max_context_length"])
2826
+ self.gguf_writer.add_embedding_length(n_embd)
2827
+ self.gguf_writer.add_feed_forward_length(self._ffn_dims)
2828
+ self.gguf_writer.add_head_count(self._num_query_heads)
2829
+ self.gguf_writer.add_head_count_kv(self._num_kv_heads)
2830
+ self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
2831
+ # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
2832
+ self.gguf_writer.add_layer_norm_rms_eps(1e-6)
2833
+ self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
2834
+ self.gguf_writer.add_key_length(head_dim)
2835
+ self.gguf_writer.add_value_length(head_dim)
2836
+ self.gguf_writer.add_file_type(self.ftype)
2837
+
2838
+ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
2839
+ if "n_layers" in keys:
2840
+ return self.hparams["num_transformer_layers"]
2841
+
2842
+ return super().find_hparam(keys, optional)
2843
+
2844
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2845
+
2846
+ # split ff
2847
+ if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
2848
+ ff_dim = self._ffn_dims[bid]
2849
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
2850
+ yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
2851
+ return
2852
+
2853
+ yield (self.map_tensor_name(name), data_torch)
2854
+
2855
+
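Note: the new `OpenELMModel` above derives a per-layer FFN width from `ffn_multipliers` via `_make_divisible`. A standalone sketch of that computation; the multipliers, `model_dim` and divisor below are hypothetical values, not taken from a real OpenELM config:

```python
# Sketch of the per-layer FFN width computation used by OpenELMModel above.
# The multipliers, model_dim and divisor below are hypothetical values.

def make_divisible(v: float, divisor: int) -> int:
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
    # make sure rounding down does not go down by more than 10%
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

model_dim = 1280
ffn_dim_divisor = 256
ffn_multipliers = [0.5, 1.0, 2.5, 4.0]

ffn_dims = [make_divisible(m * model_dim, ffn_dim_divisor) for m in ffn_multipliers]
print(ffn_dims)  # [768, 1280, 3328, 5120]
```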
2856
+ @Model.register("ArcticForCausalLM")
2857
+ class ArcticModel(Model):
2858
+ model_arch = gguf.MODEL_ARCH.ARCTIC
2859
+
2860
+ def set_vocab(self):
2861
+ # The reason for using a custom implementation here is that the
2862
+ # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
2863
+ # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
2864
+ from sentencepiece import SentencePieceProcessor
2865
+
2866
+ tokenizer_path = self.dir_model / 'tokenizer.model'
2562
2867
 
2563
2868
  if not tokenizer_path.is_file():
2564
2869
  logger.error(f'Error: Missing {tokenizer_path}')
@@ -2572,7 +2877,7 @@ class ArcticModel(Model):
2572
2877
 
2573
2878
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
2574
2879
  scores: list[float] = [-10000.0] * vocab_size
2575
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
2880
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
2576
2881
 
2577
2882
  for token_id in range(tokenizer.vocab_size()):
2578
2883
 
@@ -2605,7 +2910,7 @@ class ArcticModel(Model):
2605
2910
  added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
2606
2911
  for token_id, token_json in added_tokens_decoder.items():
2607
2912
  token_id = int(token_id)
2608
- if (token_id >= vocab_size):
2913
+ if token_id >= vocab_size:
2609
2914
  logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
2610
2915
  continue
2611
2916
 
@@ -2689,8 +2994,8 @@ class ArcticModel(Model):
2689
2994
 
2690
2995
  return [(self.map_tensor_name(name), data_torch)]
2691
2996
 
2692
- def write_tensors(self):
2693
- super().write_tensors()
2997
+ def prepare_tensors(self):
2998
+ super().prepare_tensors()
2694
2999
 
2695
3000
  if self._experts is not None:
2696
3001
  # flatten `list[dict[str, Tensor]]` into `list[str]`
@@ -2699,6 +3004,499 @@ class ArcticModel(Model):
2699
3004
  raise ValueError(f"Unprocessed experts: {experts}")
2700
3005
 
2701
3006
 
3007
+ @Model.register("DeepseekV2ForCausalLM")
3008
+ class DeepseekV2Model(Model):
3009
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK2
3010
+
3011
+ def set_vocab(self):
3012
+ self._set_vocab_gpt2()
3013
+
3014
+ def set_gguf_parameters(self):
3015
+ super().set_gguf_parameters()
3016
+ hparams = self.hparams
3017
+
3018
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
3019
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3020
+ if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
3021
+ self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
3022
+ self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
3023
+ self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
3024
+ self.gguf_writer.add_value_length(hparams["v_head_dim"])
3025
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
3026
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
3027
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
3028
+ self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
3029
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
3030
+
3031
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
3032
+ if self.hparams["rope_scaling"].get("type") == "yarn":
3033
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
3034
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
3035
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
3036
+ self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
3037
+
3038
+ _experts: list[dict[str, Tensor]] | None = None
3039
+
3040
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3041
+ # process the experts separately
3042
+ if name.find("mlp.experts") != -1:
3043
+ n_experts = self.hparams["n_routed_experts"]
3044
+ assert bid is not None
3045
+
3046
+ if self._experts is None:
3047
+ self._experts = [{} for _ in range(self.block_count)]
3048
+
3049
+ self._experts[bid][name] = data_torch
3050
+
3051
+ if len(self._experts[bid]) >= n_experts * 3:
3052
+ tensors: list[tuple[str, Tensor]] = []
3053
+
3054
+ # merge the experts into a single 3d tensor
3055
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
3056
+ datas: list[Tensor] = []
3057
+
3058
+ for xid in range(n_experts):
3059
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
3060
+ datas.append(self._experts[bid][ename])
3061
+ del self._experts[bid][ename]
3062
+
3063
+ data_torch = torch.stack(datas, dim=0)
3064
+
3065
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
3066
+
3067
+ new_name = self.map_tensor_name(merged_name)
3068
+
3069
+ tensors.append((new_name, data_torch))
3070
+ return tensors
3071
+ else:
3072
+ return []
3073
+
3074
+ return [(self.map_tensor_name(name), data_torch)]
3075
+
3076
+ def prepare_tensors(self):
3077
+ super().prepare_tensors()
3078
+
3079
+ if self._experts is not None:
3080
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
3081
+ experts = [k for d in self._experts for k in d.keys()]
3082
+ if len(experts) > 0:
3083
+ raise ValueError(f"Unprocessed experts: {experts}")
3084
+
3085
+
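Note: `DeepseekV2Model.modify_tensors` above buffers per-expert weights and merges them once all `n_experts * 3` tensors for a layer have arrived. A shape-only sketch of the merge, with tiny illustrative sizes:

```python
# Shape-only sketch of the expert merge above: n_experts 2-D weights are
# stacked into one 3-D tensor per projection (tiny illustrative sizes).
import torch

n_experts, n_ff, n_embd = 4, 8, 16                  # hypothetical sizes
per_expert = [torch.zeros((n_ff, n_embd)) for _ in range(n_experts)]

merged = torch.stack(per_expert, dim=0)
print(merged.shape)  # torch.Size([4, 8, 16]) -> one tensor (e.g. gate_proj) across all experts
```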
3086
+ @Model.register("T5WithLMHeadModel")
3087
+ @Model.register("T5ForConditionalGeneration")
3088
+ @Model.register("MT5ForConditionalGeneration")
3089
+ @Model.register("UMT5ForConditionalGeneration")
3090
+ class T5Model(Model):
3091
+ model_arch = gguf.MODEL_ARCH.T5
3092
+
3093
+ def __init__(self, *args, **kwargs):
3094
+ super().__init__(*args, **kwargs)
3095
+ self.shared_token_embeddings_found = False
3096
+
3097
+ def set_vocab(self):
3098
+ # to avoid TypeError: Descriptors cannot be created directly
3099
+ # exception when importing sentencepiece_model_pb2
3100
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
3101
+ from sentencepiece import SentencePieceProcessor
3102
+ from sentencepiece import sentencepiece_model_pb2 as model
3103
+
3104
+ tokenizer_path = self.dir_model / 'tokenizer.model'
3105
+
3106
+ # many older models use spiece.model tokenizer model filename
3107
+ if not tokenizer_path.is_file():
3108
+ tokenizer_path = self.dir_model / 'spiece.model'
3109
+
3110
+ if not tokenizer_path.is_file():
3111
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
3112
+
3113
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
3114
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
3115
+
3116
+ # some models like Pile-T5 family use BPE tokenizer instead of Unigram
3117
+ if sentencepiece_model.trainer_spec.model_type == 2: # BPE
3118
+ # assure the tokenizer model file name is correct
3119
+ assert tokenizer_path.name == 'tokenizer.model'
3120
+ return self._set_vocab_sentencepiece()
3121
+ else:
3122
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
3123
+
3124
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
3125
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
3126
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
3127
+
3128
+ tokenizer = SentencePieceProcessor()
3129
+ tokenizer.LoadFromFile(str(tokenizer_path))
3130
+
3131
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
3132
+
3133
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
3134
+ scores: list[float] = [-10000.0] * vocab_size
3135
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
3136
+
3137
+ for token_id in range(tokenizer.vocab_size()):
3138
+ piece = tokenizer.IdToPiece(token_id)
3139
+ text = piece.encode("utf-8")
3140
+ score = tokenizer.GetScore(token_id)
3141
+
3142
+ toktype = SentencePieceTokenTypes.NORMAL
3143
+ if tokenizer.IsUnknown(token_id):
3144
+ toktype = SentencePieceTokenTypes.UNKNOWN
3145
+ elif tokenizer.IsControl(token_id):
3146
+ toktype = SentencePieceTokenTypes.CONTROL
3147
+ elif tokenizer.IsUnused(token_id):
3148
+ toktype = SentencePieceTokenTypes.UNUSED
3149
+ elif tokenizer.IsByte(token_id):
3150
+ toktype = SentencePieceTokenTypes.BYTE
3151
+
3152
+ tokens[token_id] = text
3153
+ scores[token_id] = score
3154
+ toktypes[token_id] = toktype
3155
+
3156
+ added_tokens_file = self.dir_model / 'added_tokens.json'
3157
+ if added_tokens_file.is_file():
3158
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
3159
+ added_tokens_json = json.load(f)
3160
+ for key in added_tokens_json:
3161
+ token_id = added_tokens_json[key]
3162
+ if token_id >= vocab_size:
3163
+ logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
3164
+ continue
3165
+
3166
+ tokens[token_id] = key.encode("utf-8")
3167
+ scores[token_id] = -1000.0
3168
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
3169
+
3170
+ if vocab_size > len(tokens):
3171
+ pad_count = vocab_size - len(tokens)
3172
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
3173
+ for i in range(1, pad_count + 1):
3174
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
3175
+ scores.append(-1000.0)
3176
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
3177
+
3178
+ self.gguf_writer.add_tokenizer_model("t5")
3179
+ self.gguf_writer.add_tokenizer_pre("default")
3180
+ self.gguf_writer.add_token_list(tokens)
3181
+ self.gguf_writer.add_token_scores(scores)
3182
+ self.gguf_writer.add_token_types(toktypes)
3183
+ self.gguf_writer.add_add_space_prefix(add_prefix)
3184
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
3185
+ if precompiled_charsmap:
3186
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
3187
+
3188
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3189
+ special_vocab.add_to_gguf(self.gguf_writer)
3190
+
3191
+ self.gguf_writer.add_add_bos_token(False)
3192
+ self.gguf_writer.add_add_eos_token(True)
3193
+
3194
+ def set_gguf_parameters(self):
3195
+ if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
3196
+ logger.warning("Couldn't find context length in config.json, assuming default value of 512")
3197
+ n_ctx = 512
3198
+ self.gguf_writer.add_context_length(n_ctx)
3199
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
3200
+ self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
3201
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
3202
+ self.gguf_writer.add_head_count(self.hparams["num_heads"])
3203
+ self.gguf_writer.add_key_length(self.hparams["d_kv"])
3204
+ self.gguf_writer.add_value_length(self.hparams["d_kv"])
3205
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
3206
+ self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
3207
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
3208
+ self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
3209
+ self.gguf_writer.add_file_type(self.ftype)
3210
+
3211
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3212
+ del bid # unused
3213
+
3214
+ # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
3215
+ # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
3216
+ # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
3217
+ # and decoder and ignore the remaining ones.
3218
+ if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
3219
+ if not self.shared_token_embeddings_found:
3220
+ name = "shared.weight"
3221
+ self.shared_token_embeddings_found = True
3222
+ else:
3223
+ logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
3224
+ return []
3225
+
3226
+ return [(self.map_tensor_name(name), data_torch)]
3227
+
3228
+
3229
+ @Model.register("JAISLMHeadModel")
3230
+ class JaisModel(Model):
3231
+ model_arch = gguf.MODEL_ARCH.JAIS
3232
+
3233
+ def __init__(self, *args, **kwargs):
3234
+ super().__init__(*args, **kwargs)
3235
+
3236
+ # SwiGLU activation
3237
+ assert self.hparams["activation_function"] == "swiglu"
3238
+ # ALiBi position embedding
3239
+ assert self.hparams["position_embedding_type"] == "alibi"
3240
+
3241
+ # Embeddings scale
3242
+ self.embeddings_scale = 1.0
3243
+ # note: For some JAIS flavors, output is tied to (same as) wte in original model
3244
+ self.output_is_wte = False
3245
+ if 'mup_embeddings_scale' in self.hparams:
3246
+ self.output_is_wte = True # Hack (?)
3247
+ self.embeddings_scale = self.hparams['mup_embeddings_scale']
3248
+ elif 'embeddings_scale' in self.hparams:
3249
+ self.embeddings_scale = self.hparams['embeddings_scale']
3250
+ else:
3251
+ assert False
3252
+
3253
+ self.width_scale = 1.0
3254
+ if 'mup_output_alpha' in self.hparams:
3255
+ assert 'mup_width_scale' in self.hparams
3256
+ self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
3257
+ elif 'width_scale' in self.hparams:
3258
+ self.width_scale = self.hparams['width_scale']
3259
+ else:
3260
+ assert False
3261
+
3262
+ self.max_alibi_bias = 8.0
3263
+
3264
+ def set_vocab(self):
3265
+ self._set_vocab_gpt2()
3266
+
3267
+ def set_gguf_parameters(self):
3268
+ self.gguf_writer.add_block_count(self.hparams["n_layer"])
3269
+ self.gguf_writer.add_context_length(self.hparams["n_positions"])
3270
+ self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
3271
+ self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
3272
+ self.gguf_writer.add_head_count(self.hparams["n_head"])
3273
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
3274
+ self.gguf_writer.add_file_type(self.ftype)
3275
+
3276
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3277
+ del bid # unused
3278
+
3279
+ tensors: list[tuple[str, Tensor]] = []
3280
+
3281
+ # we don't need these
3282
+ if name.endswith((".attn.bias")):
3283
+ return tensors
3284
+
3285
+ if name.endswith(("relative_pe.slopes")):
3286
+ # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
3287
+ # Some other models have max_alibi_bias spelled out explicitly in the hyperparams,
3288
+ # but Jais's PyTorch model simply precalculates the slope values and places them
3289
+ # in relative_pe.slopes
3290
+ n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
3291
+ first_val = float(data_torch[0].item())
3292
+ self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
3293
+
3294
+ return tensors
3295
+
3296
+ if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
3297
+ data_torch = data_torch.transpose(1, 0)
3298
+
3299
+ new_name = self.map_tensor_name(name)
3300
+
3301
+ if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
3302
+ tensors.append((new_name, data_torch * self.embeddings_scale))
3303
+ if self.output_is_wte:
3304
+ tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
3305
+ elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
3306
+ assert not self.output_is_wte
3307
+ tensors.append((new_name, data_torch * self.width_scale))
3308
+ else:
3309
+ tensors.append((new_name, data_torch))
3310
+
3311
+ return tensors
3312
+
3313
+ def prepare_tensors(self):
3314
+ super().prepare_tensors()
3315
+ self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
3316
+
3317
+
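Note: `JaisModel` above recovers `max_alibi_bias` by inverting the precomputed slope tensor. Assuming the usual ALiBi slope layout (first slope = 2**(-max_bias / n) for n heads, n a power of two), the inversion returns the original bias; the head count below is a hypothetical example:

```python
# Sketch of the ALiBi inversion above, assuming the usual slope layout
# (first slope = 2**(-max_bias / n) for n heads, n a power of two).
import math

n_head = 32                      # hypothetical head count (power of two)
max_bias = 8.0
first_slope = 2 ** (-max_bias / n_head)

n_head_closest_log2 = 2 ** math.floor(math.log2(n_head))
recovered = -round(math.log2(first_slope) * n_head_closest_log2)
print(recovered)  # 8
```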
3318
+ @Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
3319
+ class ChatGLMModel(Model):
3320
+ model_arch = gguf.MODEL_ARCH.CHATGLM
3321
+
3322
+ def set_vocab_chatglm3(self):
3323
+ dir_model = self.dir_model
3324
+ hparams = self.hparams
3325
+ tokens: list[bytes] = []
3326
+ toktypes: list[int] = []
3327
+ scores: list[float] = []
3328
+
3329
+ from transformers import AutoTokenizer
3330
+ tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
3331
+ vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
3332
+ assert max(tokenizer.get_vocab().values()) < vocab_size
3333
+ role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
3334
+ special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
3335
+ for token_id in range(vocab_size):
3336
+ piece = tokenizer._convert_id_to_token(token_id)
3337
+ if token_id == 0:
3338
+ piece = "<unk>"
3339
+ elif token_id == 1:
3340
+ piece = "<bos>"
3341
+ elif token_id == 2:
3342
+ piece = "<eos>"
3343
+
3344
+ text = piece.encode("utf-8")
3345
+ score = 0.0
3346
+ # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
3347
+ # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
3348
+ if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
3349
+ score = tokenizer.tokenizer.sp_model.get_score(token_id)
3350
+
3351
+ if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
3352
+ if piece in special_tokens:
3353
+ toktype = SentencePieceTokenTypes.CONTROL
3354
+ elif len(piece) == 0:
3355
+ text = f"[PAD{token_id}]".encode("utf-8")
3356
+ toktype = SentencePieceTokenTypes.UNUSED
3357
+ else:
3358
+ toktype = SentencePieceTokenTypes.USER_DEFINED
3359
+ tokens.append(text)
3360
+ scores.append(score)
3361
+ toktypes.append(toktype)
3362
+ continue
3363
+
3364
+ toktype = SentencePieceTokenTypes.NORMAL
3365
+ if tokenizer.tokenizer.sp_model.is_unknown(token_id):
3366
+ toktype = SentencePieceTokenTypes.UNKNOWN
3367
+ elif tokenizer.tokenizer.sp_model.is_control(token_id):
3368
+ toktype = SentencePieceTokenTypes.CONTROL
3369
+ elif tokenizer.tokenizer.sp_model.is_unused(token_id):
3370
+ toktype = SentencePieceTokenTypes.UNUSED
3371
+ elif tokenizer.tokenizer.sp_model.is_byte(token_id):
3372
+ toktype = SentencePieceTokenTypes.BYTE
3373
+
3374
+ tokens.append(text)
3375
+ scores.append(score)
3376
+ toktypes.append(toktype)
3377
+
3378
+ self.gguf_writer.add_tokenizer_model("llama")
3379
+ # glm3 needs prefix and suffix formatted as:
3380
+ # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
3381
+ self.gguf_writer.add_tokenizer_pre("chatglm-spm")
3382
+ self.gguf_writer.add_token_list(tokens)
3383
+ self.gguf_writer.add_token_scores(scores)
3384
+ self.gguf_writer.add_token_types(toktypes)
3385
+
3386
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3387
+ special_vocab.add_to_gguf(self.gguf_writer)
3388
+
3389
+ @staticmethod
3390
+ def token_bytes_to_string(b):
3391
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
3392
+ byte_encoder = bytes_to_unicode()
3393
+ return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
3394
+
3395
+ @staticmethod
3396
+ def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
3397
+ parts = [bytes([b]) for b in token]
3398
+ while True:
3399
+ min_idx = None
3400
+ min_rank = None
3401
+ for i, pair in enumerate(zip(parts[:-1], parts[1:])):
3402
+ rank = mergeable_ranks.get(pair[0] + pair[1])
3403
+ if rank is not None and (min_rank is None or rank < min_rank):
3404
+ min_idx = i
3405
+ min_rank = rank
3406
+ if min_rank is None or (max_rank is not None and min_rank >= max_rank):
3407
+ break
3408
+ assert min_idx is not None
3409
+ parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
3410
+ return parts
3411
+
3412
+ def set_vocab(self):
3413
+ if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
3414
+ self.set_vocab_chatglm3()
3415
+ return
3416
+
3417
+ dir_model = self.dir_model
3418
+ hparams = self.hparams
3419
+ tokens: list[str] = []
3420
+ toktypes: list[int] = []
3421
+
3422
+ from transformers import AutoTokenizer
3423
+ tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
3424
+ vocab_size = hparams["padded_vocab_size"]
3425
+ assert max(tokenizer.get_vocab().values()) < vocab_size
3426
+
3427
+ tokpre = self.get_vocab_base_pre(tokenizer)
3428
+
3429
+ merges = []
3430
+ vocab = {}
3431
+ mergeable_ranks = tokenizer.mergeable_ranks
3432
+ for token, rank in mergeable_ranks.items():
3433
+ vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
3434
+ if len(token) == 1:
3435
+ continue
3436
+ merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
3437
+ assert len(merged) >= 2 and len(merged) <= 7
3438
+ merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
3439
+
3440
+ # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
3441
+ added_vocab = tokenizer.get_added_vocab()
3442
+ reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
3443
+
3444
+ for i in range(vocab_size):
3445
+ if i not in reverse_vocab:
3446
+ tokens.append(f"[PAD{i}]")
3447
+ toktypes.append(gguf.TokenType.UNUSED)
3448
+ elif reverse_vocab[i] in added_vocab:
3449
+ tokens.append(reverse_vocab[i])
3450
+ if tokenizer.added_tokens_decoder[i].special:
3451
+ toktypes.append(gguf.TokenType.CONTROL)
3452
+ else:
3453
+ toktypes.append(gguf.TokenType.USER_DEFINED)
3454
+ else:
3455
+ tokens.append(reverse_vocab[i])
3456
+ toktypes.append(gguf.TokenType.NORMAL)
3457
+
3458
+ self.gguf_writer.add_tokenizer_model("gpt2")
3459
+ self.gguf_writer.add_tokenizer_pre(tokpre)
3460
+ self.gguf_writer.add_token_list(tokens)
3461
+ self.gguf_writer.add_token_types(toktypes)
3462
+
3463
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
3464
+ special_vocab.merges = merges
3465
+ # only add special tokens when they were not already loaded from config.json
3466
+ special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
3467
+ special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
3468
+ # this one is usually not in config.json anyway
3469
+ special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
3470
+ special_vocab.add_to_gguf(self.gguf_writer)
3471
+
3472
+ def set_gguf_parameters(self):
3473
+ n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
3474
+ n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
3475
+ n_head_kv = self.hparams.get("multi_query_group_num", n_head)
3476
+ self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
3477
+ self.gguf_writer.add_embedding_length(n_embed)
3478
+ self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
3479
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
3480
+ self.gguf_writer.add_head_count(n_head)
3481
+ self.gguf_writer.add_head_count_kv(n_head_kv)
3482
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
3483
+ self.gguf_writer.add_file_type(self.ftype)
3484
+ self.gguf_writer.add_rope_dimension_count(64)
3485
+ self.gguf_writer.add_add_bos_token(False)
3486
+ rope_freq = 10000
3487
+ if "rope_ratio" in self.hparams:
3488
+ rope_freq = rope_freq * self.hparams["rope_ratio"]
3489
+ self.gguf_writer.add_rope_freq_base(rope_freq)
3490
+
3491
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3492
+ del bid # unused
3493
+
3494
+ if name.endswith(".rotary_pos_emb.inv_freq"):
3495
+ return []
3496
+
3497
+ name = name.removeprefix("transformer.")
3498
+ return [(self.map_tensor_name(name), data_torch)]
3499
+
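Note: `ChatGLMModel.bpe` above reconstructs, for each token in `mergeable_ranks`, the two pieces it was merged from, which is how the `merges` list is rebuilt for the GGUF vocab. A standalone sketch of the same procedure on a toy rank table (the table is made up purely for illustration):

```python
# Standalone sketch of the merge recovery done by ChatGLMModel.bpe above,
# using a toy mergeable_ranks table (made up purely for illustration).

def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}
# reconstruct the two pieces that were merged to form b"abc" (rank 4)
print(bpe(ranks, b"abc", max_rank=4))  # [b'ab', b'c']
```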
2702
3500
  ###### CONVERSION LOGIC ######
2703
3501
 
2704
3502
 
@@ -2715,19 +3513,46 @@ class LazyTorchTensor(gguf.LazyBase):
2715
3513
  torch.float32: np.float32,
2716
3514
  }
2717
3515
 
3516
+ # used for safetensors slices
3517
+ # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
3518
+ # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
3519
+ _dtype_str_map: dict[str, torch.dtype] = {
3520
+ "F64": torch.float64,
3521
+ "F32": torch.float32,
3522
+ "BF16": torch.bfloat16,
3523
+ "F16": torch.float16,
3524
+ # "U64": torch.uint64,
3525
+ "I64": torch.int64,
3526
+ # "U32": torch.uint32,
3527
+ "I32": torch.int32,
3528
+ # "U16": torch.uint16,
3529
+ "I16": torch.int16,
3530
+ "U8": torch.uint8,
3531
+ "I8": torch.int8,
3532
+ "BOOL": torch.bool,
3533
+ "F8_E4M3": torch.float8_e4m3fn,
3534
+ "F8_E5M2": torch.float8_e5m2,
3535
+ }
3536
+
2718
3537
  def numpy(self) -> gguf.LazyNumpyTensor:
2719
3538
  dtype = self._dtype_map[self.dtype]
2720
3539
  return gguf.LazyNumpyTensor(
2721
3540
  meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
2722
- lazy=self._lazy,
2723
3541
  args=(self,),
2724
- func=(lambda s: s[0].numpy())
3542
+ func=(lambda s: s.numpy())
2725
3543
  )
2726
3544
 
2727
3545
  @classmethod
2728
- def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
3546
+ def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
2729
3547
  return torch.empty(size=shape, dtype=dtype, device="meta")
2730
3548
 
3549
+ @classmethod
3550
+ def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
3551
+ dtype = cls._dtype_str_map[st_slice.get_dtype()]
3552
+ shape: tuple[int, ...] = tuple(st_slice.get_shape())
3553
+ lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
3554
+ return cast(torch.Tensor, lazy)
3555
+
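Note: `from_safetensors_slice` above wraps a lazy safetensors slice, deferring materialization until `s[:]` is evaluated. A hypothetical sketch of the slice API it relies on, via `safetensors.safe_open` (the file name and tensor name are made-up examples, and the actual call sites are not part of this hunk):

```python
# Hypothetical sketch of the safetensors slice API consumed above.
# "model-00001-of-00002.safetensors" and "model.embed_tokens.weight" are made-up names.
from safetensors import safe_open

with safe_open("model-00001-of-00002.safetensors", framework="pt") as f:
    st_slice = f.get_slice("model.embed_tokens.weight")
    print(st_slice.get_dtype())   # e.g. "BF16" -> looked up in _dtype_str_map
    print(st_slice.get_shape())   # e.g. [32000, 4096]
    tensor = st_slice[:]          # materialization step deferred by func=lambda s: s[:]
```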
2731
3556
  @classmethod
2732
3557
  def __torch_function__(cls, func, types, args=(), kwargs=None):
2733
3558
  del types # unused
@@ -2738,7 +3563,7 @@ class LazyTorchTensor(gguf.LazyBase):
2738
3563
  if func is torch.Tensor.numpy:
2739
3564
  return args[0].numpy()
2740
3565
 
2741
- return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
3566
+ return cls._wrap_fn(func)(*args, **kwargs)
2742
3567
 
2743
3568
 
2744
3569
  def parse_args() -> argparse.Namespace:
@@ -2748,10 +3573,6 @@ def parse_args() -> argparse.Namespace:
2748
3573
  "--vocab-only", action="store_true",
2749
3574
  help="extract only the vocab",
2750
3575
  )
2751
- parser.add_argument(
2752
- "--awq-path", type=Path, default=None,
2753
- help="Path to scale awq cache file",
2754
- )
2755
3576
  parser.add_argument(
2756
3577
  "--outfile", type=Path,
2757
3578
  help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -2784,30 +3605,58 @@ def parse_args() -> argparse.Namespace:
2784
3605
  "--verbose", action="store_true",
2785
3606
  help="increase output verbosity",
2786
3607
  )
3608
+ parser.add_argument(
3609
+ "--split-max-tensors", type=int, default=0,
3610
+ help="max tensors in each split",
3611
+ )
3612
+ parser.add_argument(
3613
+ "--split-max-size", type=str, default="0",
3614
+ help="max size per split N(M|G)",
3615
+ )
3616
+ parser.add_argument(
3617
+ "--dry-run", action="store_true",
3618
+ help="only print out a split plan and exit, without writing any new files",
3619
+ )
3620
+ parser.add_argument(
3621
+ "--no-tensor-first-split", action="store_true",
3622
+ help="do not add tensors to the first split (disabled by default)"
3623
+ )
3624
+ parser.add_argument(
3625
+ "--metadata", type=Path,
3626
+ help="Specify the path for an authorship metadata override file"
3627
+ )
2787
3628
 
2788
3629
  return parser.parse_args()
2789
3630
 
2790
3631
 
3632
+ def split_str_to_n_bytes(split_str: str) -> int:
3633
+ if split_str.endswith("K"):
3634
+ n = int(split_str[:-1]) * 1000
3635
+ elif split_str.endswith("M"):
3636
+ n = int(split_str[:-1]) * 1000 * 1000
3637
+ elif split_str.endswith("G"):
3638
+ n = int(split_str[:-1]) * 1000 * 1000 * 1000
3639
+ elif split_str.isnumeric():
3640
+ n = int(split_str)
3641
+ else:
3642
+ raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
3643
+
3644
+ if n < 0:
3645
+ raise ValueError(f"Invalid split size: {split_str}, must be positive")
3646
+
3647
+ return n
3648
+
3649
+
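Note: `--split-max-size` uses decimal units (K/M/G are powers of 1000, not KiB/MiB/GiB). A condensed mirror of `split_str_to_n_bytes` above, just to make the accepted grammar and a few example values explicit:

```python
# Condensed mirror of split_str_to_n_bytes above (decimal units, not binary).
def split_str_to_n_bytes(split_str: str) -> int:
    units = {"K": 1000, "M": 1000 ** 2, "G": 1000 ** 3}
    if split_str and split_str[-1] in units:
        n = int(split_str[:-1]) * units[split_str[-1]]
    elif split_str.isnumeric():
        n = int(split_str)
    else:
        raise ValueError(f"Invalid split size: {split_str}")
    if n < 0:
        raise ValueError(f"Invalid split size: {split_str}, must be positive")
    return n

assert split_str_to_n_bytes("0") == 0
assert split_str_to_n_bytes("250K") == 250_000
assert split_str_to_n_bytes("50M") == 50_000_000
assert split_str_to_n_bytes("5G") == 5_000_000_000
```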
2791
3650
  def main() -> None:
2792
3651
  args = parse_args()
2793
3652
 
2794
- logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
3653
+ if args.verbose:
3654
+ logging.basicConfig(level=logging.DEBUG)
3655
+ else:
3656
+ logging.basicConfig(level=logging.INFO)
2795
3657
 
2796
3658
  dir_model = args.model
2797
3659
 
2798
- if args.awq_path:
2799
- sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
2800
- from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
2801
- tmp_model_path = args.model / "weighted_model"
2802
- dir_model = tmp_model_path
2803
- if tmp_model_path.is_dir():
2804
- logger.info(f"{tmp_model_path} exists as a weighted model.")
2805
- else:
2806
- tmp_model_path.mkdir(parents=True, exist_ok=True)
2807
- logger.info("Saving new weighted model ...")
2808
- add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
2809
- logger.info(f"Saved weighted model at {tmp_model_path}.")
2810
-
2811
3660
  if not dir_model.is_dir():
2812
3661
  logger.error(f'Error: {args.model} is not a directory')
2813
3662
  sys.exit(1)
@@ -2820,36 +3669,47 @@ def main() -> None:
2820
3669
  "auto": gguf.LlamaFileType.GUESSED,
2821
3670
  }
2822
3671
 
3672
+ is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
3673
+ if args.use_temp_file and is_split:
3674
+ logger.error("Error: Cannot use temp file when splitting")
3675
+ sys.exit(1)
3676
+
2823
3677
  if args.outfile is not None:
2824
3678
  fname_out = args.outfile
2825
3679
  else:
2826
- # output in the same directory as the model by default
2827
- fname_out = dir_model / 'ggml-model-{ftype}.gguf'
3680
+ fname_out = dir_model
2828
3681
 
2829
3682
  logger.info(f"Loading model: {dir_model.name}")
2830
3683
 
2831
3684
  hparams = Model.load_hparams(dir_model)
2832
3685
 
2833
3686
  with torch.inference_mode():
2834
- model_class = Model.from_model_architecture(hparams["architectures"][0])
2835
- model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
3687
+ output_type = ftype_map[args.outtype]
3688
+ model_architecture = hparams["architectures"][0]
2836
3689
 
2837
- logger.info("Set model parameters")
2838
- model_instance.set_gguf_parameters()
2839
-
2840
- logger.info("Set model tokenizer")
2841
- model_instance.set_vocab()
3690
+ try:
3691
+ model_class = Model.from_model_architecture(model_architecture)
3692
+ except NotImplementedError:
3693
+ logger.error(f"Model {model_architecture} is not supported")
3694
+ sys.exit(1)
2842
3695
 
2843
- model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
3696
+ model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
3697
+ is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
3698
+ eager=args.no_lazy,
3699
+ metadata_override=args.metadata, model_name=args.model_name,
3700
+ split_max_tensors=args.split_max_tensors,
3701
+ split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
3702
+ small_first_shard=args.no_tensor_first_split)
2844
3703
 
2845
3704
  if args.vocab_only:
2846
- logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
3705
+ logger.info("Exporting model vocab...")
2847
3706
  model_instance.write_vocab()
3707
+ logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
2848
3708
  else:
2849
- logger.info(f"Exporting model to '{model_instance.fname_out}'")
3709
+ logger.info("Exporting model...")
2850
3710
  model_instance.write()
2851
-
2852
- logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
3711
+ out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
3712
+ logger.info(f"Model successfully exported to {out_path}")
2853
3713
 
2854
3714
 
2855
3715
  if __name__ == '__main__':