bigdl-core-cpp 2.5.0b20240620__py3-none-manylinux2010_x86_64.whl → 2.5.0b20240621__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. bigdl/cpp/convert-hf-to-gguf.py +0 -1340
  2. bigdl/cpp/libs/baby-llama +0 -0
  3. bigdl/cpp/libs/batched +0 -0
  4. bigdl/cpp/libs/batched-bench +0 -0
  5. bigdl/cpp/libs/beam-search +0 -0
  6. bigdl/cpp/libs/benchmark +0 -0
  7. bigdl/cpp/libs/convert-llama2c-to-ggml +0 -0
  8. bigdl/cpp/libs/embedding +0 -0
  9. bigdl/cpp/libs/export-lora +0 -0
  10. bigdl/cpp/libs/finetune +0 -0
  11. bigdl/cpp/libs/gguf +0 -0
  12. bigdl/cpp/libs/gritlm +0 -0
  13. bigdl/cpp/libs/imatrix +0 -0
  14. bigdl/cpp/libs/infill +0 -0
  15. bigdl/cpp/libs/llama-bench +0 -0
  16. bigdl/cpp/libs/llava-cli +0 -0
  17. bigdl/cpp/libs/lookahead +0 -0
  18. bigdl/cpp/libs/lookup +0 -0
  19. bigdl/cpp/libs/ls-sycl-device +0 -0
  20. bigdl/cpp/libs/main +0 -0
  21. bigdl/cpp/libs/ollama +0 -0
  22. bigdl/cpp/libs/parallel +0 -0
  23. bigdl/cpp/libs/passkey +0 -0
  24. bigdl/cpp/libs/perplexity +0 -0
  25. bigdl/cpp/libs/q8dot +0 -0
  26. bigdl/cpp/libs/quantize +0 -0
  27. bigdl/cpp/libs/quantize-stats +0 -0
  28. bigdl/cpp/libs/save-load-state +0 -0
  29. bigdl/cpp/libs/server +0 -0
  30. bigdl/cpp/libs/simple +0 -0
  31. bigdl/cpp/libs/speculative +0 -0
  32. bigdl/cpp/libs/tokenize +0 -0
  33. bigdl/cpp/libs/train-text-from-scratch +0 -0
  34. bigdl/cpp/libs/vdot +0 -0
  35. {bigdl_core_cpp-2.5.0b20240620.dist-info → bigdl_core_cpp-2.5.0b20240621.dist-info}/METADATA +1 -1
  36. bigdl_core_cpp-2.5.0b20240621.dist-info/RECORD +55 -0
  37. bigdl_core_cpp-2.5.0b20240620.dist-info/RECORD +0 -55
  38. {bigdl_core_cpp-2.5.0b20240620.data → bigdl_core_cpp-2.5.0b20240621.data}/scripts/init-llama-cpp +0 -0
  39. {bigdl_core_cpp-2.5.0b20240620.data → bigdl_core_cpp-2.5.0b20240621.data}/scripts/init-ollama +0 -0
  40. {bigdl_core_cpp-2.5.0b20240620.dist-info → bigdl_core_cpp-2.5.0b20240621.dist-info}/WHEEL +0 -0
  41. {bigdl_core_cpp-2.5.0b20240620.dist-info → bigdl_core_cpp-2.5.0b20240621.dist-info}/top_level.txt +0 -0
@@ -49,9 +49,6 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
49
49
  class Model:
50
50
  _model_classes: dict[str, type[Model]] = {}
51
51
 
52
- <<<<<<< HEAD
53
- def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
54
- =======
55
52
  dir_model: Path
56
53
  ftype: int
57
54
  is_big_endian: bool
@@ -73,26 +70,17 @@ class Model:
73
70
  def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
74
71
  if type(self) is Model:
75
72
  raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
76
- >>>>>>> uupstream/master
77
73
  self.dir_model = dir_model
78
74
  self.ftype = ftype
79
75
  self.is_big_endian = is_big_endian
80
76
  self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
81
77
  self.use_temp_file = use_temp_file
82
- <<<<<<< HEAD
83
- self.is_safetensors = self._is_model_safetensors()
84
- self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
85
- self.part_names = self._get_part_names()
86
- self.hparams = Model.load_hparams(self.dir_model)
87
- self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
88
- =======
89
78
  self.lazy = not eager
90
79
  self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
91
80
  self.is_safetensors = len(self.part_names) > 0
92
81
  if not self.is_safetensors:
93
82
  self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
94
83
  self.hparams = Model.load_hparams(self.dir_model)
95
- >>>>>>> uupstream/master
96
84
  self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
97
85
  self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
98
86
  self.tensor_names = None
@@ -238,8 +226,6 @@ class Model:
238
226
 
239
227
  self.gguf_writer.add_file_type(self.ftype)
240
228
  logger.info(f"gguf: file type = {self.ftype}")
241
- <<<<<<< HEAD
242
- =======
243
229
 
244
230
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
245
231
  del bid # unused
@@ -255,7 +241,6 @@ class Model:
255
241
  del name, new_name, bid, n_dims # unused
256
242
 
257
243
  return False
258
- >>>>>>> uupstream/master
259
244
 
260
245
  def write_tensors(self):
261
246
  max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
@@ -278,18 +263,11 @@ class Model:
278
263
  bid = int(part)
279
264
  break
280
265
 
281
- <<<<<<< HEAD
282
- # map tensor names
283
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
284
- if new_name is None:
285
- raise ValueError(f"Can not map tensor {name!r}")
286
- =======
287
266
  for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
288
267
  data: np.ndarray = data # type hint
289
268
  n_dims = len(data.shape)
290
269
  data_dtype = data.dtype
291
270
  data_qtype: gguf.GGMLQuantizationType | None = None
292
- >>>>>>> uupstream/master
293
271
 
294
272
  # when both are True, f32 should win
295
273
  extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
@@ -303,18 +281,12 @@ class Model:
303
281
  new_name.endswith("_norm.weight"),
304
282
  ))
305
283
 
306
- <<<<<<< HEAD
307
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
308
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
309
- data = data.astype(np.float32)
310
- =======
311
284
  # Some tensor types are always in float32
312
285
  extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
313
286
  gguf.MODEL_TENSOR.FFN_GATE_INP,
314
287
  gguf.MODEL_TENSOR.POS_EMBD,
315
288
  gguf.MODEL_TENSOR.TOKEN_TYPES,
316
289
  ))
317
- >>>>>>> uupstream/master
318
290
 
319
291
  # if f16 desired, convert any float32 2-dim weight tensors to float16
320
292
  extra_f16 = any(cond for cond in (
@@ -322,15 +294,11 @@ class Model:
322
294
  (name.endswith(".weight") and n_dims >= 2),
323
295
  ))
324
296
 
325
- <<<<<<< HEAD
326
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
327
- =======
328
297
  if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
329
298
  if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
330
299
  data = gguf.quantize_bf16(data)
331
300
  assert data.dtype == np.int16
332
301
  data_qtype = gguf.GGMLQuantizationType.BF16
333
- >>>>>>> uupstream/master
334
302
 
335
303
  elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
336
304
  data = gguf.quantize_q8_0(data)
@@ -402,22 +370,6 @@ class Model:
402
370
  except KeyError:
403
371
  raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
404
372
 
405
- <<<<<<< HEAD
406
- def _is_model_safetensors(self) -> bool:
407
- return Model.count_model_parts(self.dir_model, ".safetensors") > 0
408
-
409
- def _get_part_names(self):
410
- if self.is_safetensors:
411
- if self.num_parts == 1: # there's only one .safetensors file
412
- return ("model.safetensors",)
413
- return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
414
-
415
- if self.num_parts == 1: # there's only one .bin file
416
- return ("pytorch_model.bin",)
417
- return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
418
-
419
- =======
420
- >>>>>>> uupstream/master
421
373
  # used for GPT-2 BPE and WordPiece vocabs
422
374
  def get_vocab_base(self) -> tuple[list[str], list[int], str]:
423
375
  tokens: list[str] = []
@@ -452,10 +404,7 @@ class Model:
452
404
  # NOTE: this function is generated by convert-hf-to-gguf-update.py
453
405
  # do not modify it manually!
454
406
  # ref: https://github.com/ggerganov/llama.cpp/pull/6920
455
- <<<<<<< HEAD
456
- =======
457
407
  # Marker: Start get_vocab_base_pre
458
- >>>>>>> uupstream/master
459
408
  def get_vocab_base_pre(self, tokenizer) -> str:
460
409
  # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
461
410
  # is specific for the BPE pre-tokenizer used by the model
@@ -499,23 +448,15 @@ class Model:
499
448
  if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
500
449
  # ref: https://huggingface.co/openai-community/gpt2
501
450
  res = "gpt-2"
502
- <<<<<<< HEAD
503
- =======
504
451
  if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
505
452
  # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
506
453
  res = "stablelm2"
507
- >>>>>>> uupstream/master
508
454
  if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
509
455
  # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
510
456
  res = "refact"
511
457
  if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
512
458
  # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
513
459
  res = "command-r"
514
- <<<<<<< HEAD
515
- if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
516
- # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
517
- res = "olmo"
518
- =======
519
460
  if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
520
461
  # ref: https://huggingface.co/Qwen/Qwen1.5-7B
521
462
  res = "qwen2"
@@ -537,7 +478,6 @@ class Model:
537
478
  if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
538
479
  # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
539
480
  res = "smaug-bpe"
540
- >>>>>>> uupstream/master
541
481
 
542
482
  if res is None:
543
483
  logger.warning("\n")
@@ -558,10 +498,7 @@ class Model:
558
498
  logger.debug(f"chkhsh: {chkhsh}")
559
499
 
560
500
  return res
561
- <<<<<<< HEAD
562
- =======
563
501
  # Marker: End get_vocab_base_pre
564
- >>>>>>> uupstream/master
565
502
 
566
503
  def _set_vocab_gpt2(self) -> None:
567
504
  tokens, toktypes, tokpre = self.get_vocab_base()
@@ -677,28 +614,17 @@ class Model:
677
614
  logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
678
615
  continue
679
616
 
680
- <<<<<<< HEAD
681
- =======
682
617
  tokens[token_id] = key.encode("utf-8")
683
618
  scores[token_id] = -1000.0
684
619
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
685
620
 
686
- >>>>>>> uupstream/master
687
621
  if vocab_size > len(tokens):
688
622
  pad_count = vocab_size - len(tokens)
689
623
  logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
690
624
  for i in range(1, pad_count + 1):
691
- <<<<<<< HEAD
692
- tokens.append(f"[PAD{i}]")
693
- scores.append(-1000.0)
694
- toktypes.append(SentencePieceTokenTypes.UNUSED)
695
-
696
- assert len(tokens) == vocab_size
697
- =======
698
625
  tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
699
626
  scores.append(-1000.0)
700
627
  toktypes.append(SentencePieceTokenTypes.UNUSED)
701
- >>>>>>> uupstream/master
702
628
 
703
629
  self.gguf_writer.add_tokenizer_model("llama")
704
630
  self.gguf_writer.add_tokenizer_pre("default")
@@ -848,67 +774,11 @@ class BloomModel(Model):
848
774
  if name == "word_embeddings.weight":
849
775
  assert self.tensor_names is not None
850
776
 
851
- <<<<<<< HEAD
852
- if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
853
- # Map bloom-style qkv_linear to gpt-style qkv_linear
854
- # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
855
- # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
856
- qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
857
- data = np.concatenate(
858
- (
859
- qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
860
- qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
861
- qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
862
- ),
863
- axis=0,
864
- )
865
- logger.info("re-format attention.linear_qkv.weight")
866
- elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
867
- qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
868
- data = np.concatenate(
869
- (
870
- qkv_bias[:, 0, :].reshape((n_embed,)),
871
- qkv_bias[:, 1, :].reshape((n_embed,)),
872
- qkv_bias[:, 2, :].reshape((n_embed,)),
873
- ),
874
- axis=0,
875
- )
876
- logger.info("re-format attention.linear_qkv.bias")
877
-
878
- # map tensor names
879
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
880
- if new_name is None:
881
- raise ValueError(f"Can not map tensor {name!r}")
882
-
883
- n_dims = len(data.shape)
884
- data_dtype = data.dtype
885
-
886
- # if f32 desired, convert any float16 to float32
887
- if self.ftype == 0 and data_dtype == np.float16:
888
- data = data.astype(np.float32)
889
-
890
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
891
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
892
- data = data.astype(np.float32)
893
-
894
- # if f16 desired, convert any float32 2-dim weight tensors to float16
895
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
896
- data = data.astype(np.float16)
897
-
898
- logger.info(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
899
-
900
- self.gguf_writer.add_tensor(new_name, data)
901
-
902
- if not has_lm_head and name == "word_embeddings.weight":
903
- self.gguf_writer.add_tensor("output.weight", data)
904
- logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
905
- =======
906
777
  # TODO: tie them at runtime, don't duplicate in the model file
907
778
  if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
908
779
  tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
909
780
 
910
781
  return tensors
911
- >>>>>>> uupstream/master
912
782
 
913
783
 
914
784
  @Model.register("MPTForCausalLM")
@@ -953,44 +823,7 @@ class MPTModel(Model):
953
823
  else:
954
824
  new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias"))
955
825
 
956
- <<<<<<< HEAD
957
- # convert any unsupported data types to float32
958
- if data_torch.dtype not in (torch.float16, torch.float32):
959
- data_torch = data_torch.to(torch.float32)
960
-
961
- data = data_torch.squeeze().numpy()
962
-
963
- # map tensor names
964
- if "scales" in name:
965
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
966
- if new_name is not None:
967
- new_name = new_name.replace("scales", "act.scales")
968
- else:
969
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
970
- if new_name is None:
971
- raise ValueError(f"Can not map tensor {name!r}")
972
-
973
- n_dims = len(data.shape)
974
- data_dtype = data.dtype
975
-
976
- # if f32 desired, convert any float16 to float32
977
- if self.ftype == 0 and data_dtype == np.float16:
978
- data = data.astype(np.float32)
979
-
980
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
981
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
982
- data = data.astype(np.float32)
983
-
984
- # if f16 desired, convert any float32 2-dim weight tensors to float16
985
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
986
- data = data.astype(np.float16)
987
-
988
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
989
-
990
- self.gguf_writer.add_tensor(new_name, data)
991
- =======
992
826
  return [(new_name, data_torch)]
993
- >>>>>>> uupstream/master
994
827
 
995
828
 
996
829
  @Model.register("OrionForCausalLM")
@@ -1030,51 +863,6 @@ class OrionModel(Model):
1030
863
  # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
1031
864
  self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
1032
865
 
1033
- <<<<<<< HEAD
1034
- def write_tensors(self):
1035
- # Collect tensors from generator object
1036
- model_kv = dict(self.get_tensors())
1037
- block_count = self.hparams["num_hidden_layers"]
1038
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1039
-
1040
- for name, data_torch in model_kv.items():
1041
- # we don't need these
1042
- if name.endswith(".rotary_emb.inv_freq"):
1043
- continue
1044
-
1045
- old_dtype = data_torch.dtype
1046
-
1047
- # convert any unsupported data types to float32
1048
- if data_torch.dtype not in (torch.float16, torch.float32):
1049
- data_torch = data_torch.to(torch.float32)
1050
-
1051
- data = data_torch.squeeze().numpy()
1052
-
1053
- # map tensor names
1054
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1055
- if new_name is None:
1056
- raise ValueError(f"Can not map tensor {name!r}")
1057
-
1058
- n_dims = len(data.shape)
1059
- data_dtype = data.dtype
1060
-
1061
- # if f32 desired, convert any float16 to float32
1062
- if self.ftype == 0 and data_dtype == np.float16:
1063
- data = data.astype(np.float32)
1064
-
1065
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
1066
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1067
- data = data.astype(np.float32)
1068
-
1069
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1070
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1071
- data = data.astype(np.float16)
1072
-
1073
- logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1074
- self.gguf_writer.add_tensor(new_name, data)
1075
-
1076
- =======
1077
- >>>>>>> uupstream/master
1078
866
 
1079
867
  @Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
1080
868
  class BaichuanModel(Model):
@@ -1121,20 +909,7 @@ class BaichuanModel(Model):
1121
909
  head_count = self.hparams["num_attention_heads"]
1122
910
  head_count_kv = self.hparams.get("num_key_value_heads", head_count)
1123
911
 
1124
- <<<<<<< HEAD
1125
- for i in range(block_count):
1126
- if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
1127
- logger.info(f"Unpacking and permuting layer {i}")
1128
- model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
1129
- self._reverse_hf_permute_part(w, 0, head_count, head_count)
1130
- model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
1131
- self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
1132
- model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
1133
- self._reverse_hf_part(w, 2)
1134
- del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
1135
- =======
1136
912
  tensors: list[tuple[str, Tensor]] = []
1137
- >>>>>>> uupstream/master
1138
913
 
1139
914
  if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight":
1140
915
  logger.info(f"Unpacking and permuting layer {bid}")
@@ -1149,40 +924,7 @@ class BaichuanModel(Model):
1149
924
  else:
1150
925
  tensors = [(self.map_tensor_name(name), data_torch)]
1151
926
 
1152
- <<<<<<< HEAD
1153
- old_dtype = data_torch.dtype
1154
-
1155
- # convert any unsupported data types to float32
1156
- if data_torch.dtype not in (torch.float16, torch.float32):
1157
- data_torch = data_torch.to(torch.float32)
1158
-
1159
- data = data_torch.squeeze().numpy()
1160
-
1161
- # map tensor names
1162
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1163
- if new_name is None:
1164
- raise ValueError(f"Can not map tensor {name!r}")
1165
-
1166
- n_dims = len(data.shape)
1167
- data_dtype = data.dtype
1168
-
1169
- # if f32 desired, convert any float16 to float32
1170
- if self.ftype == 0 and data_dtype == np.float16:
1171
- data = data.astype(np.float32)
1172
-
1173
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
1174
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1175
- data = data.astype(np.float32)
1176
-
1177
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1178
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1179
- data = data.astype(np.float16)
1180
-
1181
- logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1182
- self.gguf_writer.add_tensor(new_name, data)
1183
- =======
1184
927
  return tensors
1185
- >>>>>>> uupstream/master
1186
928
 
1187
929
  def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
1188
930
  if n_kv_head is not None and n_head != n_kv_head:
@@ -1298,46 +1040,7 @@ class XverseModel(Model):
1298
1040
  if name.endswith("k_proj.weight"):
1299
1041
  data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
1300
1042
 
1301
- <<<<<<< HEAD
1302
- old_dtype = data_torch.dtype
1303
-
1304
- # convert any unsupported data types to float32
1305
- if data_torch.dtype not in (torch.float16, torch.float32):
1306
- data_torch = data_torch.to(torch.float32)
1307
-
1308
- # HF models permute some of the tensors, so we need to undo that
1309
- if name.endswith(("q_proj.weight")):
1310
- data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
1311
- if name.endswith(("k_proj.weight")):
1312
- data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
1313
-
1314
- data = data_torch.squeeze().numpy()
1315
-
1316
- # map tensor names
1317
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1318
- if new_name is None:
1319
- raise ValueError(f"Can not map tensor {name!r}")
1320
-
1321
- n_dims = len(data.shape)
1322
- data_dtype = data.dtype
1323
-
1324
- # if f32 desired, convert any float16 to float32
1325
- if self.ftype == 0 and data_dtype == np.float16:
1326
- data = data.astype(np.float32)
1327
-
1328
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
1329
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1330
- data = data.astype(np.float32)
1331
-
1332
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1333
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1334
- data = data.astype(np.float16)
1335
-
1336
- logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1337
- self.gguf_writer.add_tensor(new_name, data)
1338
- =======
1339
1043
  return [(self.map_tensor_name(name), data_torch)]
1340
- >>>>>>> uupstream/master
1341
1044
 
1342
1045
  def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
1343
1046
  if n_kv_head is not None and n_head != n_kv_head:
@@ -1402,59 +1105,7 @@ class FalconModel(Model):
1402
1105
  v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
1403
1106
  data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
1404
1107
 
1405
- <<<<<<< HEAD
1406
- for name, data_torch in self.get_tensors():
1407
- old_dtype = data_torch.dtype
1408
-
1409
- # convert any unsupported data types to float32
1410
- if data_torch.dtype not in (torch.float16, torch.float32):
1411
- data_torch = data_torch.to(torch.float32)
1412
-
1413
- # QKV tensor transform
1414
- # The original query_key_value tensor contains n_head_kv "kv groups",
1415
- # each consisting of n_head/n_head_kv query weights followed by one key
1416
- # and one value weight (shared by all query heads in the kv group).
1417
- # This layout makes it a big pain to work with in GGML.
1418
- # So we rearrange them here,, so that we have n_head query weights
1419
- # followed by n_head_kv key weights followed by n_head_kv value weights,
1420
- # in contiguous fashion.
1421
- # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
1422
-
1423
- if "query_key_value" in name:
1424
- qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
1425
- q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
1426
- k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
1427
- v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
1428
- data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
1429
-
1430
- data = data_torch.squeeze().numpy()
1431
-
1432
- # map tensor names
1433
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1434
- if new_name is None:
1435
- raise ValueError(f"Can not map tensor {name!r}")
1436
-
1437
- n_dims = len(data.shape)
1438
- data_dtype = data.dtype
1439
-
1440
- # if f32 desired, convert any float16 to float32
1441
- if self.ftype == 0 and data_dtype == np.float16:
1442
- data = data.astype(np.float32)
1443
-
1444
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
1445
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1446
- data = data.astype(np.float32)
1447
-
1448
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1449
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1450
- data = data.astype(np.float16)
1451
-
1452
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1453
-
1454
- self.gguf_writer.add_tensor(new_name, data)
1455
- =======
1456
1108
  return [(self.map_tensor_name(name), data_torch)]
1457
- >>>>>>> uupstream/master
1458
1109
 
1459
1110
 
1460
1111
  @Model.register("GPTBigCodeForCausalLM")
@@ -1537,89 +1188,7 @@ class RefactModel(Model):
1537
1188
  if len(tensors) == 0:
1538
1189
  tensors.append((self.map_tensor_name(name), data_torch))
1539
1190
 
1540
- <<<<<<< HEAD
1541
- # convert any unsupported data types to float32
1542
- if data_torch.dtype not in (torch.float16, torch.float32):
1543
- data_torch = data_torch.to(torch.float32)
1544
-
1545
- data = data_torch.squeeze().numpy()
1546
-
1547
- # map tensor names
1548
- new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
1549
- if new_name is None:
1550
- raise ValueError(f"Can not map tensor {name!r}")
1551
-
1552
- n_dims = len(data.shape)
1553
- data_dtype = data.dtype
1554
-
1555
- # if f32 desired, convert any float16 to float32
1556
- if self.ftype == 0 and data_dtype == np.float16:
1557
- data = data.astype(np.float32)
1558
-
1559
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
1560
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1561
- data = data.astype(np.float32)
1562
-
1563
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1564
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1565
- data = data.astype(np.float16)
1566
-
1567
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1568
-
1569
- self.gguf_writer.add_tensor(new_name, data)
1570
-
1571
-
1572
- @Model.register("PersimmonForCausalLM")
1573
- class PersimmonModel(Model):
1574
- model_arch = gguf.MODEL_ARCH.PERSIMMON
1575
-
1576
- def set_gguf_parameters(self):
1577
- block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
1578
- head_count = self.hparams["num_attention_heads"]
1579
- head_count_kv = head_count
1580
- hidden_size = self.hparams["hidden_size"]
1581
-
1582
- self.gguf_writer.add_name('persimmon-8b-chat')
1583
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1584
- self.gguf_writer.add_embedding_length(hidden_size)
1585
- self.gguf_writer.add_block_count(block_count)
1586
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
1587
-
1588
- # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
1589
- # than the head size?
1590
- # ref: https://github.com/ggerganov/llama.cpp/pull/4889
1591
- # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
1592
- self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
1593
-
1594
- self.gguf_writer.add_head_count(head_count)
1595
- self.gguf_writer.add_head_count_kv(head_count_kv)
1596
- self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
1597
- self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
1598
-
1599
- def set_vocab(self):
1600
- self._set_vocab_sentencepiece()
1601
- # self.gguf_writer.add_bos_token_id(71013)
1602
- # self.gguf_writer.add_eos_token_id(71013)
1603
-
1604
- def write_tensors(self):
1605
- block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
1606
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1607
-
1608
- for name, data_torch in self.get_tensors():
1609
- if name.endswith(".self_attention.rotary_emb.inv_freq"):
1610
- continue
1611
- old_dtype = data_torch.dtype
1612
- # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
1613
- data = data_torch.to(torch.float32).squeeze().numpy()
1614
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1615
- if new_name is None:
1616
- raise ValueError(f"Can not map tensor {name!r}")
1617
- n_dims = len(data.shape)
1618
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1619
- self.gguf_writer.add_tensor(new_name, data)
1620
- =======
1621
1191
  return tensors
1622
- >>>>>>> uupstream/master
1623
1192
 
1624
1193
 
1625
1194
  @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
@@ -1829,20 +1398,6 @@ class LlamaModel(Model):
1829
1398
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
1830
1399
  self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
1831
1400
 
1832
- <<<<<<< HEAD
1833
- # Same as super class, but permuting q_proj, k_proj
1834
- def write_tensors(self):
1835
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
1836
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
1837
- n_head = self.hparams.get("num_attention_heads")
1838
- n_kv_head = self.hparams.get("num_key_value_heads")
1839
- n_experts = self.hparams.get("num_local_experts")
1840
- experts = dict()
1841
- for name, data_torch in self.get_tensors():
1842
- # we don't need these
1843
- if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
1844
- continue
1845
- =======
1846
1401
  @staticmethod
1847
1402
  def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
1848
1403
  if n_head_kv is not None and n_head != n_head_kv:
@@ -1856,7 +1411,6 @@ class LlamaModel(Model):
1856
1411
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
1857
1412
  n_head = self.hparams["num_attention_heads"]
1858
1413
  n_kv_head = self.hparams.get("num_key_value_heads")
1859
- >>>>>>> uupstream/master
1860
1414
 
1861
1415
  if name.endswith("q_proj.weight"):
1862
1416
  data_torch = LlamaModel.permute(data_torch, n_head, n_head)
@@ -1892,58 +1446,21 @@ class LlamaModel(Model):
1892
1446
 
1893
1447
  new_name = self.map_tensor_name(merged_name)
1894
1448
 
1895
- <<<<<<< HEAD
1896
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
1897
- if new_name is None:
1898
- raise ValueError(f"Can not map tensor {name!r}")
1899
-
1900
- logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
1901
- =======
1902
1449
  tensors.append((new_name, data_torch))
1903
1450
  return tensors
1904
1451
  else:
1905
1452
  return []
1906
1453
 
1907
1454
  return [(self.map_tensor_name(name), data_torch)]
1908
- >>>>>>> uupstream/master
1909
1455
 
1910
1456
  def write_tensors(self):
1911
1457
  super().write_tensors()
1912
1458
 
1913
- <<<<<<< HEAD
1914
- # map tensor names
1915
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
1916
- if new_name is None:
1917
- raise ValueError(f"Can not map tensor {name!r}")
1918
-
1919
- n_dims = len(data.shape)
1920
- data_dtype = data.dtype
1921
-
1922
- # if f32 desired, convert any float16 to float32
1923
- if self.ftype == 0 and data_dtype == np.float16:
1924
- data = data.astype(np.float32)
1925
-
1926
- # 1d tensors need to be converted to float32
1927
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
1928
- data = data.astype(np.float32)
1929
-
1930
- # if f16 desired, convert any float32 2-dim weight tensors to float16
1931
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
1932
- data = data.astype(np.float16)
1933
-
1934
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
1935
-
1936
- self.gguf_writer.add_tensor(new_name, data)
1937
-
1938
- if len(experts) > 0:
1939
- raise ValueError(f"Unprocessed experts: {experts.keys()}")
1940
- =======
1941
1459
  if self._experts is not None:
1942
1460
  # flatten `list[dict[str, Tensor]]` into `list[str]`
1943
1461
  experts = [k for d in self._experts for k in d.keys()]
1944
1462
  if len(experts) > 0:
1945
1463
  raise ValueError(f"Unprocessed experts: {experts}")
1946
- >>>>>>> uupstream/master
1947
1464
 
1948
1465
 
1949
1466
  @Model.register("GrokForCausalLM")
@@ -1990,30 +1507,15 @@ class GrokModel(Model):
1990
1507
 
1991
1508
  merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
1992
1509
 
1993
- <<<<<<< HEAD
1994
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
1995
- if new_name is None:
1996
- raise ValueError(f"Can not map tensor {name!r}")
1997
-
1998
- logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
1999
- =======
2000
1510
  new_name = self.map_tensor_name(merged_name)
2001
1511
 
2002
1512
  tensors.append((new_name, data_torch))
2003
1513
  return tensors
2004
1514
  else:
2005
1515
  return []
2006
- >>>>>>> uupstream/master
2007
1516
 
2008
1517
  return [(self.map_tensor_name(name), data_torch)]
2009
1518
 
2010
- <<<<<<< HEAD
2011
- # map tensor names
2012
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2013
- if new_name is None:
2014
- raise ValueError(f"Can not map tensor {name!r}")
2015
- =======
2016
- >>>>>>> uupstream/master
2017
1519
 
2018
1520
  @Model.register("DbrxForCausalLM")
2019
1521
  class DbrxModel(Model):
@@ -2032,201 +1534,7 @@ class DbrxModel(Model):
2032
1534
  self.gguf_writer.add_head_count(self.hparams["n_heads"])
2033
1535
  self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
2034
1536
 
2035
- <<<<<<< HEAD
2036
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2037
-
2038
- self.gguf_writer.add_tensor(new_name, data)
2039
-
2040
-
2041
- @Model.register("DbrxForCausalLM")
2042
- class DbrxModel(Model):
2043
- model_arch = gguf.MODEL_ARCH.DBRX
2044
-
2045
- def set_gguf_parameters(self):
2046
- ffn_config = self.hparams["ffn_config"]
2047
- attn_config = self.hparams["attn_config"]
2048
- self.gguf_writer.add_name(self.hparams["model_type"])
2049
- self.gguf_writer.add_block_count(self.hparams["n_layers"])
2050
-
2051
- self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
2052
- self.gguf_writer.add_embedding_length(self.hparams["d_model"])
2053
- self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
2054
-
2055
- self.gguf_writer.add_head_count(self.hparams["n_heads"])
2056
- self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
2057
-
2058
- self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
2059
-
2060
- self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
2061
- self.gguf_writer.add_file_type(self.ftype)
2062
-
2063
- self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
2064
- self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
2065
-
2066
- self.gguf_writer.add_layer_norm_eps(1e-5)
2067
-
2068
- self.gguf_writer.add_file_type(self.ftype)
2069
- logger.info(f"gguf: file type = {self.ftype}")
2070
-
2071
- def write_tensors(self):
2072
- block_count = self.hparams.get("n_layers")
2073
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2074
- for name, data_torch in self.get_tensors():
2075
- n_expert = self.hparams["ffn_config"]["moe_num_experts"]
2076
- n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
2077
- n_embd = self.hparams["d_model"]
2078
-
2079
- # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
2080
- # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
2081
- # But llama.cpp moe graph works differently
2082
- # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
2083
- # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
2084
- exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
2085
- "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
2086
- "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
2087
- experts = False
2088
- for exp_tensor_name in exp_tensor_names.keys():
2089
- if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
2090
- experts = True
2091
- data_torch = data_torch.view(n_expert, n_ff, n_embd)
2092
- if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
2093
- data_torch = data_torch.permute(*permute_tensor)
2094
- break
2095
-
2096
- old_dtype = data_torch.dtype
2097
-
2098
- # convert any unsupported data types to float32
2099
- if data_torch.dtype not in (torch.float16, torch.float32):
2100
- data_torch = data_torch.to(torch.float32)
2101
-
2102
- data = data_torch.squeeze().numpy()
2103
-
2104
- # map tensor names
2105
- # In MoE models the ffn tensors are typically most of the model weights,
2106
- # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
2107
- # Every other model has the weight names ending in .weight,
2108
- # let's assume that is the convention which is not the case for dbrx:
2109
- # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
2110
- new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
2111
- if new_name is None:
2112
- raise ValueError(f"Can not map tensor {name!r}")
2113
-
2114
- n_dims = len(data.shape)
2115
- data_dtype = data.dtype
2116
-
2117
- # Most of the codebase that takes in 1D tensors only handles F32 tensors
2118
- # and most of the outputs tensors are F32.
2119
- if data_dtype != np.float32 and n_dims == 1:
2120
- raise ValueError(f"Can not map tensor {name!r}: all 1D tensors must be F32")
2121
-
2122
- # if f32 desired, convert any float16 to float32
2123
- if self.ftype == 0 and data_dtype == np.float16:
2124
- data = data.astype(np.float32)
2125
-
2126
- # if f16 desired, convert any float32 2-dim weight tensors to float16
2127
- if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
2128
- data = data.astype(np.float16)
2129
-
2130
- logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
2131
-
2132
- self.gguf_writer.add_tensor(new_name, data)
2133
-
2134
-
2135
- @Model.register("DbrxForCausalLM")
2136
- class DbrxModel(Model):
2137
- model_arch = gguf.MODEL_ARCH.DBRX
2138
-
2139
- def set_gguf_parameters(self):
2140
- ffn_config = self.hparams["ffn_config"]
2141
- attn_config = self.hparams["attn_config"]
2142
- self.gguf_writer.add_name(self.hparams["model_type"])
2143
- self.gguf_writer.add_block_count(self.hparams["n_layers"])
2144
-
2145
- self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
2146
- self.gguf_writer.add_embedding_length(self.hparams["d_model"])
2147
- self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
2148
-
2149
- self.gguf_writer.add_head_count(self.hparams["n_heads"])
2150
- self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
2151
-
2152
- self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
2153
-
2154
- self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
2155
- self.gguf_writer.add_file_type(self.ftype)
2156
-
2157
- self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
2158
- self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
2159
-
2160
- self.gguf_writer.add_layer_norm_eps(1e-5)
2161
-
2162
- self.gguf_writer.add_file_type(self.ftype)
2163
- print(f"gguf: file type = {self.ftype}")
2164
-
2165
- def write_tensors(self):
2166
- block_count = self.hparams.get("n_layers")
2167
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2168
- for name, data_torch in self.get_tensors():
2169
- n_expert = self.hparams["ffn_config"]["moe_num_experts"]
2170
- n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
2171
- n_embd = self.hparams["d_model"]
2172
-
2173
- # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
2174
- # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
2175
- # But llama.cpp moe graph works differently
2176
- # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
2177
- # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
2178
- exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
2179
- "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
2180
- "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
2181
- experts = False
2182
- for exp_tensor_name in exp_tensor_names.keys():
2183
- if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
2184
- experts = True
2185
- data_torch = data_torch.view(n_expert, n_ff, n_embd)
2186
- if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
2187
- data_torch = data_torch.permute(*permute_tensor)
2188
- break
2189
-
2190
- old_dtype = data_torch.dtype
2191
-
2192
- # convert any unsupported data types to float32
2193
- if data_torch.dtype not in (torch.float16, torch.float32):
2194
- data_torch = data_torch.to(torch.float32)
2195
-
2196
- data = data_torch.squeeze().numpy()
2197
-
2198
- # map tensor names
2199
- # In MoE models the ffn tensors are typically most of the model weights,
2200
- # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
2201
- # Every other model has the weight names ending in .weight,
2202
- # let's assume that is the convention which is not the case for dbrx:
2203
- # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
2204
- new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
2205
- if new_name is None:
2206
- print(f"Can not map tensor {name!r}")
2207
- sys.exit()
2208
-
2209
- n_dims = len(data.shape)
2210
- data_dtype = data.dtype
2211
-
2212
- # Most of the codebase that takes in 1D tensors only handles F32 tensors
2213
- # and most of the outputs tensors are F32.
2214
- if data_dtype != np.float32 and n_dims == 1:
2215
- print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
2216
- sys.exit()
2217
-
2218
- # if f32 desired, convert any float16 to float32
2219
- if self.ftype == 0 and data_dtype == np.float16:
2220
- data = data.astype(np.float32)
2221
-
2222
- # if f16 desired, convert any float32 2-dim weight tensors to float16
2223
- if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
2224
- data = data.astype(np.float16)
2225
-
2226
- print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
2227
- =======
2228
1537
  self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
2229
- >>>>>>> uupstream/master
2230
1538
 
2231
1539
  self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
2232
1540
  self.gguf_writer.add_file_type(self.ftype)
@@ -2322,45 +1630,7 @@ class MiniCPMModel(Model):
2322
1630
  if name.endswith(("k_proj.weight")):
2323
1631
  data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
2324
1632
 
2325
- <<<<<<< HEAD
2326
- # convert any unsupported data types to float32
2327
- if data_torch.dtype not in (torch.float16, torch.float32):
2328
- data_torch = data_torch.to(torch.float32)
2329
-
2330
- # HF models permute some of the tensors, so we need to undo that
2331
- if name.endswith(("q_proj.weight")):
2332
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
2333
- if name.endswith(("k_proj.weight")):
2334
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
2335
-
2336
- data = data_torch.squeeze().numpy()
2337
-
2338
- # map tensor names
2339
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2340
- if new_name is None:
2341
- raise ValueError(f"Can not map tensor {name!r}")
2342
-
2343
- n_dims = len(data.shape)
2344
- data_dtype = data.dtype
2345
-
2346
- # if f32 desired, convert any float16 to float32
2347
- if self.ftype == 0 and data_dtype == np.float16:
2348
- data = data.astype(np.float32)
2349
-
2350
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
2351
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
2352
- data = data.astype(np.float32)
2353
-
2354
- # if f16 desired, convert any float32 2-dim weight tensors to float16
2355
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2356
- data = data.astype(np.float16)
2357
-
2358
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2359
-
2360
- self.gguf_writer.add_tensor(new_name, data)
2361
- =======
2362
1633
  return [(self.map_tensor_name(name), data_torch)]
2363
- >>>>>>> uupstream/master
2364
1634
 
2365
1635
 
2366
1636
  @Model.register("QWenLMHeadModel")
@@ -2403,50 +1673,7 @@ class QwenModel(Model):
2403
1673
  self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
2404
1674
  self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
2405
1675
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
2406
- <<<<<<< HEAD
2407
-
2408
- def write_tensors(self):
2409
- block_count = self.hparams["num_hidden_layers"]
2410
- model_kv = dict(self.get_tensors())
2411
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2412
- for name, data_torch in model_kv.items():
2413
- # we don't need these
2414
- if name.endswith(".rotary_emb.inv_freq"):
2415
- continue
2416
-
2417
- old_dtype = data_torch.dtype
2418
-
2419
- # convert any unsupported data types to float32
2420
- if data_torch.dtype not in (torch.float16, torch.float32):
2421
- data_torch = data_torch.to(torch.float32)
2422
-
2423
- data = data_torch.squeeze().numpy()
2424
-
2425
- # map tensor names
2426
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2427
- if new_name is None:
2428
- raise ValueError(f"Can not map tensor {name!r}")
2429
-
2430
- n_dims = len(data.shape)
2431
- data_dtype = data.dtype
2432
-
2433
- # if f32 desired, convert any float16 to float32
2434
- if self.ftype == 0 and data_dtype == np.float16:
2435
- data = data.astype(np.float32)
2436
-
2437
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
2438
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
2439
- data = data.astype(np.float32)
2440
-
2441
- # if f16 desired, convert any float32 2-dim weight tensors to float16
2442
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2443
- data = data.astype(np.float16)
2444
-
2445
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
2446
- self.gguf_writer.add_tensor(new_name, data)
2447
- =======
2448
1676
  self.gguf_writer.add_file_type(self.ftype)
2449
- >>>>>>> uupstream/master
2450
1677
 
2451
1678
 
2452
1679
  @Model.register("Qwen2ForCausalLM")
@@ -2469,193 +1696,6 @@ class Qwen2MoeModel(Model):
2469
1696
  if (n_experts := self.hparams.get("num_experts")) is not None:
2470
1697
  self.gguf_writer.add_expert_count(n_experts)
2471
1698
 
2472
- <<<<<<< HEAD
2473
- def write_tensors(self):
2474
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
2475
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2476
- n_experts = self.hparams.get("num_experts")
2477
- experts = dict()
2478
- for name, data_torch in self.get_tensors():
2479
- # we don't need these
2480
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
2481
- continue
2482
-
2483
- old_dtype = data_torch.dtype
2484
-
2485
- # convert any unsupported data types to float32
2486
- if data_torch.dtype not in (torch.float16, torch.float32):
2487
- data_torch = data_torch.to(torch.float32)
2488
-
2489
- data = data_torch.squeeze().numpy()
2490
-
2491
- # process the experts separately
2492
- if name.find("experts") != -1:
2493
- experts[name] = data
2494
- if len(experts) >= n_experts * 3:
2495
- # merge the experts into a single 3d tensor
2496
- for bid in range(block_count):
2497
- for w_name in ["down_proj", "gate_proj", "up_proj"]:
2498
- full = True
2499
- for xid in range(n_experts):
2500
- ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2501
- if ename not in experts:
2502
- full = False
2503
- break
2504
- if not full:
2505
- continue
2506
-
2507
- datas = []
2508
- for xid in range(n_experts):
2509
- ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2510
- datas.append(experts[ename])
2511
- del experts[ename]
2512
-
2513
- data = np.stack(datas, axis=0)
2514
- data_dtype = data.dtype
2515
-
2516
- if self.ftype == 0 and data_dtype == np.float16:
2517
- data = data.astype(np.float32)
2518
-
2519
- if self.ftype == 1 and data_dtype == np.float32:
2520
- data = data.astype(np.float16)
2521
-
2522
- merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
2523
-
2524
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
2525
- if new_name is None:
2526
- raise ValueError(f"Can not map tensor {name!r}")
2527
-
2528
- logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
2529
-
2530
- self.gguf_writer.add_tensor(new_name, data)
2531
- continue
2532
-
2533
- # map tensor names
2534
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2535
- if new_name is None:
2536
- raise ValueError(f"Can not map tensor {name!r}")
2537
-
2538
- n_dims = len(data.shape)
2539
- data_dtype = data.dtype
2540
-
2541
- # if f32 desired, convert any float16 to float32
2542
- if self.ftype == 0 and data_dtype == np.float16:
2543
- data = data.astype(np.float32)
2544
-
2545
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
2546
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
2547
- data = data.astype(np.float32)
2548
-
2549
- # if f16 desired, convert any float32 2-dim weight tensors to float16
2550
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2551
- data = data.astype(np.float16)
2552
-
2553
- logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
2554
-
2555
- self.gguf_writer.add_tensor(new_name, data)
2556
-
2557
- if len(experts) > 0:
2558
- raise ValueError(f"Unprocessed experts: {experts.keys()}")
2559
-
2560
-
2561
- @Model.register("Qwen2MoeForCausalLM")
2562
- class Qwen2MoeModel(Model):
2563
- model_arch = gguf.MODEL_ARCH.QWEN2MOE
2564
-
2565
- def set_gguf_parameters(self):
2566
- super().set_gguf_parameters()
2567
- if (n_experts := self.hparams.get("num_experts")) is not None:
2568
- self.gguf_writer.add_expert_count(n_experts)
2569
-
2570
- def write_tensors(self):
2571
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
2572
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
2573
- n_experts = self.hparams.get("num_experts")
2574
- experts = dict()
2575
- for name, data_torch in self.get_tensors():
2576
- # we don't need these
2577
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
2578
- continue
2579
-
2580
- old_dtype = data_torch.dtype
2581
-
2582
- # convert any unsupported data types to float32
2583
- if data_torch.dtype not in (torch.float16, torch.float32):
2584
- data_torch = data_torch.to(torch.float32)
2585
-
2586
- data = data_torch.squeeze().numpy()
2587
-
2588
- # process the experts separately
2589
- if name.find("experts") != -1:
2590
- experts[name] = data
2591
- if len(experts) >= n_experts * 3:
2592
- # merge the experts into a single 3d tensor
2593
- for bid in range(block_count):
2594
- for w_name in ["down_proj", "gate_proj", "up_proj"]:
2595
- full = True
2596
- for xid in range(n_experts):
2597
- ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2598
- if ename not in experts:
2599
- full = False
2600
- break
2601
- if not full:
2602
- continue
2603
-
2604
- datas = []
2605
- for xid in range(n_experts):
2606
- ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2607
- datas.append(experts[ename])
2608
- del experts[ename]
2609
-
2610
- data = np.stack(datas, axis=0)
2611
- data_dtype = data.dtype
2612
-
2613
- if self.ftype == 0 and data_dtype == np.float16:
2614
- data = data.astype(np.float32)
2615
-
2616
- if self.ftype == 1 and data_dtype == np.float32:
2617
- data = data.astype(np.float16)
2618
-
2619
- merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
2620
-
2621
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
2622
- if new_name is None:
2623
- print(f"Can not map tensor {name!r}")
2624
- sys.exit()
2625
-
2626
- print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
2627
-
2628
- self.gguf_writer.add_tensor(new_name, data)
2629
- continue
2630
-
2631
- # map tensor names
2632
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
2633
- if new_name is None:
2634
- print(f"Can not map tensor {name!r}")
2635
- sys.exit()
2636
-
2637
- n_dims = len(data.shape)
2638
- data_dtype = data.dtype
2639
-
2640
- # if f32 desired, convert any float16 to float32
2641
- if self.ftype == 0 and data_dtype == np.float16:
2642
- data = data.astype(np.float32)
2643
-
2644
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
2645
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
2646
- data = data.astype(np.float32)
2647
-
2648
- # if f16 desired, convert any float32 2-dim weight tensors to float16
2649
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
2650
- data = data.astype(np.float16)
2651
-
2652
- print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
2653
-
2654
- self.gguf_writer.add_tensor(new_name, data)
2655
-
2656
- if len(experts) > 0:
2657
- raise ValueError(f"Unprocessed experts: {experts.keys()}")
2658
- =======
2659
1699
  _experts: list[dict[str, Tensor]] | None = None
2660
1700
 
2661
1701
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -2702,7 +1742,6 @@ class Qwen2MoeModel(Model):
  experts = [k for d in self._experts for k in d.keys()]
  if len(experts) > 0:
  raise ValueError(f"Unprocessed experts: {experts}")
- >>>>>>> uupstream/master


  @Model.register("GPT2LMHeadModel")
@@ -2735,42 +1774,11 @@ class GPT2Model(Model):

  tensors.append((new_name, data_torch))

- <<<<<<< HEAD
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor {name!r}")
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
- # note: GPT2 output is tied to (same as) wte in original model
- if new_name == "token_embd.weight":
- logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- self.gguf_writer.add_tensor("output.weight", data)
- =======
  # note: GPT2 output is tied to (same as) wte in original model
  if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
  tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))

  return tensors
- >>>>>>> uupstream/master


  @Model.register("PhiForCausalLM")
@@ -2810,12 +1818,8 @@ class Phi3MiniModel(Model):
  if not tokenizer_path.is_file():
  raise ValueError(f'Error: Missing {tokenizer_path}')

- <<<<<<< HEAD
- tokenizer = SentencePieceProcessor(str(tokenizer_path))
- =======
  tokenizer = SentencePieceProcessor()
  tokenizer.LoadFromFile(str(tokenizer_path))
- >>>>>>> uupstream/master

  vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

@@ -2825,20 +1829,6 @@ class Phi3MiniModel(Model):

  for token_id in range(tokenizer.vocab_size()):

- <<<<<<< HEAD
- piece = tokenizer.id_to_piece(token_id)
- text = piece.encode("utf-8")
- score = tokenizer.get_score(token_id)
-
- toktype = SentencePieceTokenTypes.NORMAL
- if tokenizer.is_unknown(token_id):
- toktype = SentencePieceTokenTypes.UNKNOWN
- elif tokenizer.is_control(token_id):
- toktype = SentencePieceTokenTypes.CONTROL
- elif tokenizer.is_unused(token_id):
- toktype = SentencePieceTokenTypes.UNUSED
- elif tokenizer.is_byte(token_id):
- =======
  piece = tokenizer.IdToPiece(token_id)
  text = piece.encode("utf-8")
  score = tokenizer.GetScore(token_id)
@@ -2851,7 +1841,6 @@ class Phi3MiniModel(Model):
  elif tokenizer.IsUnused(token_id):
  toktype = SentencePieceTokenTypes.UNUSED
  elif tokenizer.IsByte(token_id):
- >>>>>>> uupstream/master
  toktype = SentencePieceTokenTypes.BYTE

  tokens[token_id] = text
@@ -2873,8 +1862,6 @@ class Phi3MiniModel(Model):
  scores[token_id] = -1000.0
  toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

- <<<<<<< HEAD
- =======
  tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
  if tokenizer_config_file.is_file():
  with open(tokenizer_config_file, "r", encoding="utf-8") as f:
@@ -2907,7 +1894,6 @@ class Phi3MiniModel(Model):
  if foken_data.get("special"):
  toktypes[token_id] = SentencePieceTokenTypes.CONTROL

- >>>>>>> uupstream/master
  self.gguf_writer.add_tokenizer_model("llama")
  self.gguf_writer.add_tokenizer_pre("default")
  self.gguf_writer.add_token_list(tokens)
@@ -2920,25 +1906,6 @@ class Phi3MiniModel(Model):
  def set_gguf_parameters(self):
  block_count = self.find_hparam(["num_hidden_layers", "n_layer"])

- <<<<<<< HEAD
- rot_pct = 1.0
- n_embd = self.find_hparam(["hidden_size", "n_embd"])
- n_head = self.find_hparam(["num_attention_heads", "n_head"])
- rms_eps = self.find_hparam(["rms_norm_eps"])
-
- self.gguf_writer.add_name("Phi3")
- self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
-
- self.gguf_writer.add_embedding_length(n_embd)
- self.gguf_writer.add_feed_forward_length(8192)
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(n_head)
- self.gguf_writer.add_head_count_kv(n_head)
- self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
- self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
- self.gguf_writer.add_file_type(self.ftype)
-
- =======
  n_embd = self.find_hparam(["hidden_size", "n_embd"])
  n_head = self.find_hparam(["num_attention_heads", "n_head"])
  n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
@@ -2992,7 +1959,6 @@ class Phi3MiniModel(Model):
  self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
  self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))

- >>>>>>> uupstream/master

  @Model.register("PlamoForCausalLM")
  class PlamoModel(Model):
@@ -3034,45 +2000,6 @@ class PlamoModel(Model):

  new_name = self.map_tensor_name(name)

- <<<<<<< HEAD
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor {name!r}")
-
- # shuffle for broadcasting of gqa in ggml_mul_mat
- if new_name.endswith("attn_q.weight"):
- data_torch = self.shuffle_attn_q_weight(data_torch)
- elif new_name.endswith("attn_output.weight"):
- data_torch = self.shuffle_attn_output_weight(data_torch)
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
- =======
  # shuffle for broadcasting of gqa in ggml_mul_mat
  if new_name.endswith("attn_q.weight"):
  data_torch = self.shuffle_attn_q_weight(data_torch)
@@ -3080,7 +2007,6 @@ class PlamoModel(Model):
  data_torch = self.shuffle_attn_output_weight(data_torch)

  return [(new_name, data_torch)]
- >>>>>>> uupstream/master


  @Model.register("CodeShellForCausalLM")
@@ -3113,41 +2039,11 @@ class CodeShellModel(Model):
  if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
  assert self.tensor_names is not None

- <<<<<<< HEAD
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor {name!r}")
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
- if not has_lm_head and name == "transformer.wte.weight":
- self.gguf_writer.add_tensor("output.weight", data)
- logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
- =======
  if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
  # copy tok_embd.weight to output.weight
  tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))

  return tensors
- >>>>>>> uupstream/master


  @Model.register("InternLM2ForCausalLM")
@@ -3188,13 +2084,8 @@ class InternLM2Model(Model):
  if text == b"\x00":
  # (TODO): fixme
  # Hack here and replace the \x00 characters.
- <<<<<<< HEAD
- logger.debug(f"InternLM2 convert token '{text}' to '🐉'!")
- text = "🐉"
- =======
  logger.warning(f"InternLM2 convert token '{text}' to '🐉'!")
  text = "🐉".encode("utf-8")
- >>>>>>> uupstream/master

  toktype = SentencePieceTokenTypes.NORMAL
  if tokenizer.IsUnknown(token_id):
@@ -3270,51 +2161,10 @@ in chat mode so that the conversation can end normally.")
  self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
  self.gguf_writer.add_file_type(self.ftype)

- <<<<<<< HEAD
- def post_write_tensors(self, tensor_map, name, data_torch):
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor {name!r}")
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- self.gguf_writer.add_tensor(new_name, data)
-
- def write_tensors(self):
- from einops import rearrange
-
- num_heads = self.hparams.get("num_attention_heads")
- num_kv_heads = self.hparams.get("num_key_value_heads")
- hidden_size = self.hparams.get("hidden_size")
- =======
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  num_heads = self.hparams["num_attention_heads"]
  num_kv_heads = self.hparams["num_key_value_heads"]
  hidden_size = self.hparams["hidden_size"]
- >>>>>>> uupstream/master
  q_per_kv = num_heads // num_kv_heads
  head_dim = hidden_size // num_heads
  num_groups = num_heads // q_per_kv
@@ -3408,43 +2258,11 @@ class BertModel(Model):
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  del bid # unused

- <<<<<<< HEAD
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor {name!r}")
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
- n_dims = len(data.shape)
- new_dtype: type[np.floating[Any]]
-
- if (
- self.ftype == 1 and name.endswith(".weight") and n_dims == 2
- and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32
- ):
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- new_dtype = np.float16
- else:
- # if f32 desired, convert any float16 to float32
- new_dtype = np.float32
-
- logger.info(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
-
- if data.dtype != new_dtype:
- data = data.astype(new_dtype)
-
- self.gguf_writer.add_tensor(new_name, data)
- =======
  # we are only using BERT for embeddings so we don't need the pooling layer
  if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
  return [] # we don't need these

  return [(self.map_tensor_name(name), data_torch)]
- >>>>>>> uupstream/master


  @Model.register("NomicBertModel")
@@ -3513,53 +2331,17 @@ class GemmaModel(Model):
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  del bid # unused

- <<<<<<< HEAD
- for name, data_torch in self.get_tensors():
- # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
- # To prevent errors, skip loading lm_head.weight.
- if name == "lm_head.weight":
- logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
- continue
-
- old_dtype = data_torch.dtype
- =======
  # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
  # To prevent errors, skip loading lm_head.weight.
  if name == "lm_head.weight":
  logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
  return []
- >>>>>>> uupstream/master

  # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
  if name.endswith("norm.weight"):
  data_torch = data_torch + 1

- <<<<<<< HEAD
- # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
- if name.endswith("norm.weight"):
- data_torch = data_torch + 1
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor {name!r}")
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
- =======
  return [(self.map_tensor_name(name), data_torch)]
- >>>>>>> uupstream/master


  @Model.register("Starcoder2ForCausalLM")
@@ -3591,17 +2373,10 @@ class MambaModel(Model):
  neox_reader = gguf.GGUFReader(tokenizer_path, "r")

  field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
- <<<<<<< HEAD
- self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
-
- field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
- self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
- =======
  self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")

  field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
  self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
- >>>>>>> uupstream/master

  field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
  assert field
@@ -3616,17 +2391,10 @@ class MambaModel(Model):
  self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])

  field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
- <<<<<<< HEAD
- self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
-
- field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
- self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
- =======
  self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)

  field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
  self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
- >>>>>>> uupstream/master

  field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
  self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
@@ -3671,24 +2439,6 @@ class MambaModel(Model):

  new_name = self.map_tensor_name(name)

- <<<<<<< HEAD
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor {name!r}")
-
- if name.endswith(".A_log"):
- logger.debug("A_log --> A ==> " + new_name)
- data_torch = -torch.exp(data_torch)
-
- # assuming token_embd.weight is seen before output.weight
- if tok_embd is not None and new_name == output_name:
- if torch.equal(tok_embd, data_torch):
- logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
- continue
- if new_name == tok_embd_name:
- tok_embd = data_torch
- =======
  if name.endswith(".A_log"):
  logger.debug("A_log --> A ==> " + new_name)
  data_torch = -torch.exp(data_torch)
@@ -3702,32 +2452,10 @@ class MambaModel(Model):
  self._tok_embd = data_torch

  return [(new_name, data_torch)]
- >>>>>>> uupstream/master

  def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
  del n_dims # unused

- <<<<<<< HEAD
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert big float32 2-dim weight tensors to float16
- new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
- if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
- data = data.astype(np.float16)
-
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
- =======
  return bid is not None and new_name in (
  self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
  gguf.MODEL_TENSOR.SSM_CONV1D,
@@ -3737,7 +2465,6 @@ class MambaModel(Model):
  gguf.MODEL_TENSOR.SSM_D,
  ]
  )
- >>>>>>> uupstream/master


  @Model.register("CohereForCausalLM")
@@ -3772,52 +2499,6 @@ class OlmoModel(Model):

  # Same as super class, but permuting q_proj, k_proj
  # Copied from: LlamaModel
- <<<<<<< HEAD
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- n_head = self.hparams.get("num_attention_heads")
- n_kv_head = self.hparams.get("num_key_value_heads")
- for name, data_torch in self.get_tensors():
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.numpy()
-
- if name.endswith("q_proj.weight"):
- data = permute(data, n_head, n_head)
- if name.endswith("k_proj.weight"):
- data = permute(data, n_head, n_kv_head)
-
- data = data.squeeze()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor {name!r}")
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # 1d tensors need to be converted to float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
- data = data.astype(np.float16)
-
- logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
- =======
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
  del bid # unused

@@ -4018,7 +2699,6 @@ class ArcticModel(Model):
  experts = [k for d in self._experts for k in d.keys()]
  if len(experts) > 0:
  raise ValueError(f"Unprocessed experts: {experts}")
- >>>>>>> uupstream/master


  ###### CONVERSION LOGIC ######
@@ -4090,11 +2770,6 @@ def parse_args() -> argparse.Namespace:
  "model", type=Path,
  help="directory containing model file",
  )
- <<<<<<< HEAD
- parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
- parser.add_argument("--model-name", type=str, default=None, help="name of the model")
- parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
- =======
  parser.add_argument(
  "--use-temp-file", action="store_true",
  help="use the tempfile library while processing (helpful when running out of memory, process killed)",
@@ -4111,7 +2786,6 @@ def parse_args() -> argparse.Namespace:
  "--verbose", action="store_true",
  help="increase output verbosity",
  )
- >>>>>>> uupstream/master

  return parser.parse_args()

@@ -4160,11 +2834,7 @@ def main() -> None:

  with torch.inference_mode():
  model_class = Model.from_model_architecture(hparams["architectures"][0])
- <<<<<<< HEAD
- model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
- =======
  model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
- >>>>>>> uupstream/master

  logger.info("Set model parameters")
  model_instance.set_gguf_parameters()
@@ -4175,15 +2845,6 @@ def main() -> None:
  model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

  if args.vocab_only:
- <<<<<<< HEAD
- logger.info(f"Exporting model vocab to '{fname_out}'")
- model_instance.write_vocab()
- else:
- logger.info(f"Exporting model to '{fname_out}'")
- model_instance.write()
-
- logger.info(f"Model successfully exported to '{fname_out}'")
- =======
  logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
  model_instance.write_vocab()
  else:
@@ -4191,7 +2852,6 @@ def main() -> None:
  model_instance.write()

  logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
- >>>>>>> uupstream/master


  if __name__ == '__main__':