bigdl-core-cpp 2.5.0b20240620__py3-none-win_amd64.whl → 2.5.0b20240622__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert-hf-to-gguf.py +0 -1340
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/ggml_shared.dll +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- {bigdl_core_cpp-2.5.0b20240620.dist-info → bigdl_core_cpp-2.5.0b20240622.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.5.0b20240622.dist-info/RECORD +61 -0
- bigdl_core_cpp-2.5.0b20240620.dist-info/RECORD +0 -61
- {bigdl_core_cpp-2.5.0b20240620.data → bigdl_core_cpp-2.5.0b20240622.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240620.data → bigdl_core_cpp-2.5.0b20240622.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0b20240620.data → bigdl_core_cpp-2.5.0b20240622.data}/scripts/init-ollama.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240620.dist-info → bigdl_core_cpp-2.5.0b20240622.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0b20240620.dist-info → bigdl_core_cpp-2.5.0b20240622.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert-hf-to-gguf.py
CHANGED
@@ -49,9 +49,6 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
 class Model:
     _model_classes: dict[str, type[Model]] = {}
 
-<<<<<<< HEAD
-    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
-=======
     dir_model: Path
     ftype: int
     is_big_endian: bool
@@ -73,26 +70,17 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
->>>>>>> uupstream/master
         self.dir_model = dir_model
         self.ftype = ftype
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
-<<<<<<< HEAD
-        self.is_safetensors = self._is_model_safetensors()
-        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
-        self.part_names = self._get_part_names()
-        self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
-=======
         self.lazy = not eager
         self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
->>>>>>> uupstream/master
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
@@ -238,8 +226,6 @@ class Model:
 
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
-<<<<<<< HEAD
-=======
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
@@ -255,7 +241,6 @@ class Model:
         del name, new_name, bid, n_dims  # unused
 
         return False
->>>>>>> uupstream/master
 
     def write_tensors(self):
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
@@ -278,18 +263,11 @@ class Model:
                     bid = int(part)
                     break
 
-<<<<<<< HEAD
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-=======
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
                 data: np.ndarray = data  # type hint
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
                 data_qtype: gguf.GGMLQuantizationType | None = None
->>>>>>> uupstream/master
 
                 # when both are True, f32 should win
                 extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
@@ -303,18 +281,12 @@ class Model:
                     new_name.endswith("_norm.weight"),
                 ))
 
-<<<<<<< HEAD
-                # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-                if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
-                    data = data.astype(np.float32)
-=======
                 # Some tensor types are always in float32
                 extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
                     gguf.MODEL_TENSOR.FFN_GATE_INP,
                     gguf.MODEL_TENSOR.POS_EMBD,
                     gguf.MODEL_TENSOR.TOKEN_TYPES,
                 ))
->>>>>>> uupstream/master
 
                 # if f16 desired, convert any float32 2-dim weight tensors to float16
                 extra_f16 = any(cond for cond in (
@@ -322,15 +294,11 @@ class Model:
                     (name.endswith(".weight") and n_dims >= 2),
                 ))
 
-<<<<<<< HEAD
-                logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-=======
                 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                     if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data = gguf.quantize_bf16(data)
                         assert data.dtype == np.int16
                         data_qtype = gguf.GGMLQuantizationType.BF16
->>>>>>> uupstream/master
 
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
                         data = gguf.quantize_q8_0(data)
@@ -402,22 +370,6 @@ class Model:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
-<<<<<<< HEAD
-    def _is_model_safetensors(self) -> bool:
-        return Model.count_model_parts(self.dir_model, ".safetensors") > 0
-
-    def _get_part_names(self):
-        if self.is_safetensors:
-            if self.num_parts == 1:  # there's only one .safetensors file
-                return ("model.safetensors",)
-            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
-
-        if self.num_parts == 1:  # there's only one .bin file
-            return ("pytorch_model.bin",)
-        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
-
-=======
->>>>>>> uupstream/master
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -452,10 +404,7 @@ class Model:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
    # ref: https://github.com/ggerganov/llama.cpp/pull/6920
-<<<<<<< HEAD
-=======
     # Marker: Start get_vocab_base_pre
->>>>>>> uupstream/master
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -499,23 +448,15 @@ class Model:
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
-<<<<<<< HEAD
-=======
         if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
             # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
             res = "stablelm2"
->>>>>>> uupstream/master
         if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
             # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
             res = "refact"
         if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
-<<<<<<< HEAD
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
-            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
-            res = "olmo"
-=======
         if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
             # ref: https://huggingface.co/Qwen/Qwen1.5-7B
             res = "qwen2"
@@ -537,7 +478,6 @@ class Model:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
->>>>>>> uupstream/master
 
         if res is None:
             logger.warning("\n")
@@ -558,10 +498,7 @@ class Model:
             logger.debug(f"chkhsh: {chkhsh}")
 
         return res
-<<<<<<< HEAD
-=======
     # Marker: End get_vocab_base_pre
->>>>>>> uupstream/master
 
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
@@ -677,28 +614,17 @@ class Model:
                     logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue
 
-<<<<<<< HEAD
-=======
                 tokens[token_id] = key.encode("utf-8")
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
->>>>>>> uupstream/master
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
             for i in range(1, pad_count + 1):
-<<<<<<< HEAD
-                tokens.append(f"[PAD{i}]")
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
-        assert len(tokens) == vocab_size
-=======
                 tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)
->>>>>>> uupstream/master
 
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
@@ -848,67 +774,11 @@ class BloomModel(Model):
         if name == "word_embeddings.weight":
             assert self.tensor_names is not None
 
-<<<<<<< HEAD
-            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
-                # Map bloom-style qkv_linear to gpt-style qkv_linear
-                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
-                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
-                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
-                data = np.concatenate(
-                    (
-                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
-                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
-                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
-                    ),
-                    axis=0,
-                )
-                logger.info("re-format attention.linear_qkv.weight")
-            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
-                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
-                data = np.concatenate(
-                    (
-                        qkv_bias[:, 0, :].reshape((n_embed,)),
-                        qkv_bias[:, 1, :].reshape((n_embed,)),
-                        qkv_bias[:, 2, :].reshape((n_embed,)),
-                    ),
-                    axis=0,
-                )
-                logger.info("re-format attention.linear_qkv.bias")
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-            if not has_lm_head and name == "word_embeddings.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-                logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-=======
             # TODO: tie them at runtime, don't duplicate in the model file
             if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
                 tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
 
         return tensors
->>>>>>> uupstream/master
 
 
 @Model.register("MPTForCausalLM")
@@ -953,44 +823,7 @@ class MPTModel(Model):
         else:
             new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias"))
 
-<<<<<<< HEAD
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            if "scales" in name:
-                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                if new_name is not None:
-                    new_name = new_name.replace("scales", "act.scales")
-            else:
-                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         return [(new_name, data_torch)]
->>>>>>> uupstream/master
 
 
 @Model.register("OrionForCausalLM")
@@ -1030,51 +863,6 @@ class OrionModel(Model):
         # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
         self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
 
-<<<<<<< HEAD
-    def write_tensors(self):
-        # Collect tensors from generator object
-        model_kv = dict(self.get_tensors())
-        block_count = self.hparams["num_hidden_layers"]
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in model_kv.items():
-            # we don't need these
-            if name.endswith(".rotary_emb.inv_freq"):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-
-=======
->>>>>>> uupstream/master
 
 @Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
 class BaichuanModel(Model):
@@ -1121,20 +909,7 @@ class BaichuanModel(Model):
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 
-<<<<<<< HEAD
-        for i in range(block_count):
-            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
-                logger.info(f"Unpacking and permuting layer {i}")
-                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
-                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
-                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
-                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
-                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
-                    self._reverse_hf_part(w, 2)
-                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
-=======
         tensors: list[tuple[str, Tensor]] = []
->>>>>>> uupstream/master
 
         if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight":
             logger.info(f"Unpacking and permuting layer {bid}")
@@ -1149,40 +924,7 @@ class BaichuanModel(Model):
         else:
             tensors = [(self.map_tensor_name(name), data_torch)]
 
-<<<<<<< HEAD
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         return tensors
->>>>>>> uupstream/master
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -1298,46 +1040,7 @@ class XverseModel(Model):
         if name.endswith("k_proj.weight"):
             data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
 
-<<<<<<< HEAD
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            # HF models permute some of the tensors, so we need to undo that
-            if name.endswith(("q_proj.weight")):
-                data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
-            if name.endswith(("k_proj.weight")):
-                data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         return [(self.map_tensor_name(name), data_torch)]
->>>>>>> uupstream/master
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -1402,59 +1105,7 @@ class FalconModel(Model):
             v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
             data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
 
-<<<<<<< HEAD
-        for name, data_torch in self.get_tensors():
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            # QKV tensor transform
-            # The original query_key_value tensor contains n_head_kv "kv groups",
-            # each consisting of n_head/n_head_kv query weights followed by one key
-            # and one value weight (shared by all query heads in the kv group).
-            # This layout makes it a big pain to work with in GGML.
-            # So we rearrange them here,, so that we have n_head query weights
-            # followed by n_head_kv key weights followed by n_head_kv value weights,
-            # in contiguous fashion.
-            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
-
-            if "query_key_value" in name:
-                qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
-                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
-                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
-                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
-                data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         return [(self.map_tensor_name(name), data_torch)]
->>>>>>> uupstream/master
 
 
 @Model.register("GPTBigCodeForCausalLM")
@@ -1537,89 +1188,7 @@ class RefactModel(Model):
         if len(tensors) == 0:
             tensors.append((self.map_tensor_name(name), data_torch))
 
-<<<<<<< HEAD
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("PersimmonForCausalLM")
-class PersimmonModel(Model):
-    model_arch = gguf.MODEL_ARCH.PERSIMMON
-
-    def set_gguf_parameters(self):
-        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = head_count
-        hidden_size = self.hparams["hidden_size"]
-
-        self.gguf_writer.add_name('persimmon-8b-chat')
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-
-        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
-        # than the head size?
-        # ref: https://github.com/ggerganov/llama.cpp/pull/4889
-        # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
-        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
-
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-        # self.gguf_writer.add_bos_token_id(71013)
-        # self.gguf_writer.add_eos_token_id(71013)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in self.get_tensors():
-            if name.endswith(".self_attention.rotary_emb.inv_freq"):
-                continue
-            old_dtype = data_torch.dtype
-            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
-            data = data_torch.to(torch.float32).squeeze().numpy()
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-            n_dims = len(data.shape)
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         return tensors
->>>>>>> uupstream/master
 
 
 @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
@@ -1829,20 +1398,6 @@ class LlamaModel(Model):
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
             self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
-<<<<<<< HEAD
-    # Same as super class, but permuting q_proj, k_proj
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_head = self.hparams.get("num_attention_heads")
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        n_experts = self.hparams.get("num_local_experts")
-        experts = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
-                continue
-=======
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1856,7 +1411,6 @@ class LlamaModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
->>>>>>> uupstream/master
 
         if name.endswith("q_proj.weight"):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
@@ -1892,58 +1446,21 @@ class LlamaModel(Model):
 
                 new_name = self.map_tensor_name(merged_name)
 
-<<<<<<< HEAD
-                new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-                if new_name is None:
-                    raise ValueError(f"Can not map tensor {name!r}")
-
-                logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-=======
                 tensors.append((new_name, data_torch))
                 return tensors
             else:
                 return []
 
         return [(self.map_tensor_name(name), data_torch)]
->>>>>>> uupstream/master
 
     def write_tensors(self):
         super().write_tensors()
 
-<<<<<<< HEAD
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-        if new_name is None:
-            raise ValueError(f"Can not map tensor {name!r}")
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if self.ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # 1d tensors need to be converted to float32
-        if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-        self.gguf_writer.add_tensor(new_name, data)
-
-        if len(experts) > 0:
-            raise ValueError(f"Unprocessed experts: {experts.keys()}")
-=======
         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
             experts = [k for d in self._experts for k in d.keys()]
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
->>>>>>> uupstream/master
 
 
 @Model.register("GrokForCausalLM")
@@ -1990,30 +1507,15 @@ class GrokModel(Model):
 
                 merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
 
-<<<<<<< HEAD
-                new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-                if new_name is None:
-                    raise ValueError(f"Can not map tensor {name!r}")
-
-                logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-=======
                 new_name = self.map_tensor_name(merged_name)
 
                 tensors.append((new_name, data_torch))
                 return tensors
             else:
                 return []
->>>>>>> uupstream/master
 
         return [(self.map_tensor_name(name), data_torch)]
 
-<<<<<<< HEAD
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-        if new_name is None:
-            raise ValueError(f"Can not map tensor {name!r}")
-=======
->>>>>>> uupstream/master
 
 @Model.register("DbrxForCausalLM")
 class DbrxModel(Model):
@@ -2032,201 +1534,7 @@ class DbrxModel(Model):
         self.gguf_writer.add_head_count(self.hparams["n_heads"])
         self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
 
-<<<<<<< HEAD
-        logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-        self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("DbrxForCausalLM")
-class DbrxModel(Model):
-    model_arch = gguf.MODEL_ARCH.DBRX
-
-    def set_gguf_parameters(self):
-        ffn_config = self.hparams["ffn_config"]
-        attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_name(self.hparams["model_type"])
-        self.gguf_writer.add_block_count(self.hparams["n_layers"])
-
-        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
-        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
-        self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
-
-        self.gguf_writer.add_head_count(self.hparams["n_heads"])
-        self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
-
-        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
-
-        self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
-        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
-
-        self.gguf_writer.add_layer_norm_eps(1e-5)
-
-        self.gguf_writer.add_file_type(self.ftype)
-        logger.info(f"gguf: file type = {self.ftype}")
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers")
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        for name, data_torch in self.get_tensors():
-            n_expert = self.hparams["ffn_config"]["moe_num_experts"]
-            n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
-            n_embd = self.hparams["d_model"]
-
-            # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
-            # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
-            # But llama.cpp moe graph works differently
-            # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
-            # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
-            exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
-                                "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
-                                "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff, n_expert}
-            experts = False
-            for exp_tensor_name in exp_tensor_names.keys():
-                if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
-                    experts = True
-                    data_torch = data_torch.view(n_expert, n_ff, n_embd)
-                    if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
-                        data_torch = data_torch.permute(*permute_tensor)
-                    break
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            # In MoE models the ffn tensors are typically most of the model weights,
-            # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
-            # Every other model has the weight names ending in .weight,
-            # let's assume that is the convention which is not the case for dbrx:
-            # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
-            new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # Most of the codebase that takes in 1D tensors only handles F32 tensors
-            # and most of the outputs tensors are F32.
-            if data_dtype != np.float32 and n_dims == 1:
-                raise ValueError(f"Can not map tensor {name!r}: all 1D tensors must be F32")
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
-                data = data.astype(np.float16)
-
-            logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("DbrxForCausalLM")
-class DbrxModel(Model):
-    model_arch = gguf.MODEL_ARCH.DBRX
-
-    def set_gguf_parameters(self):
-        ffn_config = self.hparams["ffn_config"]
-        attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_name(self.hparams["model_type"])
-        self.gguf_writer.add_block_count(self.hparams["n_layers"])
-
-        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
-        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
-        self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
-
-        self.gguf_writer.add_head_count(self.hparams["n_heads"])
-        self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
-
-        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
-
-        self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
-        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
-
-        self.gguf_writer.add_layer_norm_eps(1e-5)
-
-        self.gguf_writer.add_file_type(self.ftype)
-        print(f"gguf: file type = {self.ftype}")
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers")
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        for name, data_torch in self.get_tensors():
-            n_expert = self.hparams["ffn_config"]["moe_num_experts"]
-            n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
-            n_embd = self.hparams["d_model"]
-
-            # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
-            # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
-            # But llama.cpp moe graph works differently
-            # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
-            # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
-            exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
-                                "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
-                                "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff, n_expert}
-            experts = False
-            for exp_tensor_name in exp_tensor_names.keys():
-                if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
-                    experts = True
-                    data_torch = data_torch.view(n_expert, n_ff, n_embd)
-                    if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
-                        data_torch = data_torch.permute(*permute_tensor)
-                    break
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            # In MoE models the ffn tensors are typically most of the model weights,
-            # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
-            # Every other model has the weight names ending in .weight,
-            # let's assume that is the convention which is not the case for dbrx:
-            # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
-            new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # Most of the codebase that takes in 1D tensors only handles F32 tensors
-            # and most of the outputs tensors are F32.
-            if data_dtype != np.float32 and n_dims == 1:
-                print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
-                sys.exit()
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-=======
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
->>>>>>> uupstream/master
 
         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
         self.gguf_writer.add_file_type(self.ftype)
@@ -2322,45 +1630,7 @@ class MiniCPMModel(Model):
         if name.endswith(("k_proj.weight")):
             data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
 
-<<<<<<< HEAD
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            # HF models permute some of the tensors, so we need to undo that
-            if name.endswith(("q_proj.weight")):
-                data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
-            if name.endswith(("k_proj.weight")):
-                data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         return [(self.map_tensor_name(name), data_torch)]
->>>>>>> uupstream/master
 
 
 @Model.register("QWenLMHeadModel")
@@ -2403,50 +1673,7 @@ class QwenModel(Model):
         self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
-<<<<<<< HEAD
-
-    def write_tensors(self):
-        block_count = self.hparams["num_hidden_layers"]
-        model_kv = dict(self.get_tensors())
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        for name, data_torch in model_kv.items():
-            # we don't need these
-            if name.endswith(".rotary_emb.inv_freq"):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         self.gguf_writer.add_file_type(self.ftype)
->>>>>>> uupstream/master
 
 
 @Model.register("Qwen2ForCausalLM")
@@ -2469,193 +1696,6 @@ class Qwen2MoeModel(Model):
         if (n_experts := self.hparams.get("num_experts")) is not None:
             self.gguf_writer.add_expert_count(n_experts)
 
-<<<<<<< HEAD
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_experts = self.hparams.get("num_experts")
-        experts = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # process the experts separately
-            if name.find("experts") != -1:
-                experts[name] = data
-                if len(experts) >= n_experts * 3:
-                    # merge the experts into a single 3d tensor
-                    for bid in range(block_count):
-                        for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                            full = True
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                                if ename not in experts:
-                                    full = False
-                                    break
-                            if not full:
-                                continue
-
-                            datas = []
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                                datas.append(experts[ename])
-                                del experts[ename]
-
-                            data = np.stack(datas, axis=0)
-                            data_dtype = data.dtype
-
-                            if self.ftype == 0 and data_dtype == np.float16:
-                                data = data.astype(np.float32)
-
-                            if self.ftype == 1 and data_dtype == np.float32:
-                                data = data.astype(np.float16)
-
-                            merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-                            if new_name is None:
-                                raise ValueError(f"Can not map tensor {name!r}")
-
-                            logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
-                            self.gguf_writer.add_tensor(new_name, data)
-                continue
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-        if len(experts) > 0:
-            raise ValueError(f"Unprocessed experts: {experts.keys()}")
-
-
-@Model.register("Qwen2MoeForCausalLM")
-class Qwen2MoeModel(Model):
-    model_arch = gguf.MODEL_ARCH.QWEN2MOE
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        if (n_experts := self.hparams.get("num_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_experts = self.hparams.get("num_experts")
-        experts = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # process the experts separately
-            if name.find("experts") != -1:
-                experts[name] = data
-                if len(experts) >= n_experts * 3:
-                    # merge the experts into a single 3d tensor
-                    for bid in range(block_count):
-                        for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                            full = True
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                                if ename not in experts:
-                                    full = False
-                                    break
-                            if not full:
-                                continue
-
-                            datas = []
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                                datas.append(experts[ename])
-                                del experts[ename]
-
-                            data = np.stack(datas, axis=0)
-                            data_dtype = data.dtype
-
-                            if self.ftype == 0 and data_dtype == np.float16:
-                                data = data.astype(np.float32)
-
-                            if self.ftype == 1 and data_dtype == np.float32:
-                                data = data.astype(np.float16)
-
-                            merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-                            if new_name is None:
-                                print(f"Can not map tensor {name!r}")
-                                sys.exit()
-
-                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
-                            self.gguf_writer.add_tensor(new_name, data)
-                continue
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
|
2641
|
-
if self.ftype == 0 and data_dtype == np.float16:
|
2642
|
-
data = data.astype(np.float32)
|
2643
|
-
|
2644
|
-
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
2645
|
-
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
|
2646
|
-
data = data.astype(np.float32)
|
2647
|
-
|
2648
|
-
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
2649
|
-
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
2650
|
-
data = data.astype(np.float16)
|
2651
|
-
|
2652
|
-
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
2653
|
-
|
2654
|
-
self.gguf_writer.add_tensor(new_name, data)
|
2655
|
-
|
2656
|
-
if len(experts) > 0:
|
2657
|
-
raise ValueError(f"Unprocessed experts: {experts.keys()}")
|
2658
|
-
=======
|
2659
1699
|
_experts: list[dict[str, Tensor]] | None = None
|
2660
1700
|
|
2661
1701
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
@@ -2702,7 +1742,6 @@ class Qwen2MoeModel(Model):
             experts = [k for d in self._experts for k in d.keys()]
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
->>>>>>> uupstream/master


 @Model.register("GPT2LMHeadModel")
@@ -2735,42 +1774,11 @@ class GPT2Model(Model):

         tensors.append((new_name, data_torch))

-<<<<<<< HEAD
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-            # note: GPT2 output is tied to (same as) wte in original model
-            if new_name == "token_embd.weight":
-                logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-                self.gguf_writer.add_tensor("output.weight", data)
-=======
         # note: GPT2 output is tied to (same as) wte in original model
         if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
             tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))

         return tensors
->>>>>>> uupstream/master


 @Model.register("PhiForCausalLM")
@@ -2810,12 +1818,8 @@ class Phi3MiniModel(Model):
         if not tokenizer_path.is_file():
             raise ValueError(f'Error: Missing {tokenizer_path}')

-<<<<<<< HEAD
-        tokenizer = SentencePieceProcessor(str(tokenizer_path))
-=======
         tokenizer = SentencePieceProcessor()
         tokenizer.LoadFromFile(str(tokenizer_path))
->>>>>>> uupstream/master

         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

@@ -2825,20 +1829,6 @@ class Phi3MiniModel(Model):

         for token_id in range(tokenizer.vocab_size()):

-<<<<<<< HEAD
-            piece = tokenizer.id_to_piece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.get_score(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.is_unknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.is_control(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.is_unused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.is_byte(token_id):
-=======
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)
@@ -2851,7 +1841,6 @@ class Phi3MiniModel(Model):
             elif tokenizer.IsUnused(token_id):
                 toktype = SentencePieceTokenTypes.UNUSED
             elif tokenizer.IsByte(token_id):
->>>>>>> uupstream/master
                 toktype = SentencePieceTokenTypes.BYTE

             tokens[token_id] = text
@@ -2873,8 +1862,6 @@ class Phi3MiniModel(Model):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

-<<<<<<< HEAD
-=======
         tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
         if tokenizer_config_file.is_file():
             with open(tokenizer_config_file, "r", encoding="utf-8") as f:
@@ -2907,7 +1894,6 @@ class Phi3MiniModel(Model):
                     if foken_data.get("special"):
                         toktypes[token_id] = SentencePieceTokenTypes.CONTROL

->>>>>>> uupstream/master
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
@@ -2920,25 +1906,6 @@ class Phi3MiniModel(Model):
     def set_gguf_parameters(self):
         block_count = self.find_hparam(["num_hidden_layers", "n_layer"])

-<<<<<<< HEAD
-        rot_pct = 1.0
-        n_embd = self.find_hparam(["hidden_size", "n_embd"])
-        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-        rms_eps = self.find_hparam(["rms_norm_eps"])
-
-        self.gguf_writer.add_name("Phi3")
-        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
-
-        self.gguf_writer.add_embedding_length(n_embd)
-        self.gguf_writer.add_feed_forward_length(8192)
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(n_head)
-        self.gguf_writer.add_head_count_kv(n_head)
-        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
-        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
-        self.gguf_writer.add_file_type(self.ftype)
-
-=======
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
         n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
@@ -2992,7 +1959,6 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
         self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))

->>>>>>> uupstream/master

 @Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
@@ -3034,45 +2000,6 @@ class PlamoModel(Model):

         new_name = self.map_tensor_name(name)

-<<<<<<< HEAD
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            # shuffle for broadcasting of gqa in ggml_mul_mat
-            if new_name.endswith("attn_q.weight"):
-                data_torch = self.shuffle_attn_q_weight(data_torch)
-            elif new_name.endswith("attn_output.weight"):
-                data_torch = self.shuffle_attn_output_weight(data_torch)
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         # shuffle for broadcasting of gqa in ggml_mul_mat
         if new_name.endswith("attn_q.weight"):
             data_torch = self.shuffle_attn_q_weight(data_torch)
@@ -3080,7 +2007,6 @@ class PlamoModel(Model):
             data_torch = self.shuffle_attn_output_weight(data_torch)

         return [(new_name, data_torch)]
->>>>>>> uupstream/master


 @Model.register("CodeShellForCausalLM")
@@ -3113,41 +2039,11 @@ class CodeShellModel(Model):
         if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
             assert self.tensor_names is not None

-<<<<<<< HEAD
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-            if not has_lm_head and name == "transformer.wte.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-                logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-=======
             if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
                 # copy tok_embd.weight to output.weight
                 tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))

         return tensors
->>>>>>> uupstream/master


 @Model.register("InternLM2ForCausalLM")
@@ -3188,13 +2084,8 @@ class InternLM2Model(Model):
             if text == b"\x00":
                 # (TODO): fixme
                 # Hack here and replace the \x00 characters.
-<<<<<<< HEAD
-                logger.debug(f"InternLM2 convert token '{text}' to '🐉'!")
-                text = "🐉"
-=======
                 logger.warning(f"InternLM2 convert token '{text}' to '🐉'!")
                 text = "🐉".encode("utf-8")
->>>>>>> uupstream/master

             toktype = SentencePieceTokenTypes.NORMAL
             if tokenizer.IsUnknown(token_id):
@@ -3270,51 +2161,10 @@ in chat mode so that the conversation can end normally.")
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)

-<<<<<<< HEAD
-    def post_write_tensors(self, tensor_map, name, data_torch):
-        old_dtype = data_torch.dtype
-
-        # convert any unsupported data types to float32
-        if data_torch.dtype not in (torch.float16, torch.float32):
-            data_torch = data_torch.to(torch.float32)
-
-        data = data_torch.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-        if new_name is None:
-            raise ValueError(f"Can not map tensor {name!r}")
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if self.ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-        self.gguf_writer.add_tensor(new_name, data)
-
-    def write_tensors(self):
-        from einops import rearrange
-
-        num_heads = self.hparams.get("num_attention_heads")
-        num_kv_heads = self.hparams.get("num_key_value_heads")
-        hidden_size = self.hparams.get("hidden_size")
-=======
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
         hidden_size = self.hparams["hidden_size"]
->>>>>>> uupstream/master
         q_per_kv = num_heads // num_kv_heads
         head_dim = hidden_size // num_heads
         num_groups = num_heads // q_per_kv
@@ -3408,43 +2258,11 @@ class BertModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

-<<<<<<< HEAD
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-            n_dims = len(data.shape)
-            new_dtype: type[np.floating[Any]]
-
-            if (
-                self.ftype == 1 and name.endswith(".weight") and n_dims == 2
-                and name != "embeddings.token_type_embeddings.weight"  # not used with get_rows, must be F32
-            ):
-                # if f16 desired, convert any float32 2-dim weight tensors to float16
-                new_dtype = np.float16
-            else:
-                # if f32 desired, convert any float16 to float32
-                new_dtype = np.float32
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
-
-            if data.dtype != new_dtype:
-                data = data.astype(new_dtype)
-
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return []  # we don't need these

         return [(self.map_tensor_name(name), data_torch)]
->>>>>>> uupstream/master


 @Model.register("NomicBertModel")
@@ -3513,53 +2331,17 @@ class GemmaModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

-<<<<<<< HEAD
-        for name, data_torch in self.get_tensors():
-            # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
-            # To prevent errors, skip loading lm_head.weight.
-            if name == "lm_head.weight":
-                logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
-                continue
-
-            old_dtype = data_torch.dtype
-=======
         # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
         # To prevent errors, skip loading lm_head.weight.
         if name == "lm_head.weight":
             logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
             return []
->>>>>>> uupstream/master

         # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
         if name.endswith("norm.weight"):
             data_torch = data_torch + 1

-<<<<<<< HEAD
-            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
-            if name.endswith("norm.weight"):
-                data_torch = data_torch + 1
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         return [(self.map_tensor_name(name), data_torch)]
->>>>>>> uupstream/master


 @Model.register("Starcoder2ForCausalLM")
@@ -3591,17 +2373,10 @@ class MambaModel(Model):
         neox_reader = gguf.GGUFReader(tokenizer_path, "r")

         field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
-<<<<<<< HEAD
-        self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
-
-        field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
-        self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
-=======
         self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")

         field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
         self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
->>>>>>> uupstream/master

         field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
         assert field
@@ -3616,17 +2391,10 @@ class MambaModel(Model):
         self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])

         field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
-<<<<<<< HEAD
-        self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
-
-        field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
-        self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
-=======
         self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)

         field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
         self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
->>>>>>> uupstream/master

         field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
         self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
@@ -3671,24 +2439,6 @@ class MambaModel(Model):

         new_name = self.map_tensor_name(name)

-<<<<<<< HEAD
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            if name.endswith(".A_log"):
-                logger.debug("A_log --> A ==> " + new_name)
-                data_torch = -torch.exp(data_torch)
-
-            # assuming token_embd.weight is seen before output.weight
-            if tok_embd is not None and new_name == output_name:
-                if torch.equal(tok_embd, data_torch):
-                    logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
-                    continue
-            if new_name == tok_embd_name:
-                tok_embd = data_torch
-=======
         if name.endswith(".A_log"):
             logger.debug("A_log --> A ==> " + new_name)
             data_torch = -torch.exp(data_torch)
@@ -3702,32 +2452,10 @@ class MambaModel(Model):
             self._tok_embd = data_torch

         return [(new_name, data_torch)]
->>>>>>> uupstream/master

     def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
         del n_dims  # unused

-<<<<<<< HEAD
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert big float32 2-dim weight tensors to float16
-            new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
-            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-=======
         return bid is not None and new_name in (
             self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
                 gguf.MODEL_TENSOR.SSM_CONV1D,
@@ -3737,7 +2465,6 @@ class MambaModel(Model):
                 gguf.MODEL_TENSOR.SSM_D,
             ]
         )
->>>>>>> uupstream/master


 @Model.register("CohereForCausalLM")
@@ -3772,52 +2499,6 @@ class OlmoModel(Model):

     # Same as super class, but permuting q_proj, k_proj
     # Copied from: LlamaModel
-<<<<<<< HEAD
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_head = self.hparams.get("num_attention_heads")
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        for name, data_torch in self.get_tensors():
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.numpy()
-
-            if name.endswith("q_proj.weight"):
-                data = permute(data, n_head, n_head)
-            if name.endswith("k_proj.weight"):
-                data = permute(data, n_head, n_kv_head)
-
-            data = data.squeeze()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                raise ValueError(f"Can not map tensor {name!r}")
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # 1d tensors need to be converted to float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
-                data = data.astype(np.float16)
-
-            logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-=======
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

@@ -4018,7 +2699,6 @@ class ArcticModel(Model):
             experts = [k for d in self._experts for k in d.keys()]
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
->>>>>>> uupstream/master


 ###### CONVERSION LOGIC ######
@@ -4090,11 +2770,6 @@ def parse_args() -> argparse.Namespace:
         "model", type=Path,
         help="directory containing model file",
     )
-<<<<<<< HEAD
-    parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
-    parser.add_argument("--model-name", type=str, default=None, help="name of the model")
-    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
-=======
     parser.add_argument(
         "--use-temp-file", action="store_true",
         help="use the tempfile library while processing (helpful when running out of memory, process killed)",
@@ -4111,7 +2786,6 @@ def parse_args() -> argparse.Namespace:
         "--verbose", action="store_true",
         help="increase output verbosity",
     )
->>>>>>> uupstream/master

     return parser.parse_args()

@@ -4160,11 +2834,7 @@ def main() -> None:

     with torch.inference_mode():
         model_class = Model.from_model_architecture(hparams["architectures"][0])
-<<<<<<< HEAD
-        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
-=======
         model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
->>>>>>> uupstream/master

         logger.info("Set model parameters")
         model_instance.set_gguf_parameters()
@@ -4175,15 +2845,6 @@ def main() -> None:
         model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

         if args.vocab_only:
-<<<<<<< HEAD
-            logger.info(f"Exporting model vocab to '{fname_out}'")
-            model_instance.write_vocab()
-        else:
-            logger.info(f"Exporting model to '{fname_out}'")
-            model_instance.write()
-
-        logger.info(f"Model successfully exported to '{fname_out}'")
-=======
             logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
             model_instance.write_vocab()
         else:
@@ -4191,7 +2852,6 @@ def main() -> None:
             model_instance.write()

         logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
->>>>>>> uupstream/master


 if __name__ == '__main__':