bigdl-core-cpp 2.5.0b20240826__py3-none-win_amd64.whl → 2.5.0b20240827__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +413 -67
- bigdl/cpp/convert_hf_to_gguf_update.py +354 -0
- bigdl/cpp/convert_llama_ggml_to_gguf.py +454 -0
- bigdl/cpp/convert_lora_to_gguf.py +393 -0
- bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
- bigdl/cpp/gguf-py/gguf/constants.py +71 -2
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +16 -1
- bigdl/cpp/gguf-py/gguf/lazy.py +4 -1
- bigdl/cpp/gguf-py/gguf/metadata.py +70 -63
- bigdl/cpp/gguf-py/gguf/quants.py +1129 -64
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +23 -15
- bigdl/cpp/gguf-py/gguf/utility.py +1 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +301 -1
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/{gguf.exe → dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll} +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/{ggml_shared.dll → ggml.dll} +0 -0
- bigdl/cpp/libs/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama-lookup.exe +0 -0
- bigdl/cpp/libs/{ls-sycl-device.exe → llama-ls-sycl-device.exe} +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama-server.exe +0 -0
- bigdl/cpp/libs/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- {bigdl_core_cpp-2.5.0b20240826.data → bigdl_core_cpp-2.5.0b20240827.data}/scripts/init-llama-cpp.bat +7 -2
- {bigdl_core_cpp-2.5.0b20240826.data → bigdl_core_cpp-2.5.0b20240827.data}/scripts/init-ollama.bat +6 -0
- {bigdl_core_cpp-2.5.0b20240826.dist-info → bigdl_core_cpp-2.5.0b20240827.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.5.0b20240827.dist-info/RECORD +54 -0
- bigdl/cpp/convert.py +0 -1714
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- bigdl_core_cpp-2.5.0b20240826.dist-info/RECORD +0 -63
- {bigdl_core_cpp-2.5.0b20240826.data → bigdl_core_cpp-2.5.0b20240827.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0b20240826.dist-info → bigdl_core_cpp-2.5.0b20240827.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0b20240826.dist-info → bigdl_core_cpp-2.5.0b20240827.dist-info}/top_level.txt +0 -0
@@ -251,12 +251,7 @@ class Model:
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del name, new_name, bid, n_dims  # unused
-
-        return False
-
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims  # unused
 
         return False
@@ -285,54 +280,47 @@ class Model:
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
                 data: np.ndarray  # type hint
                 n_dims = len(data.shape)
-                data_dtype = data.dtype
-                data_qtype: gguf.GGMLQuantizationType | None = None
-
-                # when both are True, f32 should win
-                extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
-                extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
-                extra_f32 = any(cond for cond in (
-                    extra_f32,
-                    n_dims == 1,
-                    new_name.endswith("_norm.weight"),
-                ))
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                 # Some tensor types are always in float32
-                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
-                    gguf.MODEL_TENSOR.FFN_GATE_INP,
-                    gguf.MODEL_TENSOR.POS_EMBD,
-                    gguf.MODEL_TENSOR.TOKEN_TYPES,
-                ))
-
-                # if f16 desired, convert any float32 2-dim weight tensors to float16
-                extra_f16 = any(cond for cond in (
-                    extra_f16,
-                    (name.endswith(".weight") and n_dims >= 2),
-                ))
-
-                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        data = gguf.quantize_bf16(data)
-                        assert data.dtype == np.int16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
-
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
-                        data = gguf.quantize_q8_0(data)
-                        assert data.dtype == np.uint8
-                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                if data_qtype is False and (
+                    any(
+                        self.match_model_tensor_name(new_name, key, bid)
+                        for key in (
+                            gguf.MODEL_TENSOR.FFN_GATE_INP,
+                            gguf.MODEL_TENSOR.POS_EMBD,
+                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                            gguf.MODEL_TENSOR.SSM_CONV1D,
+                        )
+                    )
+                    or not name.endswith(".weight")
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
-                    else:  # default to float16 for quantized tensors
-                        if data_dtype != np.float16:
-                            data = data.astype(np.float16)
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+                if isinstance(data_qtype, bool):
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                         data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
 
-                if data_qtype is None:  # by default, convert to float32
-                    if data_dtype != np.float32:
-                        data = data.astype(np.float32)
-                    data_qtype = gguf.GGMLQuantizationType.F32
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
 
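Note (editorial, not part of the diff): the hunk above replaces the old extra_f32/extra_f16 pair with a single tensor_force_quant() hook whose result is either a concrete gguf.GGMLQuantizationType or a bool (False meaning "no opinion", True meaning "quantize if the target file type allows it"). A minimal, self-contained sketch of how that resolution behaves follows; QType and resolve_qtype are stand-ins invented for illustration, and the sketch ignores the always-F32 tensor classes and the gguf.quants.quantize() fallback handled in the real code.

    from enum import Enum

    class QType(Enum):  # stand-in for gguf.GGMLQuantizationType
        F32 = "F32"
        F16 = "F16"
        BF16 = "BF16"
        Q8_0 = "Q8_0"

    def resolve_qtype(override, ftype: str, n_dims: int, new_name: str) -> QType:
        qtype = override                    # what a model's tensor_force_quant() returned
        if n_dims <= 1 or new_name.endswith("_norm.weight"):
            qtype = QType.F32               # 1D tensors and norms are kept in F32
        if isinstance(qtype, bool):         # no concrete override: fall back to the file type
            qtype = {"ALL_F32": QType.F32, "MOSTLY_F16": QType.F16,
                     "MOSTLY_BF16": QType.BF16, "MOSTLY_Q8_0": QType.Q8_0}[ftype]
        return qtype

    assert resolve_qtype(False, "MOSTLY_Q8_0", 2, "blk.0.attn_q.weight") is QType.Q8_0
    assert resolve_qtype(True, "MOSTLY_BF16", 1, "blk.0.attn_norm.weight") is QType.F32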
@@ -603,6 +591,15 @@ class Model:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"
 
         if res is None:
             logger.warning("\n")
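Note (editorial, not part of the diff): each chkhsh entry above keys a pre-tokenizer name off a hash of how the model's tokenizer splits a fixed probe string, and the new convert_hf_to_gguf_update.py listed at the top of this diff regenerates these entries. The fingerprinting is assumed to work roughly like the sketch below; the real probe text and lookup live in get_vocab_base_pre().

    from hashlib import sha256

    def tokenizer_fingerprint(token_ids: list[int]) -> str:
        # hash the repr of the token ids produced for a fixed probe string; tokenizers
        # with identical pre-tokenization behaviour yield identical digests
        return sha256(str(token_ids).encode()).hexdigest()

    # example ids, purely illustrative
    print(tokenizer_fingerprint([3923, 374, 264, 4037, 3213]))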
@@ -906,7 +903,7 @@ class GPTNeoXModel(Model):
         return tensors
 
 
-@Model.register("BloomForCausalLM")
+@Model.register("BloomForCausalLM", "BloomModel")
 class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM
 
@@ -1575,6 +1572,7 @@ class LlamaModel(Model):
                 base = self.hparams.get("rope_theta", 10000.0)
                 dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
                 factor = rope_scaling.get("factor", 8.0)
                 low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
                 high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
@@ -1764,7 +1762,7 @@ class DbrxModel(Model):
 
         return [(new_name, data_torch)]
 
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid  # unused
 
         return n_dims > 1
@@ -1791,7 +1789,7 @@ class MiniCPMModel(Model):
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
-            n_head //= n_kv_head
+            n_head = n_kv_head
 
         return (
             weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -2505,6 +2503,112 @@ class NomicBertModel(BertModel):
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
 
+@Model.register("XLMRobertaModel")
+class XLMRobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(1)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA
@@ -2608,7 +2712,7 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
-@Model.register("MambaForCausalLM", "MambaLMHeadModel")
+@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
 
@@ -2639,7 +2743,10 @@ class MambaModel(Model):
         # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
         dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
         rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
-
+        use_dt_b_c_norm = False
+        # For falconmamba we do apply RMS norm on B / DT and C layers
+        if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
+            use_dt_b_c_norm = True
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model
 
@@ -2647,12 +2754,13 @@ class MambaModel(Model):
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
         self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
-        self.gguf_writer.add_block_count(self.
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_ssm_conv_kernel(d_conv)
         self.gguf_writer.add_ssm_inner_size(d_inner)
         self.gguf_writer.add_ssm_state_size(d_state)
         self.gguf_writer.add_ssm_time_step_rank(dt_rank)
         self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
         self.gguf_writer.add_file_type(self.ftype)
 
     _tok_embd = None
@@ -2679,19 +2787,6 @@ class MambaModel(Model):
 
         return [(new_name, data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del n_dims  # unused
-
-        return bid is not None and new_name in (
-            self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
-                gguf.MODEL_TENSOR.SSM_CONV1D,
-                gguf.MODEL_TENSOR.SSM_X,
-                gguf.MODEL_TENSOR.SSM_DT,
-                gguf.MODEL_TENSOR.SSM_A,
-                gguf.MODEL_TENSOR.SSM_D,
-            ]
-        )
-
 
 @Model.register("CohereForCausalLM")
 class CommandR2Model(Model):
@@ -3226,6 +3321,145 @@ class T5Model(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("T5EncoderModel")
+class T5EncoderModel(Model):
+    model_arch = gguf.MODEL_ARCH.T5ENCODER
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.shared_token_embeddings_found = False
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        # many older models use spiece.model tokenizer model filename
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
+        if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
+            # assure the tokenizer model file name is correct
+            assert tokenizer_path.name == 'tokenizer.model'
+            return self._set_vocab_sentencepiece()
+        else:
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+            n_ctx = 512
+        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+        # and decoder and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("JAISLMHeadModel")
 class JaisModel(Model):
     model_arch = gguf.MODEL_ARCH.JAIS
@@ -3497,8 +3731,120 @@ class ChatGLMModel(Model):
         name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]
 
-###### CONVERSION LOGIC ######
 
+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        # model.layers.{l}.input_layernorm.weight
+        # model.layers.{l}.post_attention_layernorm.weight
+        # model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("ExaoneForCausalLM")
+class ExaoneModel(Model):
+    model_arch = gguf.MODEL_ARCH.EXAONE
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        assert (hparams["activation_function"] == "silu")
+
+        max_position_embeddings = hparams["max_position_embeddings"]
+        embed_dim = hparams["hidden_size"]
+        num_heads = hparams["num_attention_heads"]
+        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
+        layer_norm_eps = hparams["layer_norm_epsilon"]
+        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+        num_layers = hparams["num_layers"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
+        # attention_dropout_rate = hparams["attention_dropout"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
+        # embed_dropout_rate = hparams["embed_dropout"]
+        self.gguf_writer.add_embedding_length(embed_dim)
+        self.gguf_writer.add_head_count(num_heads)
+        self.gguf_writer.add_head_count_kv(num_kv_heads)
+        self.gguf_writer.add_context_length(max_position_embeddings)
+        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_block_count(num_layers)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
+            if hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
+        super().prepare_tensors()
+
+
+###### CONVERSION LOGIC ######
 
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):