bigdl-core-cpp 2.5.0b20240826__py3-none-win_amd64.whl → 2.5.0b20240827__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +413 -67
  2. bigdl/cpp/convert_hf_to_gguf_update.py +354 -0
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +454 -0
  4. bigdl/cpp/convert_lora_to_gguf.py +393 -0
  5. bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/constants.py +71 -2
  7. bigdl/cpp/gguf-py/gguf/gguf_writer.py +16 -1
  8. bigdl/cpp/gguf-py/gguf/lazy.py +4 -1
  9. bigdl/cpp/gguf-py/gguf/metadata.py +70 -63
  10. bigdl/cpp/gguf-py/gguf/quants.py +1129 -64
  11. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +23 -15
  12. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  13. bigdl/cpp/gguf-py/gguf/vocab.py +301 -1
  14. bigdl/cpp/libs/common.lib +0 -0
  15. bigdl/cpp/libs/{gguf.exe → dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll} +0 -0
  16. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
  17. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
  18. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
  19. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
  20. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
  21. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
  22. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
  23. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
  24. bigdl/cpp/libs/{ggml_shared.dll → ggml.dll} +0 -0
  25. bigdl/cpp/libs/llama-batched.exe +0 -0
  26. bigdl/cpp/libs/llama-bench.exe +0 -0
  27. bigdl/cpp/libs/llama-cli.exe +0 -0
  28. bigdl/cpp/libs/llama-embedding.exe +0 -0
  29. bigdl/cpp/libs/llama-gguf.exe +0 -0
  30. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  31. bigdl/cpp/libs/llama-lookup.exe +0 -0
  32. bigdl/cpp/libs/{ls-sycl-device.exe → llama-ls-sycl-device.exe} +0 -0
  33. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  34. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  35. bigdl/cpp/libs/llama-quantize.exe +0 -0
  36. bigdl/cpp/libs/llama-server.exe +0 -0
  37. bigdl/cpp/libs/llama-simple.exe +0 -0
  38. bigdl/cpp/libs/llama-speculative.exe +0 -0
  39. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  40. bigdl/cpp/libs/llama.dll +0 -0
  41. bigdl/cpp/libs/llava_shared.dll +0 -0
  42. bigdl/cpp/libs/ollama.exe +0 -0
  43. {bigdl_core_cpp-2.5.0b20240826.data → bigdl_core_cpp-2.5.0b20240827.data}/scripts/init-llama-cpp.bat +7 -2
  44. {bigdl_core_cpp-2.5.0b20240826.data → bigdl_core_cpp-2.5.0b20240827.data}/scripts/init-ollama.bat +6 -0
  45. {bigdl_core_cpp-2.5.0b20240826.dist-info → bigdl_core_cpp-2.5.0b20240827.dist-info}/METADATA +1 -1
  46. bigdl_core_cpp-2.5.0b20240827.dist-info/RECORD +54 -0
  47. bigdl/cpp/convert.py +0 -1714
  48. bigdl/cpp/libs/baby-llama.exe +0 -0
  49. bigdl/cpp/libs/batched-bench.exe +0 -0
  50. bigdl/cpp/libs/batched.exe +0 -0
  51. bigdl/cpp/libs/beam-search.exe +0 -0
  52. bigdl/cpp/libs/benchmark.exe +0 -0
  53. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  54. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  55. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  56. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  57. bigdl/cpp/libs/embedding.exe +0 -0
  58. bigdl/cpp/libs/export-lora.exe +0 -0
  59. bigdl/cpp/libs/finetune.exe +0 -0
  60. bigdl/cpp/libs/gritlm.exe +0 -0
  61. bigdl/cpp/libs/imatrix.exe +0 -0
  62. bigdl/cpp/libs/infill.exe +0 -0
  63. bigdl/cpp/libs/llava-cli.exe +0 -0
  64. bigdl/cpp/libs/lookahead.exe +0 -0
  65. bigdl/cpp/libs/lookup.exe +0 -0
  66. bigdl/cpp/libs/main.exe +0 -0
  67. bigdl/cpp/libs/parallel.exe +0 -0
  68. bigdl/cpp/libs/passkey.exe +0 -0
  69. bigdl/cpp/libs/perplexity.exe +0 -0
  70. bigdl/cpp/libs/q8dot.exe +0 -0
  71. bigdl/cpp/libs/quantize-stats.exe +0 -0
  72. bigdl/cpp/libs/quantize.exe +0 -0
  73. bigdl/cpp/libs/save-load-state.exe +0 -0
  74. bigdl/cpp/libs/server.exe +0 -0
  75. bigdl/cpp/libs/simple.exe +0 -0
  76. bigdl/cpp/libs/speculative.exe +0 -0
  77. bigdl/cpp/libs/tokenize.exe +0 -0
  78. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  79. bigdl/cpp/libs/vdot.exe +0 -0
  80. bigdl_core_cpp-2.5.0b20240826.dist-info/RECORD +0 -63
  81. {bigdl_core_cpp-2.5.0b20240826.data → bigdl_core_cpp-2.5.0b20240827.data}/scripts/init-llama-cpp.ps1 +0 -0
  82. {bigdl_core_cpp-2.5.0b20240826.dist-info → bigdl_core_cpp-2.5.0b20240827.dist-info}/WHEEL +0 -0
  83. {bigdl_core_cpp-2.5.0b20240826.dist-info → bigdl_core_cpp-2.5.0b20240827.dist-info}/top_level.txt +0 -0
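Alongside the binary and library updates, the converter entry points move from hyphenated to underscore names (convert-hf-to-gguf.py becomes convert_hf_to_gguf.py, with new convert_hf_to_gguf_update.py, convert_llama_ggml_to_gguf.py and convert_lora_to_gguf.py scripts) and the legacy convert.py is dropped. Invocation should change only in the filename; a minimal sketch, assuming the upstream llama.cpp flags (--outfile, --outtype) and a hypothetical local model directory:

python bigdl/cpp/convert_hf_to_gguf.py ./my-hf-model --outfile my-model-q8_0.gguf --outtype q8_0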
@@ -251,12 +251,7 @@ class Model:
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del name, new_name, bid, n_dims  # unused
-
-        return False
-
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims  # unused
 
         return False
@@ -285,54 +280,47 @@ class Model:
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
                 data: np.ndarray  # type hint
                 n_dims = len(data.shape)
-                data_dtype = data.dtype
-                data_qtype: gguf.GGMLQuantizationType | None = None
-
-                # when both are True, f32 should win
-                extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
-                extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
-                extra_f32 = any(cond for cond in (
-                    extra_f32,
-                    n_dims == 1,
-                    new_name.endswith("_norm.weight"),
-                ))
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                 # Some tensor types are always in float32
-                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
-                    gguf.MODEL_TENSOR.FFN_GATE_INP,
-                    gguf.MODEL_TENSOR.POS_EMBD,
-                    gguf.MODEL_TENSOR.TOKEN_TYPES,
-                ))
-
-                # if f16 desired, convert any float32 2-dim weight tensors to float16
-                extra_f16 = any(cond for cond in (
-                    extra_f16,
-                    (name.endswith(".weight") and n_dims >= 2),
-                ))
-
-                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        data = gguf.quantize_bf16(data)
-                        assert data.dtype == np.int16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
-
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
-                        data = gguf.quantize_q8_0(data)
-                        assert data.dtype == np.uint8
-                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                if data_qtype is False and (
+                    any(
+                        self.match_model_tensor_name(new_name, key, bid)
+                        for key in (
+                            gguf.MODEL_TENSOR.FFN_GATE_INP,
+                            gguf.MODEL_TENSOR.POS_EMBD,
+                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                            gguf.MODEL_TENSOR.SSM_CONV1D,
+                        )
+                    )
+                    or not name.endswith(".weight")
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.F32
 
-                    else:  # default to float16 for quantized tensors
-                        if data_dtype != np.float16:
-                            data = data.astype(np.float16)
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+                if isinstance(data_qtype, bool):
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                         data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
 
-                if data_qtype is None:  # by default, convert to float32
-                    if data_dtype != np.float32:
-                        data = data.astype(np.float32)
-                    data_qtype = gguf.GGMLQuantizationType.F32
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
 
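Taken together, the two hunks above replace the extra_f32_tensors/extra_f16_tensors pair with a single tensor_force_quant hook and route the final dtype conversion through gguf.quants.quantize, falling back to F16 when a tensor cannot be quantized. A minimal sketch of how a model subclass now opts its matrices into the selected output type under the new hook (the class name here is hypothetical; the DbrxModel hunk further down does exactly this):

class MyModel(Model):  # hypothetical subclass, for illustration only
    model_arch = gguf.MODEL_ARCH.LLAMA

    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
        del name, new_name, bid  # unused, same pattern as the base class
        # True asks for the target file type (F16/BF16/Q8_0) where possible,
        # False leaves the default name-based rules in charge, and returning a
        # concrete gguf.GGMLQuantizationType pins the tensor to that exact type
        # (1D tensors and *_norm.weight are still forced to F32 either way).
        return n_dims > 1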
@@ -603,6 +591,15 @@ class Model:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"
 
         if res is None:
             logger.warning("\n")
@@ -906,7 +903,7 @@ class GPTNeoXModel(Model):
         return tensors
 
 
-@Model.register("BloomForCausalLM")
+@Model.register("BloomForCausalLM", "BloomModel")
 class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM
 
@@ -1575,6 +1572,7 @@ class LlamaModel(Model):
                 base = self.hparams.get("rope_theta", 10000.0)
                 dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
                 factor = rope_scaling.get("factor", 8.0)
                 low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
                 high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
@@ -1764,7 +1762,7 @@ class DbrxModel(Model):
 
         return [(new_name, data_torch)]
 
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid  # unused
 
         return n_dims > 1
@@ -1791,7 +1789,7 @@ class MiniCPMModel(Model):
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
-            n_head = n_kv_head
+            n_head = n_kv_head
 
         return (
             weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -2505,6 +2503,112 @@ class NomicBertModel(BertModel):
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
 
+@Model.register("XLMRobertaModel")
+class XLMRobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(1)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA
@@ -2608,7 +2712,7 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
-@Model.register("MambaForCausalLM", "MambaLMHeadModel")
+@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
 
@@ -2639,7 +2743,10 @@ class MambaModel(Model):
         # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
         dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
         rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
-
+        use_dt_b_c_norm = False
+        # For falconmamba we do apply RMS norm on B / DT and C layers
+        if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
+            use_dt_b_c_norm = True
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model
 
@@ -2647,12 +2754,13 @@ class MambaModel(Model):
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
         self.gguf_writer.add_head_count(0)  # unused, but seemingly required when loading
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_ssm_conv_kernel(d_conv)
         self.gguf_writer.add_ssm_inner_size(d_inner)
         self.gguf_writer.add_ssm_state_size(d_state)
         self.gguf_writer.add_ssm_time_step_rank(dt_rank)
         self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm)  # For classic Mamba we don't apply rms norm on B / DT layers
         self.gguf_writer.add_file_type(self.ftype)
 
     _tok_embd = None
@@ -2679,19 +2787,6 @@ class MambaModel(Model):
 
         return [(new_name, data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del n_dims  # unused
-
-        return bid is not None and new_name in (
-            self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
-                gguf.MODEL_TENSOR.SSM_CONV1D,
-                gguf.MODEL_TENSOR.SSM_X,
-                gguf.MODEL_TENSOR.SSM_DT,
-                gguf.MODEL_TENSOR.SSM_A,
-                gguf.MODEL_TENSOR.SSM_D,
-            ]
-        )
-
 
 @Model.register("CohereForCausalLM")
 class CommandR2Model(Model):
@@ -3226,6 +3321,145 @@ class T5Model(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("T5EncoderModel")
+class T5EncoderModel(Model):
+    model_arch = gguf.MODEL_ARCH.T5ENCODER
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.shared_token_embeddings_found = False
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        # many older models use spiece.model tokenizer model filename
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
+        if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
+            # assure the tokenizer model file name is correct
+            assert tokenizer_path.name == 'tokenizer.model'
+            return self._set_vocab_sentencepiece()
+        else:
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+            n_ctx = 512
+        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+        # and decoder and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("JAISLMHeadModel")
 class JaisModel(Model):
     model_arch = gguf.MODEL_ARCH.JAIS
@@ -3497,8 +3731,120 @@ class ChatGLMModel(Model):
         name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]
 
-###### CONVERSION LOGIC ######
 
+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        #   model.layers.{l}.input_layernorm.weight
+        #   model.layers.{l}.post_attention_layernorm.weight
+        #   model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("ExaoneForCausalLM")
+class ExaoneModel(Model):
+    model_arch = gguf.MODEL_ARCH.EXAONE
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        assert (hparams["activation_function"] == "silu")
+
+        max_position_embeddings = hparams["max_position_embeddings"]
+        embed_dim = hparams["hidden_size"]
+        num_heads = hparams["num_attention_heads"]
+        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
+        layer_norm_eps = hparams["layer_norm_epsilon"]
+        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+        num_layers = hparams["num_layers"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
+        # attention_dropout_rate = hparams["attention_dropout"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
+        # embed_dropout_rate = hparams["embed_dropout"]
+        self.gguf_writer.add_embedding_length(embed_dim)
+        self.gguf_writer.add_head_count(num_heads)
+        self.gguf_writer.add_head_count_kv(num_kv_heads)
+        self.gguf_writer.add_context_length(max_position_embeddings)
+        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_block_count(num_layers)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
+            if hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
+        super().prepare_tensors()
+
+
+###### CONVERSION LOGIC ######
 
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
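For reference, the llama3-style rope_factors loop added in ExaoneModel.prepare_tensors above assigns each RoPE frequency a per-dimension scale factor based on its wavelength. A self-contained sketch of the same rule (plain Python, default values taken from the hunk above), useful for spot-checking individual factors:

import math

def llama3_rope_factor(freq: float, factor: float = 8.0, low_freq_factor: float = 1.0,
                       high_freq_factor: float = 4.0, old_context_len: int = 8192) -> float:
    # Same rule as the loop above: short wavelengths keep factor 1, long wavelengths
    # are stretched by `factor`, and the band in between is interpolated smoothly.
    wavelen = 2 * math.pi / freq
    low_freq_wavelen = old_context_len / low_freq_factor    # 8192.0 with the defaults
    high_freq_wavelen = old_context_len / high_freq_factor  # 2048.0 with the defaults
    if wavelen < high_freq_wavelen:
        return 1.0
    if wavelen > low_freq_wavelen:
        return factor
    smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    return 1 / ((1 - smooth) / factor + smooth)

# e.g. llama3_rope_factor(2 * math.pi / 100) == 1.0 and llama3_rope_factor(2 * math.pi / 8192) == 8.0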