bigdl-core-cpp 2.6.0b20241204__py3-none-win_amd64.whl → 2.6.0b20241211__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. bigdl/cpp/convert_hf_to_gguf.py +404 -37
  2. bigdl/cpp/convert_hf_to_gguf_update.py +25 -6
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +0 -4
  4. bigdl/cpp/convert_lora_to_gguf.py +11 -1
  5. bigdl/cpp/gguf-py/gguf/constants.py +276 -81
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +25 -1
  7. bigdl/cpp/gguf-py/gguf/lazy.py +0 -1
  8. bigdl/cpp/gguf-py/gguf/quants.py +81 -0
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +135 -23
  10. bigdl/cpp/libs/common.lib +0 -0
  11. bigdl/cpp/libs/ggml.dll +0 -0
  12. bigdl/cpp/libs/llama-batched.exe +0 -0
  13. bigdl/cpp/libs/llama-bench.exe +0 -0
  14. bigdl/cpp/libs/llama-cli.exe +0 -0
  15. bigdl/cpp/libs/llama-embedding.exe +0 -0
  16. bigdl/cpp/libs/llama-gguf.exe +0 -0
  17. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  18. bigdl/cpp/libs/llama-lookup.exe +0 -0
  19. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  20. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  21. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  22. bigdl/cpp/libs/llama-quantize.exe +0 -0
  23. bigdl/cpp/libs/llama-server.exe +0 -0
  24. bigdl/cpp/libs/llama-simple.exe +0 -0
  25. bigdl/cpp/libs/llama-speculative.exe +0 -0
  26. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  27. bigdl/cpp/libs/llama.dll +0 -0
  28. bigdl/cpp/libs/llava_shared.dll +0 -0
  29. bigdl/cpp/libs/ollama.exe +0 -0
  30. {bigdl_core_cpp-2.6.0b20241204.dist-info → bigdl_core_cpp-2.6.0b20241211.dist-info}/METADATA +1 -1
  31. bigdl_core_cpp-2.6.0b20241211.dist-info/RECORD +45 -0
  32. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll +0 -0
  33. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
  34. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
  35. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
  36. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
  37. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
  38. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
  39. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
  40. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
  41. bigdl_core_cpp-2.6.0b20241204.dist-info/RECORD +0 -54
  42. {bigdl_core_cpp-2.6.0b20241204.data → bigdl_core_cpp-2.6.0b20241211.data}/scripts/init-llama-cpp.bat +0 -0
  43. {bigdl_core_cpp-2.6.0b20241204.data → bigdl_core_cpp-2.6.0b20241211.data}/scripts/init-llama-cpp.ps1 +0 -0
  44. {bigdl_core_cpp-2.6.0b20241204.data → bigdl_core_cpp-2.6.0b20241211.data}/scripts/init-ollama.bat +0 -0
  45. {bigdl_core_cpp-2.6.0b20241204.dist-info → bigdl_core_cpp-2.6.0b20241211.dist-info}/WHEEL +0 -0
  46. {bigdl_core_cpp-2.6.0b20241204.dist-info → bigdl_core_cpp-2.6.0b20241211.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_hf_to_gguf_update.py

@@ -31,6 +31,7 @@ import re
  import requests
  import sys
  import json
+ import shutil

  from hashlib import sha256
  from enum import IntEnum, auto
@@ -80,6 +81,7 @@ models = [
  {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
  {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
  {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+ {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
  {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
  {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
  {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
@@ -97,6 +99,8 @@ models = [
  {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
  {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
  {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+ {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
+ {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
  ]


@@ -125,12 +129,27 @@ def download_model(model):
  if tokt == TOKENIZER_TYPE.UGM:
  files.append("spiece.model")

- for file in files:
- save_path = f"models/tokenizers/{name}/{file}"
- if os.path.isfile(save_path):
- logger.info(f"{name}: File {save_path} already exists - skipping")
- continue
- download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+ if os.path.isdir(repo):
+ # If repo is a path on the file system, copy the directory
+ for file in files:
+ src_path = os.path.join(repo, file)
+ dst_path = f"models/tokenizers/{name}/{file}"
+ if os.path.isfile(dst_path):
+ logger.info(f"{name}: File {dst_path} already exists - skipping")
+ continue
+ if os.path.isfile(src_path):
+ shutil.copy2(src_path, dst_path)
+ logger.info(f"{name}: Copied {src_path} to {dst_path}")
+ else:
+ logger.warning(f"{name}: Source file {src_path} does not exist")
+ else:
+ # If repo is a URL, download the files
+ for file in files:
+ save_path = f"models/tokenizers/{name}/{file}"
+ if os.path.isfile(save_path):
+ logger.info(f"{name}: File {save_path} already exists - skipping")
+ continue
+ download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)


  for model in models:
bigdl/cpp/convert_llama_ggml_to_gguf.py

@@ -294,11 +294,7 @@ class GGMLToGGUF:
  if self.vocab_override is not None:
  vo = self.vocab_override
  logger.info('* Adding vocab item(s)')
- <<<<<<< HEAD:convert-llama-ggml-to-gguf.py
- for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
- =======
  for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
- >>>>>>> 1731d42:convert_llama_ggml_to_gguf.py
  tokens.append(vbytes)
  scores.append(score)
  toktypes.append(ttype)
bigdl/cpp/convert_lora_to_gguf.py

@@ -331,6 +331,10 @@ if __name__ == '__main__':
  self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
  super().set_gguf_parameters()

+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
+ return ()
+
  def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
  tensor_map: dict[str, PartialLoraTensor] = {}

@@ -363,7 +367,13 @@ if __name__ == '__main__':
  yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
- dest = super().modify_tensors(data_torch, name, bid)
+ dest = list(super().modify_tensors(data_torch, name, bid))
+ # some archs may have the same tensor for lm_head and output (tie word embeddings)
+ # in this case, adapters targeting lm_head will fail when using llama-export-lora
+ # therefore, we ignore them for now
+ # see: https://github.com/ggerganov/llama.cpp/issues/9065
+ if name == "lm_head.weight" and len(dest) == 0:
+ raise ValueError("lm_head is present in adapter, but is ignored in base model")
  for dest_name, dest_data in dest:
  assert isinstance(dest_data, LoraTorchTensor)
  lora_a, lora_b = dest_data.get_lora_A_B()
bigdl/cpp/gguf-py/gguf/constants.py

@@ -94,6 +94,12 @@ class Keys:
  DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
  ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
  FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
+ SWIN_NORM = "{arch}.swin_norm"
+ RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
+ TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
+ TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
+ RESIDUAL_SCALE = "{arch}.residual_scale"
+ EMBEDDING_SCALE = "{arch}.embedding_scale"

  class Attention:
  HEAD_COUNT = "{arch}.attention.head_count"
@@ -109,6 +115,7 @@ class Keys:
  KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
  REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
  SLIDING_WINDOW = "{arch}.attention.sliding_window"
+ SCALE = "{arch}.attention.scale"

  class Rope:
  DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -132,6 +139,9 @@ class Keys:
  TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
  DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"

+ class WKV:
+ HEAD_SIZE = "{arch}.wkv.head_size"
+
  class Tokenizer:
  MODEL = "tokenizer.ggml.model"
  PRE = "tokenizer.ggml.pre"
@@ -204,14 +214,17 @@ class MODEL_ARCH(IntEnum):
  ORION = auto()
  INTERNLM2 = auto()
  MINICPM = auto()
+ MINICPM3 = auto()
  GEMMA = auto()
  GEMMA2 = auto()
  STARCODER2 = auto()
+ RWKV6 = auto()
  MAMBA = auto()
  XVERSE = auto()
  COMMAND_R = auto()
  DBRX = auto()
  OLMO = auto()
+ OLMOE = auto()
  OPENELM = auto()
  ARCTIC = auto()
  DEEPSEEK2 = auto()
@@ -222,6 +235,9 @@ class MODEL_ARCH(IntEnum):
  JAIS = auto()
  NEMOTRON = auto()
  EXAONE = auto()
+ GRANITE = auto()
+ GRANITE_MOE = auto()
+ CHAMELEON = auto()


  class MODEL_TENSOR(IntEnum):
@@ -270,6 +286,29 @@ class MODEL_TENSOR(IntEnum):
  SSM_A = auto()
  SSM_D = auto()
  SSM_OUT = auto()
+ TIME_MIX_W1 = auto()
+ TIME_MIX_W2 = auto()
+ TIME_MIX_LERP_X = auto()
+ TIME_MIX_LERP_K = auto()
+ TIME_MIX_LERP_V = auto()
+ TIME_MIX_LERP_R = auto()
+ TIME_MIX_LERP_G = auto()
+ TIME_MIX_LERP_W = auto()
+ TIME_MIX_FIRST = auto()
+ TIME_MIX_DECAY = auto()
+ TIME_MIX_DECAY_W1 = auto()
+ TIME_MIX_DECAY_W2 = auto()
+ TIME_MIX_KEY = auto()
+ TIME_MIX_VALUE = auto()
+ TIME_MIX_RECEPTANCE = auto()
+ TIME_MIX_GATE = auto()
+ TIME_MIX_LN = auto()
+ TIME_MIX_OUTPUT = auto()
+ CHANNEL_MIX_LERP_K = auto()
+ CHANNEL_MIX_LERP_R = auto()
+ CHANNEL_MIX_KEY = auto()
+ CHANNEL_MIX_RECEPTANCE = auto()
+ CHANNEL_MIX_VALUE = auto()
  ATTN_Q_A = auto()
  ATTN_Q_B = auto()
  ATTN_KV_A_MQA = auto()
@@ -306,6 +345,8 @@ class MODEL_TENSOR(IntEnum):
  ENC_FFN_DOWN = auto()
  ENC_FFN_UP = auto()
  ENC_OUTPUT_NORM = auto()
+ CLS = auto() # classifier
+ CLS_OUT = auto() # classifier output projection


  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -334,14 +375,17 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
  MODEL_ARCH.ORION: "orion",
  MODEL_ARCH.INTERNLM2: "internlm2",
  MODEL_ARCH.MINICPM: "minicpm",
+ MODEL_ARCH.MINICPM3: "minicpm3",
  MODEL_ARCH.GEMMA: "gemma",
  MODEL_ARCH.GEMMA2: "gemma2",
  MODEL_ARCH.STARCODER2: "starcoder2",
+ MODEL_ARCH.RWKV6: "rwkv6",
  MODEL_ARCH.MAMBA: "mamba",
  MODEL_ARCH.XVERSE: "xverse",
  MODEL_ARCH.COMMAND_R: "command-r",
  MODEL_ARCH.DBRX: "dbrx",
  MODEL_ARCH.OLMO: "olmo",
+ MODEL_ARCH.OLMOE: "olmoe",
  MODEL_ARCH.OPENELM: "openelm",
  MODEL_ARCH.ARCTIC: "arctic",
  MODEL_ARCH.DEEPSEEK2: "deepseek2",
@@ -352,90 +396,118 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
  MODEL_ARCH.JAIS: "jais",
  MODEL_ARCH.NEMOTRON: "nemotron",
  MODEL_ARCH.EXAONE: "exaone",
+ MODEL_ARCH.GRANITE: "granite",
+ MODEL_ARCH.GRANITE_MOE: "granitemoe",
+ MODEL_ARCH.CHAMELEON: "chameleon",
  }

  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
- MODEL_TENSOR.TOKEN_EMBD: "token_embd",
- MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
- MODEL_TENSOR.TOKEN_TYPES: "token_types",
- MODEL_TENSOR.POS_EMBD: "position_embd",
- MODEL_TENSOR.OUTPUT_NORM: "output_norm",
- MODEL_TENSOR.OUTPUT: "output",
- MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
- MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
- MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
- MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
- MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
- MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
- MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
- MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
- MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
- MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
- MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
- MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
- MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
- MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
- MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
- MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
- MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
- MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
- MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
- MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
- MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
- MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
- MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
- MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
- MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
- MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
- MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
- MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
- MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
- MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
- MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
- MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
- MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
- MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
- MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
- MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
- MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
- MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
- MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
- MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
- MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
- MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
- MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
- MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
- MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
- MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
- MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
- MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
- MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
- MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
- MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
- MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
- MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
- MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
- MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
- MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
- MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
- MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
- MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
- MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
- MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
- MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
- MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
- MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
- MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
- MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
- MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
- MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
- MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
- MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
- MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
- MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
- MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
- MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
- MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
+ MODEL_TENSOR.TOKEN_EMBD: "token_embd",
+ MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
+ MODEL_TENSOR.TOKEN_TYPES: "token_types",
+ MODEL_TENSOR.POS_EMBD: "position_embd",
+ MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+ MODEL_TENSOR.OUTPUT: "output",
+ MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
+ MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
+ MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
+ MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
+ MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+ MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
+ MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
+ MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
+ MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
+ MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
+ MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+ MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
+ MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
+ MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
+ MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
+ MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
+ MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
+ MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
+ MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
+ MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
+ MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
+ MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
+ MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
+ MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
+ MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
+ MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
+ MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
+ MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
+ MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
+ MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
+ MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
+ MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
+ MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
+ MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
+ MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
+ MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
+ MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
+ MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
+ MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
+ MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
+ MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
+ MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
+ MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
+ MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
+ MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
+ MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
+ MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
+ MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
+ MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
+ MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
+ MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
+ MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
+ MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
+ MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
+ MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
+ MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
+ MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
+ MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
+ MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
+ MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
+ MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
+ MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
+ MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
+ MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
+ MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
+ MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
+ MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
+ MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
+ MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
+ MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
+ MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
+ MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
+ MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
+ MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
+ MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
+ MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
+ MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
+ MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
+ MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
+ MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
+ MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
+ MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
+ MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
+ MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
+ MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
+ MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
+ MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
+ MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
+ MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
+ MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
+ MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
+ MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
+ MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
+ MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
+ MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
+ MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
+ MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
+ MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
+ MODEL_TENSOR.CLS: "cls",
+ MODEL_TENSOR.CLS_OUT: "cls.output",
  }

  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -545,6 +617,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN,
  MODEL_TENSOR.FFN_UP,
  MODEL_TENSOR.LAYER_OUT_NORM,
+ MODEL_TENSOR.CLS,
+ MODEL_TENSOR.CLS_OUT,
  ],
  MODEL_ARCH.NOMIC_BERT: [
  MODEL_TENSOR.TOKEN_EMBD,
@@ -576,6 +650,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_GATE,
  MODEL_TENSOR.FFN_DOWN,
  MODEL_TENSOR.LAYER_OUT_NORM,
+ MODEL_TENSOR.CLS,
  ],
  MODEL_ARCH.MPT: [
  MODEL_TENSOR.TOKEN_EMBD,
@@ -739,6 +814,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
  MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FACTORS_LONG,
+ MODEL_TENSOR.ROPE_FACTORS_SHORT,
  MODEL_TENSOR.ATTN_NORM,
  MODEL_TENSOR.ATTN_QKV,
  MODEL_TENSOR.ATTN_Q,
@@ -813,6 +890,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN_EXP,
  MODEL_TENSOR.FFN_UP_EXP,
  ],
+ MODEL_ARCH.MINICPM3: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FACTORS_LONG,
+ MODEL_TENSOR.ROPE_FACTORS_SHORT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q_A,
+ MODEL_TENSOR.ATTN_Q_B,
+ MODEL_TENSOR.ATTN_KV_A_MQA,
+ MODEL_TENSOR.ATTN_KV_B,
+ MODEL_TENSOR.ATTN_Q_A_NORM,
+ MODEL_TENSOR.ATTN_KV_A_NORM,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
  MODEL_ARCH.GEMMA: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
@@ -856,6 +952,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN,
  MODEL_TENSOR.FFN_UP,
  ],
+ MODEL_ARCH.RWKV6: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.TOKEN_EMBD_NORM,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_NORM_2,
+ MODEL_TENSOR.TIME_MIX_W1,
+ MODEL_TENSOR.TIME_MIX_W2,
+ MODEL_TENSOR.TIME_MIX_LERP_X,
+ MODEL_TENSOR.TIME_MIX_LERP_K,
+ MODEL_TENSOR.TIME_MIX_LERP_V,
+ MODEL_TENSOR.TIME_MIX_LERP_R,
+ MODEL_TENSOR.TIME_MIX_LERP_G,
+ MODEL_TENSOR.TIME_MIX_LERP_W,
+ MODEL_TENSOR.TIME_MIX_FIRST,
+ MODEL_TENSOR.TIME_MIX_DECAY,
+ MODEL_TENSOR.TIME_MIX_DECAY_W1,
+ MODEL_TENSOR.TIME_MIX_DECAY_W2,
+ MODEL_TENSOR.TIME_MIX_KEY,
+ MODEL_TENSOR.TIME_MIX_VALUE,
+ MODEL_TENSOR.TIME_MIX_RECEPTANCE,
+ MODEL_TENSOR.TIME_MIX_GATE,
+ MODEL_TENSOR.TIME_MIX_LN,
+ MODEL_TENSOR.TIME_MIX_OUTPUT,
+ MODEL_TENSOR.CHANNEL_MIX_LERP_K,
+ MODEL_TENSOR.CHANNEL_MIX_LERP_R,
+ MODEL_TENSOR.CHANNEL_MIX_KEY,
+ MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
+ MODEL_TENSOR.CHANNEL_MIX_VALUE,
+ ],
  MODEL_ARCH.MAMBA: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
@@ -923,6 +1050,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN,
  MODEL_TENSOR.FFN_UP,
  ],
+ MODEL_ARCH.OLMOE: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q_NORM,
+ MODEL_TENSOR.ATTN_K_NORM,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ ],
  MODEL_ARCH.OPENELM: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
@@ -1101,6 +1245,51 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN,
  MODEL_TENSOR.FFN_UP,
  ],
+ MODEL_ARCH.GRANITE: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
+ MODEL_ARCH.GRANITE_MOE: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ ],
+ MODEL_ARCH.CHAMELEON: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_Q_NORM,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_K_NORM,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
  # TODO
  }

@@ -1206,6 +1395,8 @@ class GGMLQuantizationType(IntEnum):
  Q4_0_4_4 = 31
  Q4_0_4_8 = 32
  Q4_0_8_8 = 33
+ TQ1_0 = 34
+ TQ2_0 = 35


  # TODO: add GGMLFileType from ggml_ftype in ggml.h
@@ -1250,6 +1441,8 @@ class LlamaFileType(IntEnum):
  MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
  MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
  MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
+ MOSTLY_TQ1_0 = 36 # except 1d tensors
+ MOSTLY_TQ2_0 = 37 # except 1d tensors

  GUESSED = 1024 # not specified in the model file

@@ -1326,6 +1519,8 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
  GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
  GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
  GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
+ GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
+ GGMLQuantizationType.TQ2_0: (256, 2 + 64),
  }

bigdl/cpp/gguf-py/gguf/gguf_writer.py

@@ -670,6 +670,27 @@ class GGUFWriter:
  def add_expert_weights_scale(self, value: float) -> None:
  self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)

+ def add_swin_norm(self, value: bool) -> None:
+ self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
+
+ def add_rescale_every_n_layers(self, count: int) -> None:
+ self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
+
+ def add_time_mix_extra_dim(self, dim: int) -> None:
+ self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
+
+ def add_time_decay_extra_dim(self, dim: int) -> None:
+ self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
+
+ def add_residual_scale(self, value: float) -> None:
+ self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
+
+ def add_embedding_scale(self, value: float) -> None:
+ self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
+
+ def add_wkv_head_size(self, size: int) -> None:
+ self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
+
  def add_layer_norm_eps(self, value: float) -> None:
  self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

@@ -691,6 +712,9 @@ class GGUFWriter:
  def add_sliding_window(self, value: int) -> None:
  self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)

+ def add_attention_scale(self, value: float) -> None:
+ self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
+
  def add_pooling_type(self, value: PoolingType) -> None:
  self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

@@ -885,4 +909,4 @@ class GGUFWriter:
  if abs(fnum) < 1000.0:
  return f"{fnum:3.1f}{unit}"
  fnum /= 1000.0
- return f"{fnum:.1f}T - over 1TB, split recommended"
+ return f"{fnum:.1f}T - over 1TB, split recommended"
bigdl/cpp/gguf-py/gguf/lazy.py

@@ -210,5 +210,4 @@ class LazyNumpyTensor(LazyBase):
  eager = LazyNumpyTensor.to_eager(self)
  return eager.tofile(*args, **kwargs)

-
  # TODO: __array_function__