bigdl-core-cpp 2.6.0b20241204__py3-none-win_amd64.whl → 2.6.0b20241211__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert_hf_to_gguf.py +404 -37
- bigdl/cpp/convert_hf_to_gguf_update.py +25 -6
- bigdl/cpp/convert_llama_ggml_to_gguf.py +0 -4
- bigdl/cpp/convert_lora_to_gguf.py +11 -1
- bigdl/cpp/gguf-py/gguf/constants.py +276 -81
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +25 -1
- bigdl/cpp/gguf-py/gguf/lazy.py +0 -1
- bigdl/cpp/gguf-py/gguf/quants.py +81 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +135 -23
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/ggml.dll +0 -0
- bigdl/cpp/libs/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama-lookup.exe +0 -0
- bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama-server.exe +0 -0
- bigdl/cpp/libs/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- {bigdl_core_cpp-2.6.0b20241204.dist-info → bigdl_core_cpp-2.6.0b20241211.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.6.0b20241211.dist-info/RECORD +45 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl_core_cpp-2.6.0b20241204.dist-info/RECORD +0 -54
- {bigdl_core_cpp-2.6.0b20241204.data → bigdl_core_cpp-2.6.0b20241211.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.6.0b20241204.data → bigdl_core_cpp-2.6.0b20241211.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.6.0b20241204.data → bigdl_core_cpp-2.6.0b20241211.data}/scripts/init-ollama.bat +0 -0
- {bigdl_core_cpp-2.6.0b20241204.dist-info → bigdl_core_cpp-2.6.0b20241211.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.6.0b20241204.dist-info → bigdl_core_cpp-2.6.0b20241211.dist-info}/top_level.txt +0 -0
```diff
@@ -31,6 +31,7 @@ import re
 import requests
 import sys
 import json
+import shutil

 from hashlib import sha256
 from enum import IntEnum, auto
@@ -80,6 +81,7 @@ models = [
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
     {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
     {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+    {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
     {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
     {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
@@ -97,6 +99,8 @@ models = [
     {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
     {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
+    {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
 ]


```
```diff
@@ -125,12 +129,27 @@ def download_model(model):
     if tokt == TOKENIZER_TYPE.UGM:
         files.append("spiece.model")

-    for file in files:
-        save_path = f"models/tokenizers/{name}/{file}"
-        if os.path.isfile(save_path):
-            logger.info(f"{name}: File {save_path} already exists - skipping")
-            continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+    if os.path.isdir(repo):
+        # If repo is a path on the file system, copy the directory
+        for file in files:
+            src_path = os.path.join(repo, file)
+            dst_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(dst_path):
+                logger.info(f"{name}: File {dst_path} already exists - skipping")
+                continue
+            if os.path.isfile(src_path):
+                shutil.copy2(src_path, dst_path)
+                logger.info(f"{name}: Copied {src_path} to {dst_path}")
+            else:
+                logger.warning(f"{name}: Source file {src_path} does not exist")
+    else:
+        # If repo is a URL, download the files
+        for file in files:
+            save_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(save_path):
+                logger.info(f"{name}: File {save_path} already exists - skipping")
+                continue
+            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)


 for model in models:
```
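With this change an entry in `models` can point at a local directory instead of a Hugging Face URL: when `repo` is an existing path, tokenizer files are copied with `shutil.copy2` rather than downloaded. A minimal, self-contained sketch of that dispatch (the example paths and the fallback `print` are illustrative only; the real script calls `download_file_with_auth`):

```python
import os
import shutil

def fetch_tokenizer_file(repo: str, name: str, file: str) -> None:
    dst = f"models/tokenizers/{name}/{file}"
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    if os.path.isdir(repo):
        # local checkout: copy the file instead of downloading it
        src = os.path.join(repo, file)
        if os.path.isfile(src):
            shutil.copy2(src, dst)
        else:
            print(f"source file {src} does not exist")
    else:
        # remote repo: convert_hf_to_gguf_update.py calls download_file_with_auth() here
        print(f"would download {repo}/resolve/main/{file} -> {dst}")

# a hypothetical local checkout and a repo URL taken from the models list above
fetch_tokenizer_file("/data/checkouts/my-tokenizer", "my-model", "tokenizer.json")
fetch_tokenizer_file("https://huggingface.co/Qwen/Qwen1.5-7B", "qwen2", "tokenizer.json")
```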
```diff
@@ -294,11 +294,7 @@ class GGMLToGGUF:
         if self.vocab_override is not None:
             vo = self.vocab_override
             logger.info('* Adding vocab item(s)')
-<<<<<<< HEAD:convert-llama-ggml-to-gguf.py
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
-=======
             for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
->>>>>>> 1731d42:convert_llama_ggml_to_gguf.py
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
```
```diff
@@ -331,6 +331,10 @@ if __name__ == '__main__':
             self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
             super().set_gguf_parameters()

+        def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+            # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
+            return ()
+
         def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
             tensor_map: dict[str, PartialLoraTensor] = {}

@@ -363,7 +367,13 @@ if __name__ == '__main__':
                 yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

         def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-            dest = super().modify_tensors(data_torch, name, bid)
+            dest = list(super().modify_tensors(data_torch, name, bid))
+            # some archs may have the same tensor for lm_head and output (tie word embeddings)
+            # in this case, adapters targeting lm_head will fail when using llama-export-lora
+            # therefore, we ignore them for now
+            # see: https://github.com/ggerganov/llama.cpp/issues/9065
+            if name == "lm_head.weight" and len(dest) == 0:
+                raise ValueError("lm_head is present in adapter, but is ignored in base model")
             for dest_name, dest_data in dest:
                 assert isinstance(dest_data, LoraTorchTensor)
                 lora_a, lora_b = dest_data.get_lora_A_B()
```
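Wrapping the parent's result in `list()` is what guarantees the `len(dest) == 0` check is well defined even when an overridden `modify_tensors` yields lazily; a bare generator has no `len()`. A tiny illustration, independent of the converter classes:

```python
def modify_tensors_stub():
    # stand-in for super().modify_tensors(...): the parent dropped the tensor
    yield from ()

gen = modify_tensors_stub()
try:
    len(gen)                        # generators do not support len()
except TypeError as err:
    print("TypeError:", err)

dest = list(modify_tensors_stub())  # materialize first, as the new code does
print(len(dest) == 0)               # True -> the adapter's lm_head was ignored
```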
```diff
@@ -94,6 +94,12 @@ class Keys:
         DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
         ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
         FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
+        SWIN_NORM = "{arch}.swin_norm"
+        RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
+        TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
+        TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
+        RESIDUAL_SCALE = "{arch}.residual_scale"
+        EMBEDDING_SCALE = "{arch}.embedding_scale"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -109,6 +115,7 @@ class Keys:
         KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
         REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
         SLIDING_WINDOW = "{arch}.attention.sliding_window"
+        SCALE = "{arch}.attention.scale"

     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -132,6 +139,9 @@ class Keys:
         TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
         DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"

+    class WKV:
+        HEAD_SIZE = "{arch}.wkv.head_size"
+
     class Tokenizer:
         MODEL = "tokenizer.ggml.model"
         PRE = "tokenizer.ggml.pre"
@@ -204,14 +214,17 @@ class MODEL_ARCH(IntEnum):
     ORION = auto()
     INTERNLM2 = auto()
     MINICPM = auto()
+    MINICPM3 = auto()
     GEMMA = auto()
     GEMMA2 = auto()
     STARCODER2 = auto()
+    RWKV6 = auto()
     MAMBA = auto()
     XVERSE = auto()
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
+    OLMOE = auto()
     OPENELM = auto()
     ARCTIC = auto()
     DEEPSEEK2 = auto()
@@ -222,6 +235,9 @@ class MODEL_ARCH(IntEnum):
     JAIS = auto()
     NEMOTRON = auto()
     EXAONE = auto()
+    GRANITE = auto()
+    GRANITE_MOE = auto()
+    CHAMELEON = auto()


 class MODEL_TENSOR(IntEnum):
@@ -270,6 +286,29 @@ class MODEL_TENSOR(IntEnum):
     SSM_A = auto()
     SSM_D = auto()
     SSM_OUT = auto()
+    TIME_MIX_W1 = auto()
+    TIME_MIX_W2 = auto()
+    TIME_MIX_LERP_X = auto()
+    TIME_MIX_LERP_K = auto()
+    TIME_MIX_LERP_V = auto()
+    TIME_MIX_LERP_R = auto()
+    TIME_MIX_LERP_G = auto()
+    TIME_MIX_LERP_W = auto()
+    TIME_MIX_FIRST = auto()
+    TIME_MIX_DECAY = auto()
+    TIME_MIX_DECAY_W1 = auto()
+    TIME_MIX_DECAY_W2 = auto()
+    TIME_MIX_KEY = auto()
+    TIME_MIX_VALUE = auto()
+    TIME_MIX_RECEPTANCE = auto()
+    TIME_MIX_GATE = auto()
+    TIME_MIX_LN = auto()
+    TIME_MIX_OUTPUT = auto()
+    CHANNEL_MIX_LERP_K = auto()
+    CHANNEL_MIX_LERP_R = auto()
+    CHANNEL_MIX_KEY = auto()
+    CHANNEL_MIX_RECEPTANCE = auto()
+    CHANNEL_MIX_VALUE = auto()
     ATTN_Q_A = auto()
     ATTN_Q_B = auto()
     ATTN_KV_A_MQA = auto()
@@ -306,6 +345,8 @@ class MODEL_TENSOR(IntEnum):
     ENC_FFN_DOWN = auto()
     ENC_FFN_UP = auto()
     ENC_OUTPUT_NORM = auto()
+    CLS = auto() # classifier
+    CLS_OUT = auto() # classifier output projection


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -334,14 +375,17 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.ORION: "orion",
     MODEL_ARCH.INTERNLM2: "internlm2",
     MODEL_ARCH.MINICPM: "minicpm",
+    MODEL_ARCH.MINICPM3: "minicpm3",
     MODEL_ARCH.GEMMA: "gemma",
     MODEL_ARCH.GEMMA2: "gemma2",
     MODEL_ARCH.STARCODER2: "starcoder2",
+    MODEL_ARCH.RWKV6: "rwkv6",
     MODEL_ARCH.MAMBA: "mamba",
     MODEL_ARCH.XVERSE: "xverse",
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.OLMOE: "olmoe",
     MODEL_ARCH.OPENELM: "openelm",
     MODEL_ARCH.ARCTIC: "arctic",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
```
```diff
@@ -352,90 +396,118 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.JAIS: "jais",
     MODEL_ARCH.NEMOTRON: "nemotron",
     MODEL_ARCH.EXAONE: "exaone",
+    MODEL_ARCH.GRANITE: "granite",
+    MODEL_ARCH.GRANITE_MOE: "granitemoe",
+    MODEL_ARCH.CHAMELEON: "chameleon",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
-    [... 81 removed entries: the previous TENSOR_NAMES mapping, re-added below with new alignment plus the new RWKV and classifier names ...]
+    MODEL_TENSOR.TOKEN_EMBD: "token_embd",
+    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
+    MODEL_TENSOR.TOKEN_TYPES: "token_types",
+    MODEL_TENSOR.POS_EMBD: "position_embd",
+    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+    MODEL_TENSOR.OUTPUT: "output",
+    MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
+    MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
+    MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
+    MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
+    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+    MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
+    MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
+    MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
+    MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
+    MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+    MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
+    MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
+    MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
+    MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
+    MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
+    MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
+    MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
+    MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
+    MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
+    MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
+    MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
+    MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
+    MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
+    MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
+    MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
+    MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
+    MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
+    MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
+    MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
+    MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
+    MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
+    MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
+    MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
+    MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
+    MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
+    MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
+    MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
+    MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
+    MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
+    MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
+    MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
+    MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
+    MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
+    MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
+    MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
+    MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
+    MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
+    MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
+    MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
+    MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
+    MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
+    MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
+    MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
+    MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
+    MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
+    MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
+    MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
+    MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
+    MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
+    MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
+    MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
+    MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
+    MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
+    MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
+    MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
+    MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
+    MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
+    MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
+    MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
+    MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
+    MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
+    MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
+    MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
+    MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
+    MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
+    MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
+    MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
+    MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
+    MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
+    MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
+    MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
+    MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
+    MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
+    MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
+    MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
+    MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
+    MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
+    MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
+    MODEL_TENSOR.CLS: "cls",
+    MODEL_TENSOR.CLS_OUT: "cls.output",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
```
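Each value in `TENSOR_NAMES` is a per-block name template; converters substitute the block index for `{bid}` when emitting tensor names. A small illustration (plain strings stand in for the enum keys):

```python
# illustration only: how the {bid} templates above expand
tensor_names = {
    "TIME_MIX_W1": "blk.{bid}.time_mix_w1",
    "CHANNEL_MIX_KEY": "blk.{bid}.channel_mix_key",
    "CLS": "cls",  # model-level tensors carry no {bid} placeholder
}

print(tensor_names["TIME_MIX_W1"].format(bid=0))       # blk.0.time_mix_w1
print(tensor_names["CHANNEL_MIX_KEY"].format(bid=11))  # blk.11.channel_mix_key
```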
```diff
@@ -545,6 +617,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
+        MODEL_TENSOR.CLS,
+        MODEL_TENSOR.CLS_OUT,
     ],
     MODEL_ARCH.NOMIC_BERT: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -576,6 +650,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.LAYER_OUT_NORM,
+        MODEL_TENSOR.CLS,
     ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -739,6 +814,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_QKV,
         MODEL_TENSOR.ATTN_Q,
@@ -813,6 +890,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.MINICPM3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GEMMA: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -856,6 +952,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.RWKV6: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.TIME_MIX_W1,
+        MODEL_TENSOR.TIME_MIX_W2,
+        MODEL_TENSOR.TIME_MIX_LERP_X,
+        MODEL_TENSOR.TIME_MIX_LERP_K,
+        MODEL_TENSOR.TIME_MIX_LERP_V,
+        MODEL_TENSOR.TIME_MIX_LERP_R,
+        MODEL_TENSOR.TIME_MIX_LERP_G,
+        MODEL_TENSOR.TIME_MIX_LERP_W,
+        MODEL_TENSOR.TIME_MIX_FIRST,
+        MODEL_TENSOR.TIME_MIX_DECAY,
+        MODEL_TENSOR.TIME_MIX_DECAY_W1,
+        MODEL_TENSOR.TIME_MIX_DECAY_W2,
+        MODEL_TENSOR.TIME_MIX_KEY,
+        MODEL_TENSOR.TIME_MIX_VALUE,
+        MODEL_TENSOR.TIME_MIX_RECEPTANCE,
+        MODEL_TENSOR.TIME_MIX_GATE,
+        MODEL_TENSOR.TIME_MIX_LN,
+        MODEL_TENSOR.TIME_MIX_OUTPUT,
+        MODEL_TENSOR.CHANNEL_MIX_LERP_K,
+        MODEL_TENSOR.CHANNEL_MIX_LERP_R,
+        MODEL_TENSOR.CHANNEL_MIX_KEY,
+        MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
+        MODEL_TENSOR.CHANNEL_MIX_VALUE,
+    ],
     MODEL_ARCH.MAMBA: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -923,6 +1050,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.OLMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+    ],
     MODEL_ARCH.OPENELM: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -1101,6 +1245,51 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GRANITE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GRANITE_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
+    MODEL_ARCH.CHAMELEON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }

@@ -1206,6 +1395,8 @@ class GGMLQuantizationType(IntEnum):
     Q4_0_4_4 = 31
     Q4_0_4_8 = 32
     Q4_0_8_8 = 33
+    TQ1_0 = 34
+    TQ2_0 = 35


 # TODO: add GGMLFileType from ggml_ftype in ggml.h
@@ -1250,6 +1441,8 @@ class LlamaFileType(IntEnum):
     MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
     MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
     MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
+    MOSTLY_TQ1_0 = 36 # except 1d tensors
+    MOSTLY_TQ2_0 = 37 # except 1d tensors

     GUESSED = 1024 # not specified in the model file

```
```diff
@@ -1326,6 +1519,8 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
     GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
     GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
+    GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
+    GGMLQuantizationType.TQ2_0: (256, 2 + 64),
 }


```
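The tuples in `GGML_QUANT_SIZES` are `(block_size, type_size)`: elements per block and bytes per block. For the two new ternary types that works out to roughly 1.69 and 2.06 bits per weight:

```python
# (elements per block, bytes per block), as declared above
quant_sizes = {
    "TQ1_0": (256, 2 + 4 * 13),  # 54 bytes per 256 weights
    "TQ2_0": (256, 2 + 64),      # 66 bytes per 256 weights
}

for name, (block_size, type_size) in quant_sizes.items():
    print(f"{name}: {type_size * 8 / block_size:.4f} bits per weight")
# TQ1_0: 1.6875 bits per weight
# TQ2_0: 2.0625 bits per weight
```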
```diff
@@ -670,6 +670,27 @@ class GGUFWriter:
     def add_expert_weights_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)

+    def add_swin_norm(self, value: bool) -> None:
+        self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
+
+    def add_rescale_every_n_layers(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
+
+    def add_time_mix_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
+
+    def add_time_decay_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
+
+    def add_residual_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
+
+    def add_embedding_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
+
+    def add_wkv_head_size(self, size: int) -> None:
+        self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

```
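The new setters all route through the existing typed key-value helpers (`add_bool`, `add_uint32`, `add_float32`), so a converter records the RWKV-style metadata with one call each. A hedged sketch of how they might be used (the constructor arguments and values are placeholders, not taken from this package; tensor data and the final write-out are omitted):

```python
from gguf import GGUFWriter  # the gguf-py package bundled under bigdl/cpp/gguf-py

# hypothetical hyper-parameters for an RWKV6-style model
writer = GGUFWriter("example-rwkv6.gguf", "rwkv6")
writer.add_wkv_head_size(64)
writer.add_rescale_every_n_layers(6)
writer.add_time_mix_extra_dim(32)
writer.add_time_decay_extra_dim(64)
writer.add_residual_scale(1.0)
writer.add_embedding_scale(1.0)
# ... add tensors, then write the header, kv data and tensors to disk
```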
```diff
@@ -691,6 +712,9 @@ class GGUFWriter:
     def add_sliding_window(self, value: int) -> None:
         self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)

+    def add_attention_scale(self, value: float) -> None:
+        self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

@@ -885,4 +909,4 @@ class GGUFWriter:
             if abs(fnum) < 1000.0:
                 return f"{fnum:3.1f}{unit}"
             fnum /= 1000.0
-        return f"{fnum:.1f}T - over 1TB, split recommended"
+        return f"{fnum:.1f}T - over 1TB, split recommended"
```