bigdl-core-cpp 2.5.0b20240421__py3-none-win_amd64.whl → 2.5.0b20240423__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert.py +276 -189
- bigdl/cpp/gguf-py/__init__.py +0 -0
- bigdl/cpp/gguf-py/gguf/__init__.py +5 -0
- bigdl/cpp/gguf-py/gguf/constants.py +943 -0
- bigdl/cpp/gguf-py/gguf/gguf.py +15 -0
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +279 -0
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +518 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +434 -0
- bigdl/cpp/gguf-py/gguf/vocab.py +181 -0
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- {bigdl_core_cpp-2.5.0b20240421.data → bigdl_core_cpp-2.5.0b20240423.data}/scripts/init-llama-cpp.bat +1 -0
- {bigdl_core_cpp-2.5.0b20240421.dist-info → bigdl_core_cpp-2.5.0b20240423.dist-info}/METADATA +3 -3
- bigdl_core_cpp-2.5.0b20240423.dist-info/RECORD +50 -0
- bigdl_core_cpp-2.5.0b20240421.dist-info/RECORD +0 -42
- {bigdl_core_cpp-2.5.0b20240421.data → bigdl_core_cpp-2.5.0b20240423.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0b20240421.data → bigdl_core_cpp-2.5.0b20240423.data}/scripts/init-ollama.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240421.dist-info → bigdl_core_cpp-2.5.0b20240423.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0b20240421.dist-info → bigdl_core_cpp-2.5.0b20240423.dist-info}/top_level.txt +0 -0
bigdl/cpp/gguf-py/gguf/constants.py (new file)
@@ -0,0 +1,943 @@
+from __future__ import annotations
+
+import sys
+from enum import Enum, IntEnum, auto
+from typing import Any
+
+#
+# constants
+#
+
+GGUF_MAGIC             = 0x46554747  # "GGUF"
+GGUF_VERSION           = 3
+GGUF_DEFAULT_ALIGNMENT = 32
+
+#
+# metadata keys
+#
+
+
+class Keys:
+    class General:
+        ARCHITECTURE         = "general.architecture"
+        QUANTIZATION_VERSION = "general.quantization_version"
+        ALIGNMENT            = "general.alignment"
+        NAME                 = "general.name"
+        AUTHOR               = "general.author"
+        VERSION              = "general.version"
+        URL                  = "general.url"
+        DESCRIPTION          = "general.description"
+        LICENSE              = "general.license"
+        SOURCE_URL           = "general.source.url"
+        SOURCE_HF_REPO       = "general.source.huggingface.repository"
+        FILE_TYPE            = "general.file_type"
+
+    class LLM:
+        VOCAB_SIZE            = "{arch}.vocab_size"
+        CONTEXT_LENGTH        = "{arch}.context_length"
+        EMBEDDING_LENGTH      = "{arch}.embedding_length"
+        BLOCK_COUNT           = "{arch}.block_count"
+        FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
+        USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+        TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
+        EXPERT_COUNT          = "{arch}.expert_count"
+        EXPERT_USED_COUNT     = "{arch}.expert_used_count"
+        POOLING_TYPE          = "{arch}.pooling_type"
+        LOGIT_SCALE           = "{arch}.logit_scale"
+
+    class Attention:
+        HEAD_COUNT        = "{arch}.attention.head_count"
+        HEAD_COUNT_KV     = "{arch}.attention.head_count_kv"
+        MAX_ALIBI_BIAS    = "{arch}.attention.max_alibi_bias"
+        CLAMP_KQV         = "{arch}.attention.clamp_kqv"
+        KEY_LENGTH        = "{arch}.attention.key_length"
+        VALUE_LENGTH      = "{arch}.attention.value_length"
+        LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
+        LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+        CAUSAL            = "{arch}.attention.causal"
+
+    class Rope:
+        DIMENSION_COUNT      = "{arch}.rope.dimension_count"
+        FREQ_BASE            = "{arch}.rope.freq_base"
+        SCALING_TYPE         = "{arch}.rope.scaling.type"
+        SCALING_FACTOR       = "{arch}.rope.scaling.factor"
+        SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
+        SCALING_FINETUNED    = "{arch}.rope.scaling.finetuned"
+
+    class SSM:
+        CONV_KERNEL    = "{arch}.ssm.conv_kernel"
+        INNER_SIZE     = "{arch}.ssm.inner_size"
+        STATE_SIZE     = "{arch}.ssm.state_size"
+        TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
+
+    class Tokenizer:
+        MODEL            = "tokenizer.ggml.model"
+        LIST             = "tokenizer.ggml.tokens"
+        TOKEN_TYPE       = "tokenizer.ggml.token_type"
+        TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
+        SCORES           = "tokenizer.ggml.scores"
+        MERGES           = "tokenizer.ggml.merges"
+        BOS_ID           = "tokenizer.ggml.bos_token_id"
+        EOS_ID           = "tokenizer.ggml.eos_token_id"
+        UNK_ID           = "tokenizer.ggml.unknown_token_id"
+        SEP_ID           = "tokenizer.ggml.seperator_token_id"
+        PAD_ID           = "tokenizer.ggml.padding_token_id"
+        CLS_ID           = "tokenizer.ggml.cls_token_id"
+        MASK_ID          = "tokenizer.ggml.mask_token_id"
+        ADD_BOS          = "tokenizer.ggml.add_bos_token"
+        ADD_EOS          = "tokenizer.ggml.add_eos_token"
+        ADD_PREFIX       = "tokenizer.ggml.add_space_prefix"
+        HF_JSON          = "tokenizer.huggingface.json"
+        RWKV             = "tokenizer.rwkv.world"
+        CHAT_TEMPLATE    = "tokenizer.chat_template"
+        CHAT_TEMPLATE_N  = "tokenizer.chat_template.{name}"
+        CHAT_TEMPLATES   = "tokenizer.chat_templates"
+        # FIM/Infill special tokens constants
+        PREFIX_ID        = "tokenizer.ggml.prefix_token_id"
+        SUFFIX_ID        = "tokenizer.ggml.suffix_token_id"
+        MIDDLE_ID        = "tokenizer.ggml.middle_token_id"
+        EOT_ID           = "tokenizer.ggml.eot_token_id"
+
+
+#
+# recommended mapping of model tensor names for storage in gguf
+#
+
+
+class MODEL_ARCH(IntEnum):
+    LLAMA      = auto()
+    FALCON     = auto()
+    BAICHUAN   = auto()
+    GROK       = auto()
+    GPT2       = auto()
+    GPTJ       = auto()
+    GPTNEOX    = auto()
+    MPT        = auto()
+    STARCODER  = auto()
+    PERSIMMON  = auto()
+    REFACT     = auto()
+    BERT       = auto()
+    NOMIC_BERT = auto()
+    BLOOM      = auto()
+    STABLELM   = auto()
+    QWEN       = auto()
+    QWEN2      = auto()
+    QWEN2MOE   = auto()
+    PHI2       = auto()
+    PLAMO      = auto()
+    CODESHELL  = auto()
+    ORION      = auto()
+    INTERNLM2  = auto()
+    MINICPM    = auto()
+    GEMMA      = auto()
+    STARCODER2 = auto()
+    MAMBA      = auto()
+    XVERSE     = auto()
+    COMMAND_R  = auto()
+    DBRX       = auto()
+    OLMO       = auto()
+
+
+class MODEL_TENSOR(IntEnum):
+    TOKEN_EMBD         = auto()
+    TOKEN_EMBD_NORM    = auto()
+    TOKEN_TYPES        = auto()
+    POS_EMBD           = auto()
+    OUTPUT             = auto()
+    OUTPUT_NORM        = auto()
+    ROPE_FREQS         = auto()
+    ATTN_Q             = auto()
+    ATTN_K             = auto()
+    ATTN_V             = auto()
+    ATTN_QKV           = auto()
+    ATTN_OUT           = auto()
+    ATTN_NORM          = auto()
+    ATTN_NORM_2        = auto()
+    ATTN_OUT_NORM      = auto()
+    ATTN_ROT_EMBD      = auto()
+    FFN_GATE_INP       = auto()
+    FFN_GATE_INP_SHEXP = auto()
+    FFN_NORM           = auto()
+    FFN_GATE           = auto()
+    FFN_DOWN           = auto()
+    FFN_UP             = auto()
+    FFN_ACT            = auto()
+    FFN_GATE_EXP       = auto()
+    FFN_DOWN_EXP       = auto()
+    FFN_UP_EXP         = auto()
+    FFN_GATE_SHEXP     = auto()
+    FFN_DOWN_SHEXP     = auto()
+    FFN_UP_SHEXP       = auto()
+    ATTN_Q_NORM        = auto()
+    ATTN_K_NORM        = auto()
+    LAYER_OUT_NORM     = auto()
+    SSM_IN             = auto()
+    SSM_CONV1D         = auto()
+    SSM_X              = auto()
+    SSM_DT             = auto()
+    SSM_A              = auto()
+    SSM_D              = auto()
+    SSM_OUT            = auto()
+
+
+MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
+    MODEL_ARCH.LLAMA:      "llama",
+    MODEL_ARCH.FALCON:     "falcon",
+    MODEL_ARCH.BAICHUAN:   "baichuan",
+    MODEL_ARCH.GROK:       "grok",
+    MODEL_ARCH.GPT2:       "gpt2",
+    MODEL_ARCH.GPTJ:       "gptj",
+    MODEL_ARCH.GPTNEOX:    "gptneox",
+    MODEL_ARCH.MPT:        "mpt",
+    MODEL_ARCH.STARCODER:  "starcoder",
+    MODEL_ARCH.PERSIMMON:  "persimmon",
+    MODEL_ARCH.REFACT:     "refact",
+    MODEL_ARCH.BERT:       "bert",
+    MODEL_ARCH.NOMIC_BERT: "nomic-bert",
+    MODEL_ARCH.BLOOM:      "bloom",
+    MODEL_ARCH.STABLELM:   "stablelm",
+    MODEL_ARCH.QWEN:       "qwen",
+    MODEL_ARCH.QWEN2:      "qwen2",
+    MODEL_ARCH.QWEN2MOE:   "qwen2moe",
+    MODEL_ARCH.PHI2:       "phi2",
+    MODEL_ARCH.PLAMO:      "plamo",
+    MODEL_ARCH.CODESHELL:  "codeshell",
+    MODEL_ARCH.ORION:      "orion",
+    MODEL_ARCH.INTERNLM2:  "internlm2",
+    MODEL_ARCH.MINICPM:    "minicpm",
+    MODEL_ARCH.GEMMA:      "gemma",
+    MODEL_ARCH.STARCODER2: "starcoder2",
+    MODEL_ARCH.MAMBA:      "mamba",
+    MODEL_ARCH.XVERSE:     "xverse",
+    MODEL_ARCH.COMMAND_R:  "command-r",
+    MODEL_ARCH.DBRX:       "dbrx",
+    MODEL_ARCH.OLMO:       "olmo",
+}
+
+TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
+    MODEL_TENSOR.TOKEN_EMBD:         "token_embd",
+    MODEL_TENSOR.TOKEN_EMBD_NORM:    "token_embd_norm",
+    MODEL_TENSOR.TOKEN_TYPES:        "token_types",
+    MODEL_TENSOR.POS_EMBD:           "position_embd",
+    MODEL_TENSOR.OUTPUT_NORM:        "output_norm",
+    MODEL_TENSOR.OUTPUT:             "output",
+    MODEL_TENSOR.ROPE_FREQS:         "rope_freqs",
+    MODEL_TENSOR.ATTN_NORM:          "blk.{bid}.attn_norm",
+    MODEL_TENSOR.ATTN_NORM_2:        "blk.{bid}.attn_norm_2",
+    MODEL_TENSOR.ATTN_QKV:           "blk.{bid}.attn_qkv",
+    MODEL_TENSOR.ATTN_Q:             "blk.{bid}.attn_q",
+    MODEL_TENSOR.ATTN_K:             "blk.{bid}.attn_k",
+    MODEL_TENSOR.ATTN_V:             "blk.{bid}.attn_v",
+    MODEL_TENSOR.ATTN_OUT:           "blk.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_ROT_EMBD:      "blk.{bid}.attn_rot_embd",
+    MODEL_TENSOR.ATTN_Q_NORM:        "blk.{bid}.attn_q_norm",
+    MODEL_TENSOR.ATTN_K_NORM:        "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.ATTN_OUT_NORM:      "blk.{bid}.attn_output_norm",
+    MODEL_TENSOR.FFN_GATE_INP:       "blk.{bid}.ffn_gate_inp",
+    MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
+    MODEL_TENSOR.FFN_NORM:           "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_GATE:           "blk.{bid}.ffn_gate",
+    MODEL_TENSOR.FFN_DOWN:           "blk.{bid}.ffn_down",
+    MODEL_TENSOR.FFN_UP:             "blk.{bid}.ffn_up",
+    MODEL_TENSOR.FFN_GATE_SHEXP:     "blk.{bid}.ffn_gate_shexp",
+    MODEL_TENSOR.FFN_DOWN_SHEXP:     "blk.{bid}.ffn_down_shexp",
+    MODEL_TENSOR.FFN_UP_SHEXP:       "blk.{bid}.ffn_up_shexp",
+    MODEL_TENSOR.FFN_ACT:            "blk.{bid}.ffn",
+    MODEL_TENSOR.FFN_GATE_EXP:       "blk.{bid}.ffn_gate_exps",
+    MODEL_TENSOR.FFN_DOWN_EXP:       "blk.{bid}.ffn_down_exps",
+    MODEL_TENSOR.FFN_UP_EXP:         "blk.{bid}.ffn_up_exps",
+    MODEL_TENSOR.LAYER_OUT_NORM:     "blk.{bid}.layer_output_norm",
+    MODEL_TENSOR.SSM_IN:             "blk.{bid}.ssm_in",
+    MODEL_TENSOR.SSM_CONV1D:         "blk.{bid}.ssm_conv1d",
+    MODEL_TENSOR.SSM_X:              "blk.{bid}.ssm_x",
+    MODEL_TENSOR.SSM_DT:             "blk.{bid}.ssm_dt",
+    MODEL_TENSOR.SSM_A:              "blk.{bid}.ssm_a",
+    MODEL_TENSOR.SSM_D:              "blk.{bid}.ssm_d",
+    MODEL_TENSOR.SSM_OUT:            "blk.{bid}.ssm_out",
+}
+
+MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
+    MODEL_ARCH.LLAMA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
+    MODEL_ARCH.GROK: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
+    MODEL_ARCH.GPTNEOX: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.FALCON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.BAICHUAN: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.STARCODER: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
+    MODEL_ARCH.NOMIC_BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
+    MODEL_ARCH.MPT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_ACT,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.POS_EMBD,
+    ],
+    MODEL_ARCH.GPTJ: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.PERSIMMON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.REFACT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.BLOOM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.STABLELM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+    ],
+    MODEL_ARCH.QWEN: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.QWEN2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.QWEN2MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
+    MODEL_ARCH.PLAMO: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GPT2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.PHI2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.CODESHELL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.ORION: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.INTERNLM2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.MINICPM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
+    MODEL_ARCH.GEMMA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_NORM,
+    ],
+    MODEL_ARCH.STARCODER2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.MAMBA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.SSM_IN,
+        MODEL_TENSOR.SSM_CONV1D,
+        MODEL_TENSOR.SSM_X,
+        MODEL_TENSOR.SSM_DT,
+        MODEL_TENSOR.SSM_A,
+        MODEL_TENSOR.SSM_D,
+        MODEL_TENSOR.SSM_OUT,
+    ],
+    MODEL_ARCH.XVERSE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.COMMAND_R: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+    ],
+    MODEL_ARCH.DBRX: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
+    MODEL_ARCH.OLMO: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    # TODO
+}
+
+# tensors that will not be serialized
+MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
+    MODEL_ARCH.LLAMA: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.BAICHUAN: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.PERSIMMON: [
+        MODEL_TENSOR.ROPE_FREQS,
+    ],
+    MODEL_ARCH.QWEN: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.CODESHELL: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.ORION: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.STARCODER2: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.XVERSE: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+}
+
+#
+# types
+#
+
+
+class TokenType(IntEnum):
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
+
+class RopeScalingType(Enum):
+    NONE   = 'none'
+    LINEAR = 'linear'
+    YARN   = 'yarn'
+
+
+class PoolingType(IntEnum):
+    NONE = 0
+    MEAN = 1
+    CLS  = 2
+
+
+class GGMLQuantizationType(IntEnum):
+    F32     = 0
+    F16     = 1
+    Q4_0    = 2
+    Q4_1    = 3
+    Q5_0    = 6
+    Q5_1    = 7
+    Q8_0    = 8
+    Q8_1    = 9
+    Q2_K    = 10
+    Q3_K    = 11
+    Q4_K    = 12
+    Q5_K    = 13
+    Q6_K    = 14
+    Q8_K    = 15
+    IQ2_XXS = 16
+    IQ2_XS  = 17
+    IQ3_XXS = 18
+    IQ1_S   = 19
+    IQ4_NL  = 20
+    IQ3_S   = 21
+    IQ2_S   = 22
+    IQ4_XS  = 23
+    I8      = 24
+    I16     = 25
+    I32     = 26
+    I64     = 27
+    F64     = 28
+    IQ1_M   = 29
+
+
+class GGUFEndian(IntEnum):
+    LITTLE = 0
+    BIG    = 1
+
+
+class GGUFValueType(IntEnum):
+    UINT8   = 0
+    INT8    = 1
+    UINT16  = 2
+    INT16   = 3
+    UINT32  = 4
+    INT32   = 5
+    FLOAT32 = 6
+    BOOL    = 7
+    STRING  = 8
+    ARRAY   = 9
+    UINT64  = 10
+    INT64   = 11
+    FLOAT64 = 12
+
+    @staticmethod
+    def get_type(val: Any) -> GGUFValueType:
+        if isinstance(val, (str, bytes, bytearray)):
+            return GGUFValueType.STRING
+        elif isinstance(val, list):
+            return GGUFValueType.ARRAY
+        elif isinstance(val, float):
+            return GGUFValueType.FLOAT32
+        elif isinstance(val, bool):
+            return GGUFValueType.BOOL
+        elif isinstance(val, int):
+            return GGUFValueType.INT32
+        # TODO: need help with 64-bit types in Python
+        else:
+            print("Unknown type:", type(val))
+            sys.exit()
+
+
+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Items here are (block size, type size)
+GGML_QUANT_SIZES = {
+    GGMLQuantizationType.F32:     (1, 4),
+    GGMLQuantizationType.F16:     (1, 2),
+    GGMLQuantizationType.Q4_0:    (32, 2 + 16),
+    GGMLQuantizationType.Q4_1:    (32, 2 + 2 + 16),
+    GGMLQuantizationType.Q5_0:    (32, 2 + 4 + 16),
+    GGMLQuantizationType.Q5_1:    (32, 2 + 2 + 4 + 16),
+    GGMLQuantizationType.Q8_0:    (32, 2 + 32),
+    GGMLQuantizationType.Q8_1:    (32, 4 + 4 + 32),
+    GGMLQuantizationType.Q2_K:    (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    GGMLQuantizationType.Q3_K:    (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    GGMLQuantizationType.Q4_K:    (256, 2 + 2 + QK_K // 2 + 12),
+    GGMLQuantizationType.Q5_K:    (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    GGMLQuantizationType.Q6_K:    (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    GGMLQuantizationType.Q8_K:    (256, 4 + QK_K + QK_K // 8),
+    GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
+    GGMLQuantizationType.IQ2_XS:  (256, 2 + QK_K // 4 + QK_K // 32),
+    GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
+    GGMLQuantizationType.IQ1_S:   (256, 2 + QK_K // 8 + QK_K // 16),
+    GGMLQuantizationType.IQ4_NL:  (32, 2 + 16),
+    GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
+    GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
+    GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
+    GGMLQuantizationType.I8:      (1, 1),
+    GGMLQuantizationType.I16:     (1, 2),
+    GGMLQuantizationType.I32:     (1, 4),
+    GGMLQuantizationType.I64:     (1, 8),
+    GGMLQuantizationType.F64:     (1, 8),
+    GGMLQuantizationType.IQ1_M:   (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
+}
+
+
+# Aliases for backward compatibility.
+
+# general
+KEY_GENERAL_ARCHITECTURE         = Keys.General.ARCHITECTURE
+KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION
+KEY_GENERAL_ALIGNMENT            = Keys.General.ALIGNMENT
+KEY_GENERAL_NAME                 = Keys.General.NAME
+KEY_GENERAL_AUTHOR               = Keys.General.AUTHOR
+KEY_GENERAL_URL                  = Keys.General.URL
+KEY_GENERAL_DESCRIPTION          = Keys.General.DESCRIPTION
+KEY_GENERAL_LICENSE              = Keys.General.LICENSE
+KEY_GENERAL_SOURCE_URL           = Keys.General.SOURCE_URL
+KEY_GENERAL_SOURCE_HF_REPO       = Keys.General.SOURCE_HF_REPO
+KEY_GENERAL_FILE_TYPE            = Keys.General.FILE_TYPE
+
+# LLM
+KEY_VOCAB_SIZE            = Keys.LLM.VOCAB_SIZE
+KEY_CONTEXT_LENGTH        = Keys.LLM.CONTEXT_LENGTH
+KEY_EMBEDDING_LENGTH      = Keys.LLM.EMBEDDING_LENGTH
+KEY_BLOCK_COUNT           = Keys.LLM.BLOCK_COUNT
+KEY_FEED_FORWARD_LENGTH   = Keys.LLM.FEED_FORWARD_LENGTH
+KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
+KEY_TENSOR_DATA_LAYOUT    = Keys.LLM.TENSOR_DATA_LAYOUT
+
+# attention
+KEY_ATTENTION_HEAD_COUNT        = Keys.Attention.HEAD_COUNT
+KEY_ATTENTION_HEAD_COUNT_KV     = Keys.Attention.HEAD_COUNT_KV
+KEY_ATTENTION_MAX_ALIBI_BIAS    = Keys.Attention.MAX_ALIBI_BIAS
+KEY_ATTENTION_CLAMP_KQV         = Keys.Attention.CLAMP_KQV
+KEY_ATTENTION_LAYERNORM_EPS     = Keys.Attention.LAYERNORM_EPS
+KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS
+
+# RoPE
+KEY_ROPE_DIMENSION_COUNT      = Keys.Rope.DIMENSION_COUNT
+KEY_ROPE_FREQ_BASE            = Keys.Rope.FREQ_BASE
+KEY_ROPE_SCALING_TYPE         = Keys.Rope.SCALING_TYPE
+KEY_ROPE_SCALING_FACTOR       = Keys.Rope.SCALING_FACTOR
+KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
+KEY_ROPE_SCALING_FINETUNED    = Keys.Rope.SCALING_FINETUNED
+
+# SSM
+KEY_SSM_CONV_KERNEL    = Keys.SSM.CONV_KERNEL
+KEY_SSM_INNER_SIZE     = Keys.SSM.INNER_SIZE
+KEY_SSM_STATE_SIZE     = Keys.SSM.STATE_SIZE
+KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
+
+# tokenization
+KEY_TOKENIZER_MODEL      = Keys.Tokenizer.MODEL
+KEY_TOKENIZER_LIST       = Keys.Tokenizer.LIST
+KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
+KEY_TOKENIZER_SCORES     = Keys.Tokenizer.SCORES
+KEY_TOKENIZER_MERGES     = Keys.Tokenizer.MERGES
+KEY_TOKENIZER_BOS_ID     = Keys.Tokenizer.BOS_ID
+KEY_TOKENIZER_EOS_ID     = Keys.Tokenizer.EOS_ID
+KEY_TOKENIZER_UNK_ID     = Keys.Tokenizer.UNK_ID
+KEY_TOKENIZER_SEP_ID     = Keys.Tokenizer.SEP_ID
+KEY_TOKENIZER_PAD_ID     = Keys.Tokenizer.PAD_ID
+KEY_TOKENIZER_CLS_ID     = Keys.Tokenizer.CLS_ID
+KEY_TOKENIZER_MASK_ID    = Keys.Tokenizer.MASK_ID
+KEY_TOKENIZER_HF_JSON    = Keys.Tokenizer.HF_JSON
+KEY_TOKENIZER_RWKV       = Keys.Tokenizer.RWKV
+KEY_TOKENIZER_PRIFIX_ID  = Keys.Tokenizer.PREFIX_ID
+KEY_TOKENIZER_SUFFIX_ID  = Keys.Tokenizer.SUFFIX_ID
+KEY_TOKENIZER_MIDDLE_ID  = Keys.Tokenizer.MIDDLE_ID
+KEY_TOKENIZER_EOT_ID     = Keys.Tokenizer.EOT_ID
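For orientation, the constants above compose as follows: the `{arch}` placeholders in the `Keys.*` templates are filled with a `MODEL_ARCH_NAMES` entry, the `{bid}` placeholders in `TENSOR_NAMES` with a block index, and `GGML_QUANT_SIZES` gives the (block size, type size) pair that determines a quantized tensor's serialized size. A minimal sketch, assuming the upstream gguf-py import layout; the `tensor_nbytes` helper is hypothetical and not part of this package:

    # Minimal usage sketch (assumed import path: upstream gguf-py layout).
    from gguf.constants import (
        GGML_QUANT_SIZES,
        GGMLQuantizationType,
        GGUFValueType,
        Keys,
        MODEL_ARCH,
        MODEL_ARCH_NAMES,
        MODEL_TENSOR,
        TENSOR_NAMES,
    )

    arch = MODEL_ARCH_NAMES[MODEL_ARCH.LLAMA]                  # "llama"

    # Architecture-scoped metadata keys are str templates over {arch}:
    ctx_key = Keys.LLM.CONTEXT_LENGTH.format(arch=arch)        # "llama.context_length"

    # Per-block tensor names are templates over {bid}, the block index:
    attn_q = TENSOR_NAMES[MODEL_TENSOR.ATTN_Q].format(bid=7)   # "blk.7.attn_q"

    # (block size, type size) -> serialized bytes for n_elements of a quant type
    # (hypothetical helper, not part of the package):
    def tensor_nbytes(n_elements: int, qtype: GGMLQuantizationType) -> int:
        block_size, type_size = GGML_QUANT_SIZES[qtype]
        assert n_elements % block_size == 0
        return n_elements // block_size * type_size

    print(ctx_key, attn_q)
    print(tensor_nbytes(4096 * 4096, GGMLQuantizationType.Q4_0))  # 9437184
    print(GGUFValueType.get_type(3.14))                           # GGUFValueType.FLOAT32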