cui-llama.rn 1.4.0 → 1.4.1

This diff shows the content of publicly released package versions as published to their respective public registries. It is provided for informational purposes only and reflects the changes between those published versions.
Files changed (73)
  1. package/android/src/main/jni.cpp +9 -9
  2. package/cpp/common.cpp +163 -60
  3. package/cpp/common.h +43 -12
  4. package/cpp/ggml-alloc.c +1042 -1037
  5. package/cpp/ggml-backend-impl.h +255 -256
  6. package/cpp/ggml-backend-reg.cpp +582 -582
  7. package/cpp/ggml-backend.cpp +2002 -2002
  8. package/cpp/ggml-backend.h +354 -352
  9. package/cpp/ggml-common.h +1853 -1853
  10. package/cpp/ggml-cpp.h +39 -39
  11. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  12. package/cpp/ggml-cpu-aarch64.h +8 -8
  13. package/cpp/ggml-cpu-impl.h +386 -386
  14. package/cpp/ggml-cpu-quants.c +10920 -10839
  15. package/cpp/ggml-cpu-traits.cpp +36 -36
  16. package/cpp/ggml-cpu-traits.h +38 -38
  17. package/cpp/ggml-cpu.c +329 -60
  18. package/cpp/ggml-cpu.cpp +10 -2
  19. package/cpp/ggml-cpu.h +135 -135
  20. package/cpp/ggml-impl.h +567 -567
  21. package/cpp/ggml-metal-impl.h +17 -17
  22. package/cpp/ggml-metal.m +4884 -4884
  23. package/cpp/ggml-quants.c +5238 -5238
  24. package/cpp/ggml-threading.h +14 -14
  25. package/cpp/ggml.c +6514 -6448
  26. package/cpp/ggml.h +2194 -2163
  27. package/cpp/gguf.cpp +1329 -1325
  28. package/cpp/gguf.h +202 -202
  29. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  30. package/cpp/json-schema-to-grammar.h +8 -8
  31. package/cpp/json.hpp +24766 -24766
  32. package/cpp/llama-adapter.cpp +347 -346
  33. package/cpp/llama-adapter.h +74 -73
  34. package/cpp/llama-arch.cpp +1487 -1434
  35. package/cpp/llama-arch.h +400 -395
  36. package/cpp/llama-batch.cpp +368 -368
  37. package/cpp/llama-batch.h +88 -88
  38. package/cpp/llama-chat.cpp +578 -567
  39. package/cpp/llama-chat.h +52 -51
  40. package/cpp/llama-context.cpp +1775 -1771
  41. package/cpp/llama-context.h +128 -128
  42. package/cpp/llama-cparams.cpp +1 -1
  43. package/cpp/llama-cparams.h +37 -37
  44. package/cpp/llama-cpp.h +30 -30
  45. package/cpp/llama-grammar.cpp +1139 -1139
  46. package/cpp/llama-grammar.h +143 -143
  47. package/cpp/llama-hparams.cpp +71 -71
  48. package/cpp/llama-hparams.h +139 -140
  49. package/cpp/llama-impl.cpp +167 -167
  50. package/cpp/llama-impl.h +61 -61
  51. package/cpp/llama-kv-cache.cpp +718 -718
  52. package/cpp/llama-kv-cache.h +218 -218
  53. package/cpp/llama-mmap.cpp +2 -1
  54. package/cpp/llama-mmap.h +67 -67
  55. package/cpp/llama-model-loader.cpp +1124 -1011
  56. package/cpp/llama-model-loader.h +167 -158
  57. package/cpp/llama-model.cpp +3997 -2202
  58. package/cpp/llama-model.h +370 -391
  59. package/cpp/llama-sampling.cpp +2408 -2406
  60. package/cpp/llama-sampling.h +32 -48
  61. package/cpp/llama-vocab.cpp +3247 -1982
  62. package/cpp/llama-vocab.h +125 -182
  63. package/cpp/llama.cpp +416 -2886
  64. package/cpp/llama.h +1323 -1285
  65. package/cpp/log.cpp +401 -401
  66. package/cpp/log.h +121 -121
  67. package/cpp/rn-llama.hpp +18 -12
  68. package/cpp/sampling.cpp +505 -500
  69. package/cpp/sgemm.cpp +2597 -2597
  70. package/cpp/speculative.cpp +277 -274
  71. package/cpp/speculative.h +28 -28
  72. package/cpp/unicode.cpp +2 -3
  73. package/package.json +1 -1
package/cpp/llama-arch.h CHANGED
@@ -1,395 +1,400 @@
 #pragma once

 #include "ggml.h" // lm_ggml_op

 #include <string>

 //
 // gguf constants (sync with gguf.py)
 //

 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
     LLM_ARCH_GROK,
     LLM_ARCH_GPT2,
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
+    LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_MINICPM3,
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_COHERE2,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
     LLM_ARCH_OLMO2,
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_RWKV6QWEN2,
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_UNKNOWN,
 };

 enum llm_kv {
     LLM_KV_GENERAL_TYPE,
     LLM_KV_GENERAL_ARCHITECTURE,
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
     LLM_KV_GENERAL_VERSION,
     LLM_KV_GENERAL_URL,
     LLM_KV_GENERAL_DESCRIPTION,
     LLM_KV_GENERAL_LICENSE,
     LLM_KV_GENERAL_SOURCE_URL,
     LLM_KV_GENERAL_SOURCE_HF_REPO,

     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_FEATURES_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_SWIN_NORM,
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
     LLM_KV_RESIDUAL_SCALE,
     LLM_KV_EMBEDDING_SCALE,
+    LLM_KV_TOKEN_SHIFT_COUNT,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
     LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
     LLM_KV_ATTENTION_CLAMP_KQV,
     LLM_KV_ATTENTION_KEY_LENGTH,
     LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
     LLM_KV_ATTENTION_GROUPNORM_EPS,
     LLM_KV_ATTENTION_GROUPNORM_GROUPS,
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
     LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
     LLM_KV_ROPE_SCALING_YARN_LOG_MUL,

     LLM_KV_SPLIT_NO,
     LLM_KV_SPLIT_COUNT,
     LLM_KV_SPLIT_TENSORS_COUNT,

     LLM_KV_SSM_INNER_SIZE,
     LLM_KV_SSM_CONV_KERNEL,
     LLM_KV_SSM_STATE_SIZE,
     LLM_KV_SSM_TIME_STEP_RANK,
     LLM_KV_SSM_DT_B_C_RMS,

     LLM_KV_WKV_HEAD_SIZE,

     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
     LLM_KV_TOKENIZER_SCORES,
     LLM_KV_TOKENIZER_MERGES,
     LLM_KV_TOKENIZER_BOS_ID,
     LLM_KV_TOKENIZER_EOS_ID,
     LLM_KV_TOKENIZER_EOT_ID,
     LLM_KV_TOKENIZER_EOM_ID,
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
     LLM_KV_TOKENIZER_CLS_ID,
     LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+    LLM_KV_TOKENIZER_CHAT_TEMPLATE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
     LLM_KV_TOKENIZER_FIM_PAD_ID,
     LLM_KV_TOKENIZER_FIM_REP_ID,
     LLM_KV_TOKENIZER_FIM_SEP_ID,

     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,

     LLM_KV_POSNET_EMBEDDING_LENGTH,
     LLM_KV_POSNET_BLOCK_COUNT,

     LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
     LLM_KV_CONVNEXT_BLOCK_COUNT,

     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
 };

 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
     LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,
     LLM_TENSOR_ROPE_FACTORS_LONG,
     LLM_TENSOR_ROPE_FACTORS_SHORT,
     LLM_TENSOR_ATTN_Q,
     LLM_TENSOR_ATTN_K,
     LLM_TENSOR_ATTN_V,
     LLM_TENSOR_ATTN_QKV,
     LLM_TENSOR_ATTN_OUT,
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
     LLM_TENSOR_FFN_POST_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_ACT,
     LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
     LLM_TENSOR_FFN_NORM_EXPS,
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
     LLM_TENSOR_FFN_EXP_PROBS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
     LLM_TENSOR_SSM_DT,
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_OUT,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,
     LLM_TENSOR_TIME_MIX_LERP_X,
     LLM_TENSOR_TIME_MIX_LERP_W,
     LLM_TENSOR_TIME_MIX_LERP_K,
     LLM_TENSOR_TIME_MIX_LERP_V,
     LLM_TENSOR_TIME_MIX_LERP_R,
     LLM_TENSOR_TIME_MIX_LERP_G,
+    LLM_TENSOR_TIME_MIX_LERP_FUSED,
     LLM_TENSOR_TIME_MIX_FIRST,
     LLM_TENSOR_TIME_MIX_DECAY,
     LLM_TENSOR_TIME_MIX_DECAY_W1,
     LLM_TENSOR_TIME_MIX_DECAY_W2,
     LLM_TENSOR_TIME_MIX_KEY,
     LLM_TENSOR_TIME_MIX_VALUE,
     LLM_TENSOR_TIME_MIX_RECEPTANCE,
     LLM_TENSOR_TIME_MIX_GATE,
     LLM_TENSOR_TIME_MIX_LN,
     LLM_TENSOR_TIME_MIX_OUTPUT,
     LLM_TENSOR_CHANNEL_MIX_LERP_K,
     LLM_TENSOR_CHANNEL_MIX_LERP_R,
     LLM_TENSOR_CHANNEL_MIX_KEY,
     LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
     LLM_TENSOR_CHANNEL_MIX_VALUE,
     LLM_TENSOR_ATTN_Q_A,
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
     LLM_TENSOR_FFN_SUB_NORM,
     LLM_TENSOR_DEC_ATTN_NORM,
     LLM_TENSOR_DEC_ATTN_Q,
     LLM_TENSOR_DEC_ATTN_K,
     LLM_TENSOR_DEC_ATTN_V,
     LLM_TENSOR_DEC_ATTN_OUT,
     LLM_TENSOR_DEC_ATTN_REL_B,
     LLM_TENSOR_DEC_CROSS_ATTN_NORM,
     LLM_TENSOR_DEC_CROSS_ATTN_Q,
     LLM_TENSOR_DEC_CROSS_ATTN_K,
     LLM_TENSOR_DEC_CROSS_ATTN_V,
     LLM_TENSOR_DEC_CROSS_ATTN_OUT,
     LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
     LLM_TENSOR_DEC_FFN_NORM,
     LLM_TENSOR_DEC_FFN_GATE,
     LLM_TENSOR_DEC_FFN_DOWN,
     LLM_TENSOR_DEC_FFN_UP,
     LLM_TENSOR_DEC_OUTPUT_NORM,
     LLM_TENSOR_ENC_ATTN_NORM,
     LLM_TENSOR_ENC_ATTN_Q,
     LLM_TENSOR_ENC_ATTN_K,
     LLM_TENSOR_ENC_ATTN_V,
     LLM_TENSOR_ENC_ATTN_OUT,
     LLM_TENSOR_ENC_ATTN_REL_B,
     LLM_TENSOR_ENC_FFN_NORM,
     LLM_TENSOR_ENC_FFN_GATE,
     LLM_TENSOR_ENC_FFN_DOWN,
     LLM_TENSOR_ENC_FFN_UP,
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_CONV1D,
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
     LLM_TENSOR_CONVNEXT_PW1,
     LLM_TENSOR_CONVNEXT_PW2,
     LLM_TENSOR_CONVNEXT_GAMMA,
     LLM_TENSOR_POS_NET_CONV1,
     LLM_TENSOR_POS_NET_CONV2,
     LLM_TENSOR_POS_NET_NORM,
     LLM_TENSOR_POS_NET_NORM1,
     LLM_TENSOR_POS_NET_NORM2,
     LLM_TENSOR_POS_NET_ATTN_NORM,
     LLM_TENSOR_POS_NET_ATTN_Q,
     LLM_TENSOR_POS_NET_ATTN_K,
     LLM_TENSOR_POS_NET_ATTN_V,
     LLM_TENSOR_POS_NET_ATTN_OUT,
 };

 enum llm_tensor_layer {
     LLM_TENSOR_LAYER_INPUT,
     LLM_TENSOR_LAYER_REPEATING,
     LLM_TENSOR_LAYER_OUTPUT,
 };

 struct LLM_KV {
     LLM_KV(llm_arch arch);

     llm_arch arch;

     std::string operator()(llm_kv kv) const;
 };

 // helper to handle gguf constants
 // usage:
 //
 //   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
 //
 //   std::string name = tn(LLM_TENSOR_OUTPUT);                      -> "output"
 //   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");          -> "token_embd.bias"
 //   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3);      -> "blk.3.attn_norm.weight"
 //
 struct LLM_TN_IMPL {
     const llm_arch arch;
     const llm_tensor tensor;
     const char * const suffix;
     const int bid;
     const int xid;

     std::string str() const;

     operator std::string() const {
         return str();
     }

     friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
         return str == tn.str();
     }

     friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
         return str != tn.str();
     }
 };

 struct LLM_TN {
     LLM_TN(llm_arch arch) : arch(arch) {}

     llm_arch arch;

     LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
         return { arch, tensor, suffix, bid, xid };
     }

     LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
         return { arch, tensor, nullptr, bid, xid };
     }
 };


 struct llm_tensor_info {
     llm_tensor_layer layer;
     lm_ggml_op op;
 };

 const char * llm_arch_name(llm_arch arch);

 llm_arch llm_arch_from_string(const std::string & name);

 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
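The usage comment in the header above describes how the LLM_TN helper composes GGUF tensor names. For reference, here is a minimal sketch (not part of the package) of how these declarations might be used from code that links the accompanying llama-arch.cpp; the "llama" architecture string and the exact key string returned for the new LLM_KV_TOKENIZER_CHAT_TEMPLATE entry are assumptions based on llama.cpp conventions and are not shown in this diff.

#include "llama-arch.h"

#include <iostream>
#include <string>

int main() {
    // Round-trip an architecture between its enum value and its name string.
    // "llama" is assumed to be the registered name for LLM_ARCH_LLAMA.
    const llm_arch arch = llm_arch_from_string("llama");
    std::cout << llm_arch_name(arch) << "\n";

    // Tensor-name helper, mirroring the examples in the header's usage comment.
    const auto tn = LLM_TN(LLM_ARCH_LLAMA);
    const std::string embd_bias = tn(LLM_TENSOR_TOKEN_EMBD, "bias");     // -> "token_embd.bias" (per the header comment)
    const std::string attn_norm = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); // -> "blk.3.attn_norm.weight"

    // Metadata-key helper; the LLM_KV_TOKENIZER_CHAT_TEMPLATE entry added in
    // 1.4.1 resolves to its GGUF key string the same way (exact string assumed).
    const LLM_KV kv(LLM_ARCH_LLAMA);
    const std::string tmpl_key = kv(LLM_KV_TOKENIZER_CHAT_TEMPLATE);

    std::cout << embd_bias << "\n" << attn_norm << "\n" << tmpl_key << "\n";
    return 0;
}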