bigdl-core-cpp 2.6.0b20250227__py3-none-win_amd64.whl → 2.6.0b20250231__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. bigdl/cpp/convert_hf_to_gguf.py +687 -60
  2. bigdl/cpp/convert_hf_to_gguf_update.py +46 -41
  3. bigdl/cpp/convert_lora_to_gguf.py +33 -5
  4. bigdl/cpp/gguf-py/gguf/constants.py +306 -104
  5. bigdl/cpp/gguf-py/gguf/gguf_writer.py +31 -3
  6. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +122 -25
  7. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  8. bigdl/cpp/gguf-py/gguf/vocab.py +1 -1
  9. bigdl/cpp/libs/common.lib +0 -0
  10. bigdl/cpp/libs/ggml-base.dll +0 -0
  11. bigdl/cpp/libs/ggml-cpu.dll +0 -0
  12. bigdl/cpp/libs/ggml-sycl.dll +0 -0
  13. bigdl/cpp/libs/ggml.dll +0 -0
  14. bigdl/cpp/libs/llama-batched.exe +0 -0
  15. bigdl/cpp/libs/llama-bench.exe +0 -0
  16. bigdl/cpp/libs/llama-cli.exe +0 -0
  17. bigdl/cpp/libs/llama-embedding.exe +0 -0
  18. bigdl/cpp/libs/llama-gemma3-cli.exe +0 -0
  19. bigdl/cpp/libs/llama-gguf.exe +0 -0
  20. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  21. bigdl/cpp/libs/llama-lookup.exe +0 -0
  22. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  23. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  24. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  25. bigdl/cpp/libs/llama-quantize.exe +0 -0
  26. bigdl/cpp/libs/llama-server.exe +0 -0
  27. bigdl/cpp/libs/llama-simple.exe +0 -0
  28. bigdl/cpp/libs/llama-speculative.exe +0 -0
  29. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  30. bigdl/cpp/libs/llama.dll +0 -0
  31. bigdl/cpp/libs/llava_shared.dll +0 -0
  32. bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
  33. bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
  34. bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
  35. bigdl/cpp/libs/ollama-lib.exe +0 -0
  36. bigdl/cpp/libs/ollama.exe +0 -0
  37. bigdl/cpp/libs/ollama_ggml.dll +0 -0
  38. bigdl/cpp/libs/ollama_llama.dll +0 -0
  39. bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
  40. {bigdl_core_cpp-2.6.0b20250227.dist-info → bigdl_core_cpp-2.6.0b20250231.dist-info}/METADATA +2 -2
  41. bigdl_core_cpp-2.6.0b20250231.dist-info/RECORD +57 -0
  42. {bigdl_core_cpp-2.6.0b20250227.dist-info → bigdl_core_cpp-2.6.0b20250231.dist-info}/WHEEL +1 -1
  43. bigdl_core_cpp-2.6.0b20250227.dist-info/RECORD +0 -56
  44. {bigdl_core_cpp-2.6.0b20250227.data → bigdl_core_cpp-2.6.0b20250231.data}/scripts/init-llama-cpp.bat +0 -0
  45. {bigdl_core_cpp-2.6.0b20250227.data → bigdl_core_cpp-2.6.0b20250231.data}/scripts/init-llama-cpp.ps1 +0 -0
  46. {bigdl_core_cpp-2.6.0b20250227.data → bigdl_core_cpp-2.6.0b20250231.data}/scripts/init-ollama.bat +0 -0
  47. {bigdl_core_cpp-2.6.0b20250227.dist-info → bigdl_core_cpp-2.6.0b20250231.dist-info}/top_level.txt +0 -0
bigdl/cpp/gguf-py/gguf/constants.py

@@ -90,6 +90,7 @@ class Keys:
  VOCAB_SIZE = "{arch}.vocab_size"
  CONTEXT_LENGTH = "{arch}.context_length"
  EMBEDDING_LENGTH = "{arch}.embedding_length"
+ FEATURES_LENGTH = "{arch}.features_length"
  BLOCK_COUNT = "{arch}.block_count"
  LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
  FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
@@ -101,6 +102,8 @@ class Keys:
  EXPERT_USED_COUNT = "{arch}.expert_used_count"
  EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
  EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
+ EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
+ EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
  POOLING_TYPE = "{arch}.pooling_type"
  LOGIT_SCALE = "{arch}.logit_scale"
  DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
@@ -112,6 +115,7 @@ class Keys:
  TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
  RESIDUAL_SCALE = "{arch}.residual_scale"
  EMBEDDING_SCALE = "{arch}.embedding_scale"
+ TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"

  class Attention:
  HEAD_COUNT = "{arch}.attention.head_count"
@@ -122,6 +126,8 @@ class Keys:
  VALUE_LENGTH = "{arch}.attention.value_length"
  LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
  LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+ GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon"
+ GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups"
  CAUSAL = "{arch}.attention.causal"
  Q_LORA_RANK = "{arch}.attention.q_lora_rank"
  KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
@@ -155,6 +161,14 @@ class Keys:
  class WKV:
  HEAD_SIZE = "{arch}.wkv.head_size"

+ class PosNet:
+ EMBEDDING_LENGTH = "{arch}.posnet.embedding_length"
+ BLOCK_COUNT = "{arch}.posnet.block_count"
+
+ class ConvNext:
+ EMBEDDING_LENGTH = "{arch}.convnext.embedding_length"
+ BLOCK_COUNT = "{arch}.convnext.block_count"
+
  class Tokenizer:
  MODEL = "tokenizer.ggml.model"
  PRE = "tokenizer.ggml.pre"
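
These Keys entries are string templates: writers substitute the architecture name into "{arch}" before emitting a field, so the new PosNet/ConvNext keys only exist under the architecture that uses them. A minimal sketch of the formatting (plain Python; the "wavtokenizer-dec" arch string comes from MODEL_ARCH_NAMES further down):

    # Format one of the new template keys for a concrete architecture.
    key = "{arch}.posnet.embedding_length".format(arch="wavtokenizer-dec")
    assert key == "wavtokenizer-dec.posnet.embedding_length"
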
@@ -170,7 +184,6 @@ class Keys:
  UNK_ID = "tokenizer.ggml.unknown_token_id"
  SEP_ID = "tokenizer.ggml.seperator_token_id"
  PAD_ID = "tokenizer.ggml.padding_token_id"
- CLS_ID = "tokenizer.ggml.cls_token_id"
  MASK_ID = "tokenizer.ggml.mask_token_id"
  ADD_BOS = "tokenizer.ggml.add_bos_token"
  ADD_EOS = "tokenizer.ggml.add_eos_token"
@@ -209,57 +222,63 @@ class GGUFType:


  class MODEL_ARCH(IntEnum):
- LLAMA = auto()
- FALCON = auto()
- BAICHUAN = auto()
- GROK = auto()
- GPT2 = auto()
- GPTJ = auto()
- GPTNEOX = auto()
- MPT = auto()
- STARCODER = auto()
- REFACT = auto()
- BERT = auto()
- NOMIC_BERT = auto()
- JINA_BERT_V2 = auto()
- BLOOM = auto()
- STABLELM = auto()
- QWEN = auto()
- QWEN2 = auto()
- QWEN2MOE = auto()
- QWEN2VL = auto()
- PHI2 = auto()
- PHI3 = auto()
- PLAMO = auto()
- CODESHELL = auto()
- ORION = auto()
- INTERNLM2 = auto()
- MINICPM = auto()
- MINICPM3 = auto()
- GEMMA = auto()
- GEMMA2 = auto()
- STARCODER2 = auto()
- RWKV6 = auto()
- MAMBA = auto()
- XVERSE = auto()
- COMMAND_R = auto()
- DBRX = auto()
- OLMO = auto()
- OLMO2 = auto()
- OLMOE = auto()
- OPENELM = auto()
- ARCTIC = auto()
- DEEPSEEK2 = auto()
- CHATGLM = auto()
- BITNET = auto()
- T5 = auto()
- T5ENCODER = auto()
- JAIS = auto()
- NEMOTRON = auto()
- EXAONE = auto()
- GRANITE = auto()
- GRANITE_MOE = auto()
- CHAMELEON = auto()
+ LLAMA = auto()
+ DECI = auto()
+ FALCON = auto()
+ BAICHUAN = auto()
+ GROK = auto()
+ GPT2 = auto()
+ GPTJ = auto()
+ GPTNEOX = auto()
+ MPT = auto()
+ STARCODER = auto()
+ REFACT = auto()
+ BERT = auto()
+ NOMIC_BERT = auto()
+ JINA_BERT_V2 = auto()
+ BLOOM = auto()
+ STABLELM = auto()
+ QWEN = auto()
+ QWEN2 = auto()
+ QWEN2MOE = auto()
+ QWEN2VL = auto()
+ PHI2 = auto()
+ PHI3 = auto()
+ PHIMOE = auto()
+ PLAMO = auto()
+ CODESHELL = auto()
+ ORION = auto()
+ INTERNLM2 = auto()
+ MINICPM = auto()
+ MINICPM3 = auto()
+ GEMMA = auto()
+ GEMMA2 = auto()
+ STARCODER2 = auto()
+ RWKV6 = auto()
+ RWKV6QWEN2 = auto()
+ MAMBA = auto()
+ XVERSE = auto()
+ COMMAND_R = auto()
+ COHERE2 = auto()
+ DBRX = auto()
+ OLMO = auto()
+ OLMO2 = auto()
+ OLMOE = auto()
+ OPENELM = auto()
+ ARCTIC = auto()
+ DEEPSEEK = auto()
+ DEEPSEEK2 = auto()
+ CHATGLM = auto()
+ BITNET = auto()
+ T5 = auto()
+ T5ENCODER = auto()
+ JAIS = auto()
+ NEMOTRON = auto()
+ EXAONE = auto()
+ GRANITE = auto()
+ GRANITE_MOE = auto()
+ CHAMELEON = auto()
+ WAVTOKENIZER_DEC = auto()


  class MODEL_TENSOR(IntEnum):
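
Note that MODEL_ARCH uses auto(), so inserting DECI, PHIMOE, RWKV6QWEN2, COHERE2, DEEPSEEK, and WAVTOKENIZER_DEC renumbers every member that follows them. This is safe on disk because GGUF records the architecture as the string from MODEL_ARCH_NAMES below, never the enum integer. A standalone illustration with hypothetical two-member enums:

    from enum import IntEnum, auto

    class Before(IntEnum):
        LLAMA = auto()   # 1
        FALCON = auto()  # 2

    class After(IntEnum):
        LLAMA = auto()   # 1
        DECI = auto()    # 2 -- every later member shifts by one
        FALCON = auto()  # 3

    assert Before.FALCON == 2 and After.FALCON == 3
    # Files stay compatible because they carry "falcon", not 2 or 3.
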
@@ -298,6 +317,7 @@ class MODEL_TENSOR(IntEnum):
  FFN_GATE_SHEXP = auto()
  FFN_DOWN_SHEXP = auto()
  FFN_UP_SHEXP = auto()
+ FFN_EXP_PROBS_B = auto()
  ATTN_Q_NORM = auto()
  ATTN_K_NORM = auto()
  LAYER_OUT_NORM = auto()
@@ -315,6 +335,7 @@ class MODEL_TENSOR(IntEnum):
  TIME_MIX_LERP_V = auto()
  TIME_MIX_LERP_R = auto()
  TIME_MIX_LERP_G = auto()
+ TIME_MIX_LERP_FUSED = auto()
  TIME_MIX_LERP_W = auto()
  TIME_MIX_FIRST = auto()
  TIME_MIX_DECAY = auto()
@@ -369,60 +390,82 @@ class MODEL_TENSOR(IntEnum):
  ENC_OUTPUT_NORM = auto()
  CLS = auto() # classifier
  CLS_OUT = auto() # classifier output projection
+ CONV1D = auto()
+ CONVNEXT_DW = auto()
+ CONVNEXT_NORM = auto()
+ CONVNEXT_PW1 = auto()
+ CONVNEXT_PW2 = auto()
+ CONVNEXT_GAMMA = auto()
+ POSNET_CONV1 = auto()
+ POSNET_CONV2 = auto()
+ POSNET_NORM = auto()
+ POSNET_NORM1 = auto()
+ POSNET_NORM2 = auto()
+ POSNET_ATTN_NORM = auto()
+ POSNET_ATTN_Q = auto()
+ POSNET_ATTN_K = auto()
+ POSNET_ATTN_V = auto()
+ POSNET_ATTN_OUT = auto()


  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
- MODEL_ARCH.LLAMA: "llama",
- MODEL_ARCH.FALCON: "falcon",
- MODEL_ARCH.BAICHUAN: "baichuan",
- MODEL_ARCH.GROK: "grok",
- MODEL_ARCH.GPT2: "gpt2",
- MODEL_ARCH.GPTJ: "gptj",
- MODEL_ARCH.GPTNEOX: "gptneox",
- MODEL_ARCH.MPT: "mpt",
- MODEL_ARCH.STARCODER: "starcoder",
- MODEL_ARCH.REFACT: "refact",
- MODEL_ARCH.BERT: "bert",
- MODEL_ARCH.NOMIC_BERT: "nomic-bert",
- MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
- MODEL_ARCH.BLOOM: "bloom",
- MODEL_ARCH.STABLELM: "stablelm",
- MODEL_ARCH.QWEN: "qwen",
- MODEL_ARCH.QWEN2: "qwen2",
- MODEL_ARCH.QWEN2MOE: "qwen2moe",
- MODEL_ARCH.QWEN2VL: "qwen2vl",
- MODEL_ARCH.PHI2: "phi2",
- MODEL_ARCH.PHI3: "phi3",
- MODEL_ARCH.PLAMO: "plamo",
- MODEL_ARCH.CODESHELL: "codeshell",
- MODEL_ARCH.ORION: "orion",
- MODEL_ARCH.INTERNLM2: "internlm2",
- MODEL_ARCH.MINICPM: "minicpm",
- MODEL_ARCH.MINICPM3: "minicpm3",
- MODEL_ARCH.GEMMA: "gemma",
- MODEL_ARCH.GEMMA2: "gemma2",
- MODEL_ARCH.STARCODER2: "starcoder2",
- MODEL_ARCH.RWKV6: "rwkv6",
- MODEL_ARCH.MAMBA: "mamba",
- MODEL_ARCH.XVERSE: "xverse",
- MODEL_ARCH.COMMAND_R: "command-r",
- MODEL_ARCH.DBRX: "dbrx",
- MODEL_ARCH.OLMO: "olmo",
- MODEL_ARCH.OLMO2: "olmo2",
- MODEL_ARCH.OLMOE: "olmoe",
- MODEL_ARCH.OPENELM: "openelm",
- MODEL_ARCH.ARCTIC: "arctic",
- MODEL_ARCH.DEEPSEEK2: "deepseek2",
- MODEL_ARCH.CHATGLM: "chatglm",
- MODEL_ARCH.BITNET: "bitnet",
- MODEL_ARCH.T5: "t5",
- MODEL_ARCH.T5ENCODER: "t5encoder",
- MODEL_ARCH.JAIS: "jais",
- MODEL_ARCH.NEMOTRON: "nemotron",
- MODEL_ARCH.EXAONE: "exaone",
- MODEL_ARCH.GRANITE: "granite",
- MODEL_ARCH.GRANITE_MOE: "granitemoe",
- MODEL_ARCH.CHAMELEON: "chameleon",
+ MODEL_ARCH.LLAMA: "llama",
+ MODEL_ARCH.DECI: "deci",
+ MODEL_ARCH.FALCON: "falcon",
+ MODEL_ARCH.BAICHUAN: "baichuan",
+ MODEL_ARCH.GROK: "grok",
+ MODEL_ARCH.GPT2: "gpt2",
+ MODEL_ARCH.GPTJ: "gptj",
+ MODEL_ARCH.GPTNEOX: "gptneox",
+ MODEL_ARCH.MPT: "mpt",
+ MODEL_ARCH.STARCODER: "starcoder",
+ MODEL_ARCH.REFACT: "refact",
+ MODEL_ARCH.BERT: "bert",
+ MODEL_ARCH.NOMIC_BERT: "nomic-bert",
+ MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
+ MODEL_ARCH.BLOOM: "bloom",
+ MODEL_ARCH.STABLELM: "stablelm",
+ MODEL_ARCH.QWEN: "qwen",
+ MODEL_ARCH.QWEN2: "qwen2",
+ MODEL_ARCH.QWEN2MOE: "qwen2moe",
+ MODEL_ARCH.QWEN2VL: "qwen2vl",
+ MODEL_ARCH.PHI2: "phi2",
+ MODEL_ARCH.PHI3: "phi3",
+ MODEL_ARCH.PHIMOE: "phimoe",
+ MODEL_ARCH.PLAMO: "plamo",
+ MODEL_ARCH.CODESHELL: "codeshell",
+ MODEL_ARCH.ORION: "orion",
+ MODEL_ARCH.INTERNLM2: "internlm2",
+ MODEL_ARCH.MINICPM: "minicpm",
+ MODEL_ARCH.MINICPM3: "minicpm3",
+ MODEL_ARCH.GEMMA: "gemma",
+ MODEL_ARCH.GEMMA2: "gemma2",
+ MODEL_ARCH.STARCODER2: "starcoder2",
+ MODEL_ARCH.RWKV6: "rwkv6",
+ MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
+ MODEL_ARCH.MAMBA: "mamba",
+ MODEL_ARCH.XVERSE: "xverse",
+ MODEL_ARCH.COMMAND_R: "command-r",
+ MODEL_ARCH.COHERE2: "cohere2",
+ MODEL_ARCH.DBRX: "dbrx",
+ MODEL_ARCH.OLMO: "olmo",
+ MODEL_ARCH.OLMO2: "olmo2",
+ MODEL_ARCH.OLMOE: "olmoe",
+ MODEL_ARCH.OPENELM: "openelm",
+ MODEL_ARCH.ARCTIC: "arctic",
+ MODEL_ARCH.DEEPSEEK: "deepseek",
+ MODEL_ARCH.DEEPSEEK2: "deepseek2",
+ MODEL_ARCH.CHATGLM: "chatglm",
+ MODEL_ARCH.BITNET: "bitnet",
+ MODEL_ARCH.T5: "t5",
+ MODEL_ARCH.T5ENCODER: "t5encoder",
+ MODEL_ARCH.JAIS: "jais",
+ MODEL_ARCH.NEMOTRON: "nemotron",
+ MODEL_ARCH.EXAONE: "exaone",
+ MODEL_ARCH.GRANITE: "granite",
+ MODEL_ARCH.GRANITE_MOE: "granitemoe",
+ MODEL_ARCH.CHAMELEON: "chameleon",
+ MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
  }

  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -463,6 +506,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
  MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
  MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
  MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
+ MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
  MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
  MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
  MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
@@ -478,6 +522,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
  MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
  MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
  MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
+ MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused",
  MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
  MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
  MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
@@ -532,6 +577,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
  MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
  MODEL_TENSOR.CLS: "cls",
  MODEL_TENSOR.CLS_OUT: "cls.output",
+ MODEL_TENSOR.CONV1D: "conv1d",
+ MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw",
+ MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm",
+ MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1",
+ MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2",
+ MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma",
+ MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1",
+ MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2",
+ MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm",
+ MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1",
+ MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2",
+ MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm",
+ MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q",
+ MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
+ MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
+ MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
  }

  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
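
Consumers resolve both tables by key rather than hard-coding strings, and per-block entries are themselves templates keyed by block id. A small sketch of how the new entries resolve (assuming the gguf package from gguf-py is importable):

    from gguf.constants import MODEL_ARCH, MODEL_TENSOR, MODEL_ARCH_NAMES, TENSOR_NAMES

    assert MODEL_ARCH_NAMES[MODEL_ARCH.DECI] == "deci"
    # Per-block tensor names fill in the block id:
    assert TENSOR_NAMES[MODEL_TENSOR.POSNET_CONV1].format(bid=0) == "posnet.0.conv1"
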
@@ -555,6 +616,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN_EXP,
  MODEL_TENSOR.FFN_UP_EXP,
  ],
+ MODEL_ARCH.DECI: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.ATTN_ROT_EMBD,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ ],
  MODEL_ARCH.GROK: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
@@ -865,6 +946,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN,
  MODEL_TENSOR.FFN_UP,
  ],
+ MODEL_ARCH.PHIMOE: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FACTORS_LONG,
+ MODEL_TENSOR.ROPE_FACTORS_SHORT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_QKV,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ ],
  MODEL_ARCH.CODESHELL: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.POS_EMBD,
@@ -1008,6 +1107,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.TIME_MIX_LERP_R,
  MODEL_TENSOR.TIME_MIX_LERP_G,
  MODEL_TENSOR.TIME_MIX_LERP_W,
+ MODEL_TENSOR.TIME_MIX_LERP_FUSED,
  MODEL_TENSOR.TIME_MIX_FIRST,
  MODEL_TENSOR.TIME_MIX_DECAY,
  MODEL_TENSOR.TIME_MIX_DECAY_W1,
@@ -1024,6 +1124,35 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
  MODEL_TENSOR.CHANNEL_MIX_VALUE,
  ],
+ MODEL_ARCH.RWKV6QWEN2: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.TIME_MIX_W1,
+ MODEL_TENSOR.TIME_MIX_W2,
+ MODEL_TENSOR.TIME_MIX_LERP_X,
+ MODEL_TENSOR.TIME_MIX_LERP_K,
+ MODEL_TENSOR.TIME_MIX_LERP_V,
+ MODEL_TENSOR.TIME_MIX_LERP_R,
+ MODEL_TENSOR.TIME_MIX_LERP_G,
+ MODEL_TENSOR.TIME_MIX_LERP_W,
+ MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+ MODEL_TENSOR.TIME_MIX_FIRST,
+ MODEL_TENSOR.TIME_MIX_DECAY,
+ MODEL_TENSOR.TIME_MIX_DECAY_W1,
+ MODEL_TENSOR.TIME_MIX_DECAY_W2,
+ MODEL_TENSOR.TIME_MIX_KEY,
+ MODEL_TENSOR.TIME_MIX_VALUE,
+ MODEL_TENSOR.TIME_MIX_RECEPTANCE,
+ MODEL_TENSOR.TIME_MIX_GATE,
+ MODEL_TENSOR.TIME_MIX_LN,
+ MODEL_TENSOR.TIME_MIX_OUTPUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
  MODEL_ARCH.MAMBA: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
@@ -1067,6 +1196,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.ATTN_K_NORM,
  MODEL_TENSOR.ATTN_Q_NORM,
  ],
+ MODEL_ARCH.COHERE2: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
  MODEL_ARCH.DBRX: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
@@ -1158,6 +1299,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN_EXP,
  MODEL_TENSOR.FFN_UP_EXP,
  ],
+ MODEL_ARCH.DEEPSEEK: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.ATTN_ROT_EMBD,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ ],
  MODEL_ARCH.DEEPSEEK2: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,
@@ -1184,6 +1348,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_GATE_SHEXP,
  MODEL_TENSOR.FFN_DOWN_SHEXP,
  MODEL_TENSOR.FFN_UP_SHEXP,
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
  ],
  MODEL_ARCH.CHATGLM : [
  MODEL_TENSOR.TOKEN_EMBD,
@@ -1192,6 +1357,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.OUTPUT,
  MODEL_TENSOR.ATTN_NORM,
  MODEL_TENSOR.ATTN_QKV,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
  MODEL_TENSOR.ATTN_OUT,
  MODEL_TENSOR.FFN_NORM,
  MODEL_TENSOR.FFN_DOWN,
@@ -1347,6 +1515,28 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.FFN_DOWN,
  MODEL_TENSOR.FFN_UP,
  ],
+ MODEL_ARCH.WAVTOKENIZER_DEC: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.TOKEN_EMBD_NORM,
+ MODEL_TENSOR.CONV1D,
+ MODEL_TENSOR.CONVNEXT_DW,
+ MODEL_TENSOR.CONVNEXT_NORM,
+ MODEL_TENSOR.CONVNEXT_PW1,
+ MODEL_TENSOR.CONVNEXT_PW2,
+ MODEL_TENSOR.CONVNEXT_GAMMA,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.POSNET_CONV1,
+ MODEL_TENSOR.POSNET_CONV2,
+ MODEL_TENSOR.POSNET_NORM,
+ MODEL_TENSOR.POSNET_NORM1,
+ MODEL_TENSOR.POSNET_NORM2,
+ MODEL_TENSOR.POSNET_ATTN_NORM,
+ MODEL_TENSOR.POSNET_ATTN_Q,
+ MODEL_TENSOR.POSNET_ATTN_K,
+ MODEL_TENSOR.POSNET_ATTN_V,
+ MODEL_TENSOR.POSNET_ATTN_OUT,
+ ],
  # TODO
  }
 
@@ -1356,6 +1546,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.ROPE_FREQS,
  MODEL_TENSOR.ATTN_ROT_EMBD,
  ],
+ MODEL_ARCH.DECI: [
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_ROT_EMBD,
+ ],
  MODEL_ARCH.BAICHUAN: [
  MODEL_TENSOR.ROPE_FREQS,
  MODEL_TENSOR.ATTN_ROT_EMBD,
@@ -1380,6 +1574,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
  MODEL_TENSOR.ROPE_FREQS,
  MODEL_TENSOR.ATTN_ROT_EMBD,
  ],
+ MODEL_ARCH.DEEPSEEK: [
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_ROT_EMBD,
+ ],
  MODEL_ARCH.DEEPSEEK2: [
  MODEL_TENSOR.ROPE_FREQS,
  MODEL_TENSOR.ATTN_ROT_EMBD,
@@ -1454,6 +1652,11 @@ class GGMLQuantizationType(IntEnum):
  TQ2_0 = 35


+ class ExpertGatingFuncType(IntEnum):
+ SOFTMAX = 1
+ SIGMOID = 2
+
+
  # TODO: add GGMLFileType from ggml_ftype in ggml.h
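
Unlike the auto()-numbered enums above, ExpertGatingFuncType pins explicit values because they are written verbatim into the "{arch}.expert_gating_func" field (a uint32; see the GGUFWriter change below); starting at 1 plausibly reserves 0 for "not set". A tiny round-trip sketch (assuming the gguf package is importable):

    from gguf.constants import ExpertGatingFuncType

    raw = ExpertGatingFuncType.SIGMOID.value  # 2 is what lands in the file
    assert ExpertGatingFuncType(raw) is ExpertGatingFuncType.SIGMOID
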
 
@@ -1636,7 +1839,6 @@ KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
  KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
  KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
  KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
- KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
  KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
  KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
  KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
bigdl/cpp/gguf-py/gguf/gguf_writer.py

@@ -26,6 +26,7 @@ from .constants import (
  RopeScalingType,
  PoolingType,
  TokenType,
+ ExpertGatingFuncType,
  )

  from .quants import quant_shape_from_byte_shape
@@ -631,6 +632,21 @@ class GGUFWriter:
  def add_embedding_length(self, length: int) -> None:
  self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)

+ def add_features_length(self, length: int) -> None:
+ self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
+
+ def add_posnet_embedding_length(self, length: int) -> None:
+ self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+ def add_posnet_block_count(self, length: int) -> None:
+ self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length)
+
+ def add_convnext_embedding_length(self, length: int) -> None:
+ self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+ def add_convnext_block_count(self, length: int) -> None:
+ self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
+
  def add_block_count(self, length: int) -> None:
  self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
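
Together with the new Keys above, these setters let a converter describe a WavTokenizer-style decoder. A hedged sketch (the file name and all sizes are illustrative, not taken from this diff; GGUFWriter is assumed importable as in gguf-py):

    from gguf import GGUFWriter

    w = GGUFWriter("wavtokenizer-dec.gguf", "wavtokenizer-dec")
    w.add_features_length(512)            # hypothetical input feature width
    w.add_posnet_embedding_length(768)    # hypothetical
    w.add_posnet_block_count(6)           # hypothetical
    w.add_convnext_embedding_length(768)  # hypothetical
    w.add_convnext_block_count(12)        # hypothetical
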
 
@@ -700,6 +716,12 @@ class GGUFWriter:
  def add_expert_weights_scale(self, value: float) -> None:
  self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)

+ def add_expert_weights_norm(self, value: bool) -> None:
+ self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value)
+
+ def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
+ self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
+
  def add_swin_norm(self, value: bool) -> None:
  self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
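
These two fields characterize the MoE router: whether the selected experts' weights are renormalized, and which scoring function the gate uses. A hedged usage sketch (arch string and flag values are illustrative; sigmoid gating plus the blk.{bid}.exp_probs_b bias tensor added earlier is the pattern these keys appear to target):

    from gguf import GGUFWriter
    from gguf.constants import ExpertGatingFuncType

    w = GGUFWriter("model.gguf", "deepseek2")
    w.add_expert_weights_norm(True)                         # renormalize top-k weights
    w.add_expert_gating_func(ExpertGatingFuncType.SIGMOID)  # stored as uint32 value 2
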
 
@@ -721,12 +743,21 @@ class GGUFWriter:
  def add_wkv_head_size(self, size: int) -> None:
  self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)

+ def add_token_shift_count(self, count: int) -> None:
+ self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)
+
  def add_layer_norm_eps(self, value: float) -> None:
  self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

  def add_layer_norm_rms_eps(self, value: float) -> None:
  self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)

+ def add_group_norm_eps(self, value: float) -> None:
+ self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value)
+
+ def add_group_norm_groups(self, value: int) -> None:
+ self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value)
+
  def add_causal_attention(self, value: bool) -> None:
  self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
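
Unlike LayerNorm/RMSNorm, GroupNorm needs its group count recorded explicitly, since the weight and bias tensors are per-channel and the group count cannot be inferred from their shapes; hence the paired eps/groups setters. An illustrative continuation of the WavTokenizer sketch above (both values hypothetical):

    w.add_group_norm_eps(1e-6)   # hypothetical epsilon
    w.add_group_norm_groups(32)  # hypothetical group count
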
 
@@ -826,9 +857,6 @@ class GGUFWriter:
  def add_pad_token_id(self, id: int) -> None:
  self.add_uint32(Keys.Tokenizer.PAD_ID, id)

- def add_cls_token_id(self, id: int) -> None:
- self.add_uint32(Keys.Tokenizer.CLS_ID, id)
-
  def add_mask_token_id(self, id: int) -> None:
  self.add_uint32(Keys.Tokenizer.MASK_ID, id)
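
The new metadata round-trips like any other GGUF field. A hedged read-back sketch using gguf-py's reader (the path is illustrative; fields is the reader's key-to-field mapping):

    from gguf import GGUFReader

    r = GGUFReader("model.gguf")
    for name in r.fields:
        if name.endswith(".expert_gating_func") or ".posnet." in name:
            print(name)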