@fugood/llama.node 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,7 @@ enum llm_type {
  LLM_TYPE_80M,
  LLM_TYPE_109M,
  LLM_TYPE_137M,
+ LLM_TYPE_140M,
  LLM_TYPE_160M,
  LLM_TYPE_190M,
  LLM_TYPE_220M,
@@ -36,6 +37,7 @@ enum llm_type {
  LLM_TYPE_270M,
  LLM_TYPE_335M,
  LLM_TYPE_350M,
+ LLM_TYPE_360M,
  LLM_TYPE_410M,
  LLM_TYPE_450M,
  LLM_TYPE_475M,
@@ -43,6 +45,7 @@ enum llm_type {
  LLM_TYPE_700M,
  LLM_TYPE_770M,
  LLM_TYPE_780M,
+ LLM_TYPE_950M,
  LLM_TYPE_0_3B,
  LLM_TYPE_0_5B,
  LLM_TYPE_0_6B,
@@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  // attention layers have a non-zero number of kv heads
  int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
  if (llama_model_has_encoder(&model)) {
- n_attn_layer *= 3;
+ // now n_attn_layer is the number of attention layers in the encoder
+ // for each decoder block, there are 2 attention layers
+ n_attn_layer += 2 * model.hparams.dec_n_layer;
  }
  GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
  }
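
The old `n_attn_layer *= 3` only held when the decoder had exactly as many blocks as the encoder (one self-attention layer per encoder block, plus self- and cross-attention per decoder block). The new formula counts the two decoder attention layers per block explicitly via `hparams.dec_n_layer`. A minimal standalone sketch of the arithmetic, with hypothetical layer counts that are not taken from any real model:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // hypothetical hparams for a T5-style encoder-decoder model
        const int32_t n_layer     = 12; // encoder blocks: 1 self-attention layer each
        const int32_t dec_n_layer = 6;  // decoder blocks: self- + cross-attention each

        int32_t n_attn_layer = n_layer;  // attention layers in the encoder
        n_attn_layer += 2 * dec_n_layer; // 2 attention layers per decoder block

        // the previous rule multiplied by 3, which matches only when dec_n_layer == n_layer
        std::printf("new count: %d, old *3 rule: %d\n", n_attn_layer, 3 * n_layer);
        return 0;
    }
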
@@ -434,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  };
  break;
+ case LLAMA_VOCAB_PRE_TYPE_GROK_2:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
  default:
  // default regex for BPE tokenization pre-processing
  regex_exprs = {
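
The new GROK_2 case expands the `(?i:'s|'t|...)` group from the upstream tokenizer.json into explicit `[sS]`-style character classes, following the same convention as the expanded pattern in the existing case above it. A small standalone check, illustrative only and using std::regex rather than the library's own unicode-aware regex splitting, that the expanded contraction alternation matches both upper- and lower-case forms:

    #include <cstdio>
    #include <regex>

    int main() {
        // expanded, case-insensitive-by-construction contraction alternation from the diff
        const std::regex contractions("(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])");
        for (const char * s : { "'s", "'S", "'re", "'RE", "'ll", "'D" }) {
            std::printf("%-4s -> %s\n", s, std::regex_search(s, contractions) ? "match" : "no match");
        }
        return 0;
    }
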
@@ -1955,7 +1962,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
  clean_spaces = false;
  } else if (
- tokenizer_pre == "bailingmoe") {
+ tokenizer_pre == "bailingmoe" ||
+ tokenizer_pre == "llada-moe") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
  clean_spaces = false;
  } else if (
@@ -1974,6 +1982,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "kimi-k2") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
  clean_spaces = false;
+ } else if (
+ tokenizer_pre == "grok-2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
+ clean_spaces = false;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
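
Together with the previous hunk, these branches mean a GGUF model whose pre-tokenizer name (typically the `tokenizer.ggml.pre` metadata string) is `llada-moe` reuses the existing BAILINGMOE regex setup, while `grok-2` maps to the new GROK_2 case added earlier. A condensed standalone sketch of that dispatch, with a hypothetical helper and enum rather than the library's own types:

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // hypothetical condensed mapping; the real logic lives in llama_vocab::impl::load()
    enum pre_type { PRE_BAILINGMOE, PRE_KIMI_K2, PRE_GROK_2 };

    static pre_type map_pre_tokenizer(const std::string & tokenizer_pre) {
        if (tokenizer_pre == "bailingmoe" ||
            tokenizer_pre == "llada-moe") {     // new: llada-moe reuses the bailingmoe setup
            return PRE_BAILINGMOE;
        } else if (tokenizer_pre == "kimi-k2") {
            return PRE_KIMI_K2;
        } else if (tokenizer_pre == "grok-2") { // new: dedicated grok-2 pre-tokenizer
            return PRE_GROK_2;
        }
        throw std::runtime_error("unknown pre-tokenizer type: '" + tokenizer_pre + "'");
    }

    int main() {
        std::printf("llada-moe -> %d, grok-2 -> %d\n",
                    map_pre_tokenizer("llada-moe"), map_pre_tokenizer("grok-2"));
        return 0;
    }
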
@@ -47,6 +47,7 @@ enum llama_vocab_pre_type {
  LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
  LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
  LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
+ LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
  };

  struct LLM_KV;