@fugood/llama.node 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +484 -204
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +156 -15
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +22 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/common/json-partial.cpp +51 -0
  15. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  16. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  17. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  18. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  19. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  21. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  28. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
  31. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +11 -9
  40. package/src/llama.cpp/include/llama.h +8 -0
  41. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  42. package/src/llama.cpp/src/llama-arch.h +22 -0
  43. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  44. package/src/llama.cpp/src/llama-context.cpp +6 -0
  45. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  46. package/src/llama.cpp/src/llama-graph.h +10 -1
  47. package/src/llama.cpp/src/llama-hparams.cpp +5 -1
  48. package/src/llama.cpp/src/llama-hparams.h +17 -2
  49. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
  50. package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
  51. package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
  52. package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
  53. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  54. package/src/llama.cpp/src/llama-model.cpp +572 -45
  55. package/src/llama.cpp/src/llama-model.h +18 -0
  56. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  57. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  58. package/src/llama.cpp/src/llama-vocab.h +41 -40
  59. package/src/llama.cpp/src/unicode.h +43 -0
package/src/llama.cpp/src/llama-model.h

@@ -107,6 +107,7 @@ enum llm_type {
     LLM_TYPE_17B_16E,    // llama4 Scout
     LLM_TYPE_17B_128E,   // llama4 Maverick
     LLM_TYPE_A13B,
+    LLM_TYPE_8B_A1B,     // lfm2moe
     LLM_TYPE_21B_A3B,    // Ernie MoE small
     LLM_TYPE_30B_A3B,
     LLM_TYPE_106B_A12B,  // GLM-4.5-Air
@@ -275,6 +276,11 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_shexp = nullptr;
     struct ggml_tensor * ffn_up_shexp   = nullptr;

+    // ff adjugate experts (chexps)
+    struct ggml_tensor * ffn_gate_chexps = nullptr;
+    struct ggml_tensor * ffn_down_chexps = nullptr;
+    struct ggml_tensor * ffn_up_chexps   = nullptr;
+
     // ff bias
     struct ggml_tensor * ffn_gate_b = nullptr;
     struct ggml_tensor * ffn_down_b = nullptr; // b2
@@ -375,6 +381,12 @@ struct llama_layer {
     // openai-moe
     struct ggml_tensor * attn_sinks = nullptr;

+    // xIELU activation parameters for Apertus
+    struct ggml_tensor * ffn_act_alpha_n = nullptr;
+    struct ggml_tensor * ffn_act_alpha_p = nullptr;
+    struct ggml_tensor * ffn_act_beta    = nullptr;
+    struct ggml_tensor * ffn_act_eps     = nullptr;
+
     struct llama_layer_posnet posnet;

     struct llama_layer_convnext convnext;
@@ -426,6 +438,12 @@ struct llama_model {
     std::vector<llama_layer> layers;

+    // Dense linear projections for SentenceTransformers models like embeddinggemma
+    // For Sentence Transformers models structure see
+    // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
+    struct ggml_tensor * dense_2_out_layers = nullptr;
+    struct ggml_tensor * dense_3_out_layers = nullptr;
+
     llama_model_params params;

     // gguf metadata
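Note: the two dense_*_out_layers tensors hold the weights of the SentenceTransformers "Dense" modules (the 2_Dense/3_Dense steps in models such as embeddinggemma), which are plain linear projections applied to the pooled embedding. A minimal sketch of how such projections would be applied at the end of the graph; the helper name build_dense_out and the bias-less layout are assumptions, not necessarily what this package does:

    // sketch (hypothetical helper): apply optional Dense projections to pooled embeddings
    static ggml_tensor * build_dense_out(ggml_context * ctx, const llama_model & model, ggml_tensor * cur) {
        if (model.dense_2_out_layers != nullptr) {
            cur = ggml_mul_mat(ctx, model.dense_2_out_layers, cur); // 2_Dense step
        }
        if (model.dense_3_out_layers != nullptr) {
            cur = ggml_mul_mat(ctx, model.dense_3_out_layers, cur); // 3_Dense step
        }
        return cur;
    }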
package/src/llama.cpp/src/llama-sampling.cpp

@@ -2541,8 +2541,13 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     if (n_non_eog == 0) {
         cur_p->size = 1;
         cur_p->data[0].id = ctx->vocab->token_eot();
+        if (cur_p->data[0].id == LLAMA_TOKEN_NULL) {
+            cur_p->data[0].id = ctx->vocab->token_eos();
+        }
         cur_p->data[0].logit = 1.0f;

+        GGML_ASSERT(cur_p->data[0].id != LLAMA_TOKEN_NULL);
+
         return;
     }
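Note: this patch makes the infill sampler tolerate vocabs that define no end-of-text (EOT) token: it falls back to EOS and asserts that at least one of the two exists instead of silently emitting LLAMA_TOKEN_NULL. The same pattern in isolation; pick_eog is a hypothetical helper written for illustration, not part of the llama.cpp API:

    // sketch: prefer EOT, fall back to EOS, never hand back LLAMA_TOKEN_NULL
    static llama_token pick_eog(const llama_vocab & vocab) {
        llama_token id = vocab.token_eot();
        if (id == LLAMA_TOKEN_NULL) {
            id = vocab.token_eos(); // some vocabs only define EOS
        }
        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "vocab defines neither EOT nor EOS");
        return id;
    }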
package/src/llama.cpp/src/llama-vocab.cpp

@@ -347,6 +347,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_OLMO:
             case LLAMA_VOCAB_PRE_TYPE_JAIS:
             case LLAMA_VOCAB_PRE_TYPE_TRILLION:
+            case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
                 regex_exprs = {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
@@ -1772,7 +1773,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
             const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
             precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
-#ifdef IS_BIG_ENDIAN
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
             // correct endianness of data in precompiled_charsmap binary blob
             uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
             *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
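Note: the old #ifdef IS_BIG_ENDIAN depended on a build-provided macro, which is easy to get out of sync with the actual target; the replacement asks the compiler directly. __BYTE_ORDER__ and __ORDER_BIG_ENDIAN__ are predefined by GCC and Clang (but not by every compiler, e.g. MSVC), and the defined() guards keep the directive well-formed where they are absent. A standalone illustration of the pattern:

    #include <cstdio>

    int main() {
    #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        std::puts("big-endian target: blob data must be byte-swapped");
    #else
        std::puts("little-endian or unknown target: blob used as-is");
    #endif
        return 0;
    }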
@@ -1961,6 +1962,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "trillion") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "granite-docling") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
+                clean_spaces = false;
             } else if (
                 tokenizer_pre == "bailingmoe" ||
                 tokenizer_pre == "llada-moe") {
@@ -2166,6 +2171,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 || t.first == "<|end|>"
                 || t.first == "<end_of_turn>"
                 || t.first == "<|endoftext|>"
+                || t.first == "<|end_of_text|>" // granite
                 || t.first == "<EOT>"
                 || t.first == "_<EOT>"
                 || t.first == "<|end▁of▁sentence|>" // DeepSeek
package/src/llama.cpp/src/llama-vocab.h

@@ -8,46 +8,47 @@

 // pre-tokenization types
 enum llama_vocab_pre_type {
-    LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-    LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-    LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-    LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-    LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
-    LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
-    LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-    LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
-    LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
-    LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
-    LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
-    LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
-    LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-    LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-    LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-    LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-    LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
-    LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
-    LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
-    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
-    LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
-    LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-    LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
-    LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
-    LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
-    LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
-    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
-    LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
-    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
-    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE  = 38,
-    LLAMA_VOCAB_PRE_TYPE_GROK_2         = 39,
+    LLAMA_VOCAB_PRE_TYPE_DEFAULT         = 0,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA3          = 1,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM    = 2,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER  = 3,
+    LLAMA_VOCAB_PRE_TYPE_FALCON          = 4,
+    LLAMA_VOCAB_PRE_TYPE_MPT             = 5,
+    LLAMA_VOCAB_PRE_TYPE_STARCODER       = 6,
+    LLAMA_VOCAB_PRE_TYPE_GPT2            = 7,
+    LLAMA_VOCAB_PRE_TYPE_REFACT          = 8,
+    LLAMA_VOCAB_PRE_TYPE_COMMAND_R       = 9,
+    LLAMA_VOCAB_PRE_TYPE_STABLELM2       = 10,
+    LLAMA_VOCAB_PRE_TYPE_QWEN2           = 11,
+    LLAMA_VOCAB_PRE_TYPE_OLMO            = 12,
+    LLAMA_VOCAB_PRE_TYPE_DBRX            = 13,
+    LLAMA_VOCAB_PRE_TYPE_SMAUG           = 14,
+    LLAMA_VOCAB_PRE_TYPE_PORO            = 15,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM3        = 16,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM4        = 17,
+    LLAMA_VOCAB_PRE_TYPE_VIKING          = 18,
+    LLAMA_VOCAB_PRE_TYPE_JAIS            = 19,
+    LLAMA_VOCAB_PRE_TYPE_TEKKEN          = 20,
+    LLAMA_VOCAB_PRE_TYPE_SMOLLM          = 21,
+    LLAMA_VOCAB_PRE_TYPE_CODESHELL       = 22,
+    LLAMA_VOCAB_PRE_TYPE_BLOOM           = 23,
+    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH    = 24,
+    LLAMA_VOCAB_PRE_TYPE_EXAONE          = 25,
+    LLAMA_VOCAB_PRE_TYPE_CHAMELEON       = 26,
+    LLAMA_VOCAB_PRE_TYPE_MINERVA         = 27,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM   = 28,
+    LLAMA_VOCAB_PRE_TYPE_GPT4O           = 29,
+    LLAMA_VOCAB_PRE_TYPE_SUPERBPE        = 30,
+    LLAMA_VOCAB_PRE_TYPE_TRILLION        = 31,
+    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE      = 32,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA4          = 33,
+    LLAMA_VOCAB_PRE_TYPE_PIXTRAL         = 34,
+    LLAMA_VOCAB_PRE_TYPE_SEED_CODER      = 35,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN         = 36,
+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2         = 37,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE   = 38,
+    LLAMA_VOCAB_PRE_TYPE_GROK_2          = 39,
+    LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
 };

 struct LLM_KV;
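Note: wiring up a new pre-tokenizer such as granite-docling touches three coordinated places, all visible earlier in this diff: the enum value above, the tokenizer_pre string match in llama_vocab::impl::load(), and the regex case in llm_tokenizer_bpe. The string the loader matches is read from the model's GGUF metadata under "tokenizer.ggml.pre"; a sketch of inspecting it with the public gguf API (error handling elided, helper name is illustrative):

    #include "gguf.h"

    // sketch: report which pre-tokenizer a GGUF file requests
    static const char * get_tokenizer_pre(const struct gguf_context * ctx) {
        const int64_t kid = gguf_find_key(ctx, "tokenizer.ggml.pre");
        return kid >= 0 ? gguf_get_val_str(ctx, kid) : "default";
    }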
package/src/llama.cpp/src/unicode.h

@@ -4,6 +4,7 @@
 #include <string>
 #include <vector>

+// TODO: reimplement this structure in endian-independent way
 struct unicode_cpt_flags {
     enum {
         UNDEFINED       = 0x0001,
@@ -15,6 +16,10 @@ struct unicode_cpt_flags {
         SYMBOL          = 0x0040, // regex: \p{S}
         CONTROL         = 0x0080, // regex: \p{C}
         MASK_CATEGORIES = 0x00FF,
+        WHITESPACE      = 0x0100,
+        LOWERCASE       = 0x0200,
+        UPPERCASE       = 0x0400,
+        NFD             = 0x0800,
     };

     // codepoint type
@@ -34,11 +39,49 @@
     // decode from uint16
     inline unicode_cpt_flags(const uint16_t flags = 0) {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         *reinterpret_cast<uint16_t*>(this) = flags;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        is_undefined   = (flags & UNDEFINED)   ? 1 : 0;
+        is_number      = (flags & NUMBER)      ? 1 : 0;
+        is_letter      = (flags & LETTER)      ? 1 : 0;
+        is_separator   = (flags & SEPARATOR)   ? 1 : 0;
+        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
+        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
+        is_symbol      = (flags & SYMBOL)      ? 1 : 0;
+        is_control     = (flags & CONTROL)     ? 1 : 0;
+        is_whitespace  = (flags & WHITESPACE)  ? 1 : 0;
+        is_lowercase   = (flags & LOWERCASE)   ? 1 : 0;
+        is_uppercase   = (flags & UPPERCASE)   ? 1 : 0;
+        is_nfd         = (flags & NFD)         ? 1 : 0;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
     }

     inline uint16_t as_uint() const {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         return *reinterpret_cast<const uint16_t*>(this);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        uint16_t result =
+              is_undefined   * UNDEFINED
+            + is_number      * NUMBER
+            + is_letter      * LETTER
+            + is_separator   * SEPARATOR
+            + is_accent_mark * ACCENT_MARK
+            + is_punctuation * PUNCTUATION
+            + is_symbol      * SYMBOL
+            + is_control     * CONTROL
+            + is_whitespace  * WHITESPACE
+            + is_lowercase   * LOWERCASE
+            + is_uppercase   * UPPERCASE
+            + is_nfd         * NFD
+            ;
+
+        return result;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
     }

     inline uint16_t category_flag() const {
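Note: unicode_cpt_flags keeps its flags in one-bit bitfields whose in-memory order matches the enum bit values only on little-endian targets, so the reinterpret_cast shortcut is now guarded and big-endian builds decode and re-encode field by field. A small round-trip check of the invariant both paths must preserve, assuming the struct layout shown in this diff:

    #include <cassert>
    #include "unicode.h"

    int main() {
        const uint16_t raw = unicode_cpt_flags::LETTER | unicode_cpt_flags::LOWERCASE;
        const unicode_cpt_flags f(raw); // decode: bitfields populated per byte order
        assert(f.is_letter && f.is_lowercase);
        assert(f.as_uint() == raw);     // encode must round-trip on either endianness
        return 0;
    }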