@fugood/llama.node 1.2.0-rc.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +16 -15
  2. package/src/llama.cpp/CMakeLists.txt +7 -0
  3. package/src/llama.cpp/common/arg.cpp +141 -21
  4. package/src/llama.cpp/common/chat.cpp +139 -0
  5. package/src/llama.cpp/common/chat.h +1 -0
  6. package/src/llama.cpp/common/common.h +23 -8
  7. package/src/llama.cpp/common/json-schema-to-grammar.cpp +28 -7
  8. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  9. package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
  10. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  11. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -6
  12. package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
  13. package/src/llama.cpp/ggml/include/ggml.h +10 -5
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -1
  21. package/src/llama.cpp/src/llama-arch.cpp +44 -10
  22. package/src/llama.cpp/src/llama-arch.h +9 -0
  23. package/src/llama.cpp/src/llama-chat.cpp +17 -0
  24. package/src/llama.cpp/src/llama-chat.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +13 -11
  26. package/src/llama.cpp/src/llama-graph.cpp +6 -5
  27. package/src/llama.cpp/src/llama-hparams.h +14 -3
  28. package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
  29. package/src/llama.cpp/src/llama-kv-cache.h +8 -0
  30. package/src/llama.cpp/src/llama-model.cpp +386 -140
  31. package/src/llama.cpp/src/llama-model.h +3 -0
  32. package/src/llama.cpp/src/llama-quant.cpp +6 -4
  33. package/src/llama.cpp/src/llama-vocab.cpp +13 -1
  34. package/src/llama.cpp/src/llama-vocab.h +1 -0
  35. package/src/llama.cpp/src/llama.cpp +53 -10
package/src/llama.cpp/src/llama-model.h
@@ -28,6 +28,7 @@ enum llm_type {
     LLM_TYPE_80M,
     LLM_TYPE_109M,
     LLM_TYPE_137M,
+    LLM_TYPE_140M,
     LLM_TYPE_160M,
     LLM_TYPE_190M,
     LLM_TYPE_220M,
@@ -36,6 +37,7 @@ enum llm_type {
     LLM_TYPE_270M,
     LLM_TYPE_335M,
     LLM_TYPE_350M,
+    LLM_TYPE_360M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
     LLM_TYPE_475M,
@@ -43,6 +45,7 @@ enum llm_type {
     LLM_TYPE_700M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
+    LLM_TYPE_950M,
     LLM_TYPE_0_3B,
     LLM_TYPE_0_5B,
     LLM_TYPE_0_6B,
package/src/llama.cpp/src/llama-quant.cpp
@@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // attention layers have a non-zero number of kv heads
         int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            n_attn_layer *= 3;
+            // now n_attn_layer is the number of attention layers in the encoder
+            // for each decoder block, there are 2 attention layers
+            n_attn_layer += 2 * model.hparams.dec_n_layer;
         }
         GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
     }
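Note on the counting change above: the old "n_attn_layer *= 3" effectively treated the decoder as having the same number of blocks as the encoder (one encoder self-attention plus a decoder self-attention and cross-attention per block), while the new form keeps the encoder count and adds two attention layers per decoder block, so models with different encoder and decoder depths are counted correctly. A minimal standalone sketch of the arithmetic with hypothetical hparams values (not code from this package):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // hypothetical encoder-decoder hparams
        int32_t n_layer     = 12; // encoder blocks with a non-zero KV head count
        int32_t dec_n_layer = 6;  // decoder blocks (may differ from the encoder)

        int32_t old_count = n_layer * 3;               // 36 - wrong when dec_n_layer != n_layer
        int32_t new_count = n_layer + 2 * dec_n_layer; // 24 - encoder self-attn + 2 per decoder block

        printf("old = %d, new = %d\n", old_count, new_count);
        return 0;
    }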
@@ -920,7 +922,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             new_type = tensor->type;
             new_data = tensor->data;
             new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
         } else {
             const int64_t nelements = ggml_nelements(tensor);

@@ -1037,8 +1039,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
     close_ofstream();

-    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);

     if (qs.n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
package/src/llama.cpp/src/llama-vocab.cpp
@@ -434,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GROK_2:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1955,7 +1962,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
             clean_spaces = false;
         } else if (
-                tokenizer_pre == "bailingmoe") {
+                tokenizer_pre == "bailingmoe" ||
+                tokenizer_pre == "llada-moe") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
             clean_spaces = false;
         } else if (
@@ -1974,6 +1982,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "kimi-k2") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "grok-2") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
package/src/llama.cpp/src/llama-vocab.h
@@ -47,6 +47,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
     LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
     LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
+    LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
 };

 struct LLM_KV;
package/src/llama.cpp/src/llama.cpp
@@ -59,6 +59,7 @@ bool llama_supports_mlock(void) {

 bool llama_supports_gpu_offload(void) {
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
            llama_supports_rpc();
 }

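With the integrated-GPU device type included above, llama_supports_gpu_offload() now also reports true on systems that only expose an iGPU backend (or an RPC server). A minimal usage sketch against the public llama.h API (hypothetical model path, not code from this package):

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_model_params params = llama_model_default_params();
        // offload all layers only if some GPU (discrete, integrated, or RPC) is available
        params.n_gpu_layers = llama_supports_gpu_offload() ? 999 : 0;

        // hypothetical GGUF path
        llama_model * model = llama_model_load_from_file("model.gguf", params);
        if (model == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }
        llama_model_free(model);
        return 0;
    }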
@@ -83,7 +84,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
         GGML_ASSERT(dev && "CPU backend is not loaded");
         auto * reg = ggml_backend_dev_backend_reg(dev);
         auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
-        numa_init_fn(numa);
+        if (numa_init_fn) {
+            numa_init_fn(numa);
+        }
     }
 }

@@ -182,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        // default device selection
+
+        // build list of available devices
+        std::vector<ggml_backend_dev_t> gpus;
+        std::vector<ggml_backend_dev_t> igpus;
         std::vector<ggml_backend_dev_t> rpc_servers;
-        // use all available devices
+
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             switch (ggml_backend_dev_type(dev)) {
@@ -192,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                     // skip CPU backends since they are handled separately
                     break;

-                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                     if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                         rpc_servers.push_back(dev);
                     } else {
-                        model->devices.push_back(dev);
+                        // check if there is already a GPU with the same device id
+                        ggml_backend_dev_props props;
+                        ggml_backend_dev_get_props(dev, &props);
+                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                            ggml_backend_dev_props d_props;
+                            ggml_backend_dev_get_props(d, &d_props);
+                            if (props.device_id && d_props.device_id) {
+                                return strcmp(props.device_id, d_props.device_id) == 0;
+                            }
+                            return false;
+                        });
+
+                        if (it != gpus.end()) {
+                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                    __func__,
+                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                    props.device_id ? props.device_id : "unknown id",
+                                    ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                        } else {
+                            gpus.push_back(dev);
+                        }
                     }
                     break;
+                }
+
+                case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                    igpus.push_back(dev);
+                    break;
             }
         }
-        // add RPC servers at the front of the list
-        if (!rpc_servers.empty()) {
-            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add RPC servers at the front of the list to minimize network transfers
+        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add GPUs
+        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+        // add integrated GPUs only if no other devices were found
+        if (model->devices.empty()) {
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }

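The default selection above boils down to a simple policy: RPC servers first (to minimize network transfers), then discrete GPUs with duplicates filtered out by device id (the same physical card can be reported by more than one backend, e.g. CUDA and Vulkan), and integrated GPUs only when nothing else is available. A self-contained sketch of that policy using hypothetical device records rather than the ggml API:

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    // hypothetical stand-in for a backend device entry
    struct dev_info {
        std::string name;
        std::string device_id; // empty when the backend does not report one
        bool is_rpc  = false;
        bool is_igpu = false;
    };

    // returns the devices to use, in priority order
    std::vector<dev_info> select_devices(const std::vector<dev_info> & all) {
        std::vector<dev_info> rpc, gpus, igpus;
        for (const auto & d : all) {
            if (d.is_rpc) {
                rpc.push_back(d);
            } else if (d.is_igpu) {
                igpus.push_back(d);
            } else {
                // skip a GPU whose id matches one we already selected
                auto dup = std::find_if(gpus.begin(), gpus.end(), [&](const dev_info & g) {
                    return !d.device_id.empty() && g.device_id == d.device_id;
                });
                if (dup == gpus.end()) {
                    gpus.push_back(d);
                }
            }
        }
        std::vector<dev_info> out;
        out.insert(out.end(), rpc.begin(), rpc.end());   // RPC servers first
        out.insert(out.end(), gpus.begin(), gpus.end()); // then deduplicated discrete GPUs
        if (out.empty()) {
            out.insert(out.end(), igpus.begin(), igpus.end()); // iGPUs only as a last resort
        }
        return out;
    }

    int main() {
        // the same physical card exposed by two backends shares a device id
        std::vector<dev_info> all = {
            {"CUDA0",   "GPU-1234", false, false},
            {"Vulkan0", "GPU-1234", false, false}, // duplicate of CUDA0, skipped
            {"Vulkan1", "",         false, true }, // integrated GPU, fallback only
        };
        for (const auto & d : select_devices(all)) {
            std::cout << d.name << "\n"; // prints: CUDA0
        }
        return 0;
    }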
@@ -225,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     }

     for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                props.device_id ? props.device_id : "unknown id",
+                props.memory_free/1024/1024);
     }

     const int status = llama_model_load(path_model, splits, *model, params);
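The per-device log line now reads everything from ggml_backend_dev_get_props() instead of ggml_backend_dev_memory(), which also exposes the new device id. A minimal enumeration sketch built on the same ggml-backend API (assumes the device_id field added in the ggml-backend.h change listed above; link against ggml):

    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);

            ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);

            printf("%s (%s) (%s) - %zu MiB free of %zu MiB\n",
                   props.name, props.description,
                   props.device_id ? props.device_id : "unknown id",
                   props.memory_free / 1024 / 1024,
                   props.memory_total / 1024 / 1024);
        }
        return 0;
    }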