@fugood/llama.node 1.3.7 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/lib/binding.js +18 -1
  2. package/lib/binding.ts +19 -1
  3. package/lib/index.js +3 -3
  4. package/lib/index.ts +1 -1
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +7 -7
  7. package/src/LlamaCompletionWorker.cpp +2 -2
  8. package/src/llama.cpp/common/arg.cpp +27 -2
  9. package/src/llama.cpp/common/chat-parser.cpp +968 -0
  10. package/src/llama.cpp/common/chat.cpp +0 -952
  11. package/src/llama.cpp/common/common.cpp +55 -0
  12. package/src/llama.cpp/common/common.h +18 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
  14. package/src/llama.cpp/ggml/CMakeLists.txt +6 -4
  15. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
  16. package/src/llama.cpp/ggml/include/ggml.h +12 -4
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +26 -4
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -15
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +721 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -2
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +71 -4
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +243 -4
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +84 -85
  28. package/src/llama.cpp/include/llama.h +18 -0
  29. package/src/llama.cpp/src/CMakeLists.txt +2 -0
  30. package/src/llama.cpp/src/llama-arch.cpp +95 -16
  31. package/src/llama.cpp/src/llama-arch.h +15 -0
  32. package/src/llama.cpp/src/llama-context.cpp +7 -3
  33. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  34. package/src/llama.cpp/src/llama-hparams.h +1 -1
  35. package/src/llama.cpp/src/llama-model.cpp +141 -6
  36. package/src/llama.cpp/src/llama-model.h +4 -0
  37. package/src/llama.cpp/src/llama-quant.cpp +13 -5
  38. package/src/llama.cpp/src/models/lfm2.cpp +5 -3
  39. package/src/llama.cpp/src/models/models.h +55 -1
  40. package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
  41. package/src/llama.cpp/src/models/rnd1.cpp +126 -0

package/src/llama.cpp/common/common.cpp
@@ -8,6 +8,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -949,6 +950,58 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
 // Model utils
 //
 
+static inline void common_init_sampler_from_model(
+        const llama_model * model,
+        common_params_sampling & sparams) {
+
+    const uint64_t config = sparams.user_sampling_config;
+
+    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
+        if (config & user_config) return;
+
+        char buf[64] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            int32_t v = strtol(buf, &end, 10);
+            if (end && end != buf) dst = v;
+        }
+    };
+
+    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
+        if (config & user_config) return;
+
+        char buf[128] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            float v = strtof(buf, &end);
+            if (end && end != buf) dst = v;
+        }
+    };
+
+    // Sampling sequence
+    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
+        char buf[512] = {0};
+        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
+            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
+            if (!sampler_names.empty()) {
+                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            }
+        }
+    }
+
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K), sparams.top_k, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P), sparams.top_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P), sparams.min_p, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD), sparams.xtc_threshold, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP), sparams.temp, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N), sparams.penalty_last_n, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT), sparams.penalty_repeat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT), sparams.mirostat, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU), sparams.mirostat_tau, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
+}
+
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
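The helper above leans on two llama.h additions in this release (file 28 in the list): the LLAMA_MODEL_META_KEY_SAMPLING_* keys and llama_model_meta_key_str(), which maps a key to its GGUF metadata name. A minimal sketch of reading one of these keys directly, mirroring the `> 0` convention used in the hunk; the exact llama.h declarations are not shown in this diff, so treat the key-to-string signature as an assumption:

    // Sketch: query a model-suggested sampling value straight from GGUF metadata.
    // Assumes llama_model_meta_key_str() returns the GGUF key string, as its use
    // in common_init_sampler_from_model() above suggests.
    #include "llama.h"
    #include <cstdio>

    static void print_suggested_temp(const llama_model * model) {
        char buf[128] = {0};
        const char * key = llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP);
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            printf("model-suggested temperature: %s\n", buf);
        }
    }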
@@ -960,6 +1013,8 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    common_init_sampler_from_model(model, params.sampling);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     auto cparams = common_context_params_to_llama(params);
package/src/llama.cpp/common/common.h
@@ -140,6 +140,22 @@ struct common_grammar_trigger {
     llama_token token = LLAMA_TOKEN_NULL;
 };
 
+enum common_params_sampling_config : uint64_t {
+    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1 << 0,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1 << 1,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1 << 2,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1 << 3,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1 << 5,
+    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1 << 6,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1 << 7,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1 << 8,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1 << 9,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1 << 10,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1 << 11,
+};
+
+
 // sampling parameters
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -172,6 +188,8 @@ struct common_params_sampling {
     bool no_perf = false; // disable performance metrics
     bool timing_per_token = false;
 
+    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
+
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
 
 
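Taken together with the common.cpp hunk above, the intent is that user_sampling_config records which sampling fields the caller set explicitly, so common_init_sampler_from_model() only fills in the fields the user left untouched. A minimal sketch of the expected call pattern; the real CLI wiring lives in common/arg.cpp (shown only partially in this diff), so the function below is illustrative, not the package's actual code:

    // Illustrative only: mark --top-k and --temp as user-specified so that
    // common_init_sampler_from_model() keeps them and only fills the remaining
    // fields from the model's GGUF metadata (if present).
    #include "common.h"

    static void apply_cli_sampling_overrides(common_params & params) {
        params.sampling.top_k = 40;
        params.sampling.user_sampling_config |= COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;

        params.sampling.temp = 0.2f;
        params.sampling.user_sampling_config |= COMMON_PARAMS_SAMPLING_CONFIG_TEMP;

        // Fields whose bit is not set (e.g. top_p, min_p) may still be
        // overridden by model metadata inside common_init_from_params().
    }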
package/src/llama.cpp/common/json-schema-to-grammar.cpp
@@ -268,10 +268,10 @@ static bool is_reserved_name(const std::string & name) {
 }
 
 std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
-std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
+std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
 std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
 std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
-    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
+    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
 };
 
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
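The two-line change above adds '\\' to the set of characters escaped when a JSON-schema string constant is turned into a GBNF string literal, alongside the existing \r, \n and \" cases. A standalone sketch of that escaping step, assuming a simple character-by-character walk rather than the regex the real helper uses inside json-schema-to-grammar.cpp:

    // Standalone sketch of grammar-literal escaping with the new '\\' entry.
    #include <iostream>
    #include <string>
    #include <unordered_map>

    static std::string escape_grammar_literal(const std::string & s) {
        static const std::unordered_map<char, std::string> esc = {
            {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'\\', "\\\\"},
        };
        std::string out;
        for (char c : s) {
            auto it = esc.find(c);
            out += (it != esc.end()) ? it->second : std::string(1, c);
        }
        return out;
    }

    int main() {
        // A schema const like C:\temp now becomes the GBNF literal "C:\\temp"
        std::cout << '"' << escape_grammar_literal("C:\\temp\n") << '"' << std::endl;
    }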
package/src/llama.cpp/ggml/CMakeLists.txt
@@ -25,16 +25,17 @@ if(GIT_EXE)
     )
 endif()
 
-# Build the version string with optional dirty flag
 set(GGML_VERSION "${GGML_VERSION_BASE}")
-if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
-    set(GGML_VERSION "${GGML_VERSION}-dirty")
-endif()
 
 if(NOT GGML_BUILD_COMMIT)
     set(GGML_BUILD_COMMIT "unknown")
 endif()
 
+# Build the commit string with optional dirty flag
+if(DEFINED GGML_GIT_DIRTY AND GGML_GIT_DIRTY EQUAL 1)
+    set(GGML_BUILD_COMMIT "${GGML_BUILD_COMMIT}-dirty")
+endif()
+
 include(CheckIncludeFileCXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -182,6 +183,7 @@ endif()
 # ggml core
 set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
 option(GGML_CPU "ggml: enable CPU backend" ON)
+option(GGML_SCHED_NO_REALLOC "ggml: disallow reallocations in ggml-alloc (for debugging)" OFF)
 
 # 3rd party libs / backends
 option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
package/src/llama.cpp/ggml/include/ggml-rpc.h
@@ -8,7 +8,7 @@ extern "C" {
 #endif
 
 #define RPC_PROTO_MAJOR_VERSION 3
-#define RPC_PROTO_MINOR_VERSION 0
+#define RPC_PROTO_MINOR_VERSION 5
 #define RPC_PROTO_PATCH_VERSION 0
 #define GGML_RPC_MAX_SERVERS 16
 
package/src/llama.cpp/ggml/include/ggml.h
@@ -530,6 +530,7 @@ extern "C" {
         GGML_OP_ARANGE,
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
+        GGML_OP_TOP_K,
         GGML_OP_LEAKY_RELU,
         GGML_OP_TRI,
         GGML_OP_FILL,
@@ -2258,18 +2259,25 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_sort_order  order);
 
-    GGML_API struct ggml_tensor * ggml_arange(
+    // similar to ggml_top_k but implemented as `argsort` + `view`
+    GGML_API struct ggml_tensor * ggml_argsort_top_k(
             struct ggml_context * ctx,
-            float                 start,
-            float                 stop,
-            float                 step);
+            struct ggml_tensor  * a,
+            int                   k);
 
     // top k elements per row
+    // note: the resulting top k indices are in no particular order
     GGML_API struct ggml_tensor * ggml_top_k(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   k);
 
+    GGML_API struct ggml_tensor * ggml_arange(
+            struct ggml_context * ctx,
+            float                 start,
+            float                 stop,
+            float                 step);
+
 #define GGML_KQ_MASK_PAD 64
 
     // q: [n_embd_k, n_batch, n_head, ne3 ]
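These header changes pair the new GGML_OP_TOP_K op with a reshuffled API: ggml_top_k() now returns the top-k indices per row without guaranteeing their order, while the new ggml_argsort_top_k() keeps the old argsort-plus-view behaviour (ggml_arange() merely moves below them). A minimal graph-construction sketch under those assumptions; tensor creation, data upload and the compute call are elided:

    // Sketch: select the k most likely entries per row with the two top-k helpers.
    // Assumes a plain CPU ggml context; error handling omitted.
    #include "ggml.h"

    static struct ggml_tensor * build_top_k_example(struct ggml_context * ctx,
                                                    struct ggml_tensor  * logits, // F32, one row per sequence
                                                    int                   k) {
        // dedicated op: indices of the k largest values per row,
        // returned in no particular order (per the note above)
        struct ggml_tensor * idx_unordered = ggml_top_k(ctx, logits, k);

        // previous behaviour kept as a separate helper: full argsort + view,
        // so the surviving indices are presumably ordered by value
        struct ggml_tensor * idx_sorted = ggml_argsort_top_k(ctx, logits, k);

        (void) idx_sorted;
        return idx_unordered;
    }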
package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -221,6 +221,10 @@ if (GGML_BACKEND_DL)
     target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
 endif()
 
+if (GGML_SCHED_NO_REALLOC)
+    target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
+endif()
+
 add_library(ggml
             ggml-backend-reg.cpp)
 add_library(ggml::ggml ALIAS ggml)
@@ -270,10 +274,13 @@ function(ggml_add_backend_library backend)
     endif()
 
     # Set versioning properties for all backend libraries
-    set_target_properties(${backend} PROPERTIES
-        VERSION ${GGML_VERSION}
-        SOVERSION ${GGML_VERSION_MAJOR}
-    )
+    # Building a MODULE library with a version is not supported on macOS (https://gitlab.kitware.com/cmake/cmake/-/issues/20782)
+    if (NOT (APPLE AND GGML_BACKEND_DL))
+        set_target_properties(${backend} PROPERTIES
+            VERSION ${GGML_VERSION}
+            SOVERSION ${GGML_VERSION_MAJOR}
+        )
+    endif()
 
     if(NOT GGML_AVAILABLE_BACKENDS)
         set(GGML_AVAILABLE_BACKENDS "${backend}"
@@ -328,6 +335,14 @@ function(ggml_add_cpu_backend_variant tag_name)
             set(GGML_INTERNAL_${feat} OFF)
         endforeach()
 
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        foreach (feat RVV)
+            set(GGML_INTERNAL_${feat} OFF)
+        endforeach()
+
         foreach (feat ${ARGN})
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
@@ -402,6 +417,13 @@ if (GGML_CPU_ALL_VARIANTS)
         else()
             message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
         endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(riscv64_0)
+            ggml_add_cpu_backend_variant(riscv64_v RVV)
+        else()
+            message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
     else()
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
     endif()
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
@@ -224,7 +224,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         include(CheckCXXSourceCompiles)
         set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-        set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}")
+        string(REPLACE ";" " " ARCH_FLAGS_STR "${ARCH_FLAGS}")
+        set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS_STR}")
         foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
             set(ARM_FEATURE "HAVE_${feature}")
             check_cxx_source_compiles(
@@ -452,22 +453,35 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             ggml-cpu/spacemit/ime_kernels.h
         )
         endif()
-        set(MARCH_STR "rv64gc")
-        if (GGML_RV_ZFH)
-            string(APPEND MARCH_STR "_zfh")
-        endif()
-        if (GGML_XTHEADVECTOR)
-            string(APPEND MARCH_STR "_xtheadvector")
-        elseif (GGML_RVV)
-            string(APPEND MARCH_STR "_v")
-            if (GGML_RV_ZVFH)
-                string(APPEND MARCH_STR "_zvfh")
+        if(NOT GGML_CPU_ALL_VARIANTS)
+            set(MARCH_STR "rv64gc")
+            if (GGML_RV_ZFH)
+                string(APPEND MARCH_STR "_zfh")
             endif()
+            if (GGML_XTHEADVECTOR)
+                string(APPEND MARCH_STR "_xtheadvector")
+            elseif (GGML_RVV)
+                string(APPEND MARCH_STR "_v")
+                if (GGML_RV_ZVFH)
+                    string(APPEND MARCH_STR "_zvfh")
+                endif()
+            endif()
+            if (GGML_RV_ZICBOP)
+                string(APPEND MARCH_STR "_zicbop")
+            endif()
+            list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
+        else()
+            # Begin with the lowest baseline
+            set(ARCH_DEFINITIONS "")
+
+            if (GGML_INTERNAL_RVV)
+                message(STATUS "RVV enabled")
+                list(APPEND ARCH_DEFINITIONS GGML_USE_RVV)
+                list(APPEND ARCH_FLAGS -march=rv64gc_v -mabi=lp64d)
+            endif()
+
+            ggml_add_cpu_backend_features(${GGML_CPU_NAME} riscv ${ARCH_DEFINITIONS})
         endif()
-        if (GGML_RV_ZICBOP)
-            string(APPEND MARCH_STR "_zicbop")
-        endif()
-        list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
         message(STATUS "s390x detected")
         list(APPEND GGML_CPU_SOURCES