@fugood/llama.node 1.0.2 → 1.0.3

Files changed (39)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/arg.cpp +7 -0
  4. package/src/llama.cpp/common/common.h +1 -0
  5. package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
  6. package/src/llama.cpp/ggml/include/ggml.h +91 -10
  7. package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  8. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  9. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  10. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +726 -155
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +9 -9
  14. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -9
  15. package/src/llama.cpp/include/llama.h +1 -0
  16. package/src/llama.cpp/src/llama-arch.cpp +90 -2
  17. package/src/llama.cpp/src/llama-arch.h +6 -0
  18. package/src/llama.cpp/src/llama-batch.cpp +27 -1
  19. package/src/llama.cpp/src/llama-batch.h +8 -1
  20. package/src/llama.cpp/src/llama-chat.cpp +15 -0
  21. package/src/llama.cpp/src/llama-chat.h +1 -0
  22. package/src/llama.cpp/src/llama-graph.cpp +64 -50
  23. package/src/llama.cpp/src/llama-graph.h +41 -16
  24. package/src/llama.cpp/src/llama-hparams.cpp +2 -1
  25. package/src/llama.cpp/src/llama-hparams.h +1 -0
  26. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  27. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  28. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  29. package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  30. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  31. package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  32. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  33. package/src/llama.cpp/src/llama-memory-recurrent.cpp +15 -2
  34. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  35. package/src/llama.cpp/src/llama-memory.h +3 -0
  36. package/src/llama.cpp/src/llama-model.cpp +1234 -248
  37. package/src/llama.cpp/src/llama-model.h +2 -0
  38. package/src/llama.cpp/src/llama-vocab.cpp +8 -1
  39. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.2",
+  "version": "1.0.3",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -70,19 +70,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.2",
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.2",
-    "@fugood/node-llama-linux-x64-cuda": "1.0.2",
-    "@fugood/node-llama-linux-arm64": "1.0.2",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.2",
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.2",
-    "@fugood/node-llama-win32-x64": "1.0.2",
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.2",
-    "@fugood/node-llama-win32-x64-cuda": "1.0.2",
-    "@fugood/node-llama-win32-arm64": "1.0.2",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.2",
-    "@fugood/node-llama-darwin-x64": "1.0.2",
-    "@fugood/node-llama-darwin-arm64": "1.0.2"
+    "@fugood/node-llama-linux-x64": "1.0.3",
+    "@fugood/node-llama-linux-x64-vulkan": "1.0.3",
+    "@fugood/node-llama-linux-x64-cuda": "1.0.3",
+    "@fugood/node-llama-linux-arm64": "1.0.3",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.0.3",
+    "@fugood/node-llama-linux-arm64-cuda": "1.0.3",
+    "@fugood/node-llama-win32-x64": "1.0.3",
+    "@fugood/node-llama-win32-x64-vulkan": "1.0.3",
+    "@fugood/node-llama-win32-x64-cuda": "1.0.3",
+    "@fugood/node-llama-win32-arm64": "1.0.3",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.0.3",
+    "@fugood/node-llama-darwin-x64": "1.0.3",
+    "@fugood/node-llama-darwin-arm64": "1.0.3"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -120,7 +120,6 @@ endfunction()

 llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
 llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
-llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--api-prefix"}, "PREFIX",
+        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.api_prefix = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
         {"--no-webui"},
         string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
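Note: with this option the server can be mounted under a sub-path, e.g. --api-prefix /llama on the command line or LLAMA_ARG_API_PREFIX=/llama in the environment (no trailing slash, per the help text above). Which routes honor the prefix is not visible in this diff.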
package/src/llama.cpp/common/common.h CHANGED
@@ -371,6 +371,7 @@ struct common_params {

     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
+    std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
 option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
 option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
-option(GGML_KOMPUTE "ggml: use Kompute" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-cann.h
     include/ggml-cpp.h
     include/ggml-cuda.h
-    include/ggml-kompute.h
     include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h
@@ -360,6 +358,13 @@ write_basic_package_version_file(
     VERSION ${GGML_INSTALL_VERSION}
     COMPATIBILITY SameMajorVersion)

+target_compile_definitions(ggml-base PRIVATE
+    GGML_VERSION="${GGML_INSTALL_VERSION}"
+    GGML_COMMIT="${GGML_BUILD_COMMIT}"
+)
+message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
+message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
+
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
package/src/llama.cpp/ggml/include/ggml.h CHANGED
@@ -314,6 +314,13 @@
 extern "C" {
 #endif

+    // Function type used in fatal error callbacks
+    typedef void (*ggml_abort_callback_t)(const char * error_message);
+
+    // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
+    // Returns the old callback for chaining
+    GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
+
     GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
     GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

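Note: a minimal usage sketch for the new hook, assuming only the two declarations added above; the handler and variable names are illustrative, not part of the diff:

#include "ggml.h"
#include <stdio.h>

static ggml_abort_callback_t prev_abort = NULL;

static void my_abort_handler(const char * error_message) {
    fprintf(stderr, "ggml fatal: %s\n", error_message);
    if (prev_abort) {
        prev_abort(error_message); // chain to the previously installed callback
    }
}

static void install_abort_handler(void) {
    // returns the old callback, which we keep for chaining
    prev_abort = ggml_set_abort_callback(my_abort_handler);
}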
@@ -482,12 +489,13 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
         GGML_OP_POOL_2D_BACK,
-        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_UPSCALE,
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
         GGML_OP_ROLL,
@@ -549,6 +557,8 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_GEGLU_ERF,
+        GGML_GLU_OP_GEGLU_QUICK,

         GGML_GLU_OP_COUNT,
     };
@@ -638,6 +648,9 @@ extern "C" {

     // misc

+    GGML_API const char * ggml_version(void);
+    GGML_API const char * ggml_commit(void);
+
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
     GGML_API int64_t ggml_time_ms(void);
     GGML_API int64_t ggml_time_us(void);
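Note: these accessors report the GGML_VERSION / GGML_COMMIT compile definitions wired up in ggml/CMakeLists.txt above; a trivial sketch:

#include "ggml.h"
#include <stdio.h>

int main(void) {
    // strings are baked in from GGML_INSTALL_VERSION / GGML_BUILD_COMMIT at configure time
    printf("ggml version: %s\n", ggml_version());
    printf("ggml commit:  %s\n", ggml_commit());
    return 0;
}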
@@ -1136,6 +1149,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_geglu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // A: n columns, r rows,
     // B: n columns, r rows,
     GGML_API struct ggml_tensor * ggml_glu_split(
@@ -1159,6 +1188,16 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_geglu_erf_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
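Note: a hedged sketch of a gated FFN block built on the new split variant; the weight tensors and shapes are assumptions for illustration, not part of the diff:

#include "ggml.h"

static struct ggml_tensor * ffn_geglu_erf(
        struct ggml_context * ctx,
        struct ggml_tensor  * x,        // input activations
        struct ggml_tensor  * w_gate,   // gate projection
        struct ggml_tensor  * w_up,     // up projection
        struct ggml_tensor  * w_down) { // down projection
    struct ggml_tensor * gate = ggml_mul_mat(ctx, w_gate, x);
    struct ggml_tensor * up   = ggml_mul_mat(ctx, w_up,   x);
    // erf-based GELU(gate) * up, fused in one op
    struct ggml_tensor * act  = ggml_geglu_erf_split(ctx, gate, up);
    return ggml_mul_mat(ctx, w_down, act);
}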
@@ -1502,8 +1541,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    // a    [ne0, ne01, ne02, ne03]
+    // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
+    //
+    // broadcast:
+    //   ne02 % ne12 == 0
+    //   ne03 % ne13 == 0
+    //
     // fused soft_max(a*scale + mask*(ALiBi slope))
-    // mask is optional
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
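Note: a sketch of the broadcast rule documented above; the shapes in the comments are assumptions for illustration. A mask with ne12 == ne13 == 1 is shared across all heads and sequences, since ne02 % 1 == 0 and ne03 % 1 == 0:

#include "ggml.h"

static struct ggml_tensor * masked_softmax(
        struct ggml_context * ctx,
        struct ggml_tensor  * kq,      // [n_kv, n_tokens, n_head, 1]
        struct ggml_tensor  * kq_mask, // [n_kv, n_tokens_pad, 1, 1]
        float                 scale) {
    return ggml_soft_max_ext(ctx, kq, kq_mask, scale, /*max_bias=*/0.0f);
}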
@@ -1813,6 +1858,17 @@ extern "C" {
             struct ggml_tensor  * b,
             int                   stride);

+    GGML_API struct ggml_tensor * ggml_conv_2d_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+            struct ggml_tensor  * b,   // input data [W, H, C, N]
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
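Note: a hedged sketch of a "same"-padded 3x3 convolution via the new direct op, following the parameter comments above; the tensor names are illustrative:

#include "ggml.h"

static struct ggml_tensor * conv3x3_same(
        struct ggml_context * ctx,
        struct ggml_tensor  * kernel,  // [3, 3, IC, OC]
        struct ggml_tensor  * input) { // [W, H, IC, N]
    return ggml_conv_2d_direct(ctx, kernel, input,
                               /*s0=*/1, /*s1=*/1,  // stride
                               /*p0=*/1, /*p1=*/1,  // padding
                               /*d0=*/1, /*d1=*/1); // dilation
}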
@@ -1855,6 +1911,12 @@ extern "C" {
     enum ggml_scale_mode {
         GGML_SCALE_MODE_NEAREST  = 0,
         GGML_SCALE_MODE_BILINEAR = 1,
+
+        GGML_SCALE_MODE_COUNT
+    };
+
+    enum ggml_scale_flag {
+        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
     };

     // interpolate
@@ -1867,14 +1929,26 @@ extern "C" {

     // interpolate
     // interpolate scale to specified dimensions
-    GGML_API struct ggml_tensor * ggml_upscale_ext(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   ne0,
             int                   ne1,
             int                   ne2,
             int                   ne3,
-            enum ggml_scale_mode  mode);
+            enum ggml_scale_mode  mode),
+        "use ggml_interpolate instead");
+
+    // Up- or downsamples the input to the specified size.
+    // 2D scale modes (eg. bilinear) are applied to the first two dimensions.
+    GGML_API struct ggml_tensor * ggml_interpolate(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            uint32_t              mode); // ggml_scale_mode [ | ggml_scale_flag...]

     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
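Note: a hedged sketch combining a scale mode with the new flag, per the "ggml_scale_mode [ | ggml_scale_flag...]" comment above; the target size is arbitrary:

#include "ggml.h"

static struct ggml_tensor * resize_bilinear_224(
        struct ggml_context * ctx,
        struct ggml_tensor  * img) { // [W, H, C, N]
    // bilinear resize of the first two dims, keeping C and N
    return ggml_interpolate(ctx, img,
            /*ne0=*/224, /*ne1=*/224, img->ne[2], img->ne[3],
            GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS);
}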
@@ -1937,11 +2011,17 @@ extern "C" {

 #define GGML_KQ_MASK_PAD 64

-    // q:    [n_embd_k, n_batch,     n_head,    1]
-    // k:    [n_embd_k, n_kv,        n_head_kv, 1]
-    // v:    [n_embd_v, n_kv,        n_head_kv, 1] !! not transposed !!
-    // mask: [n_kv,     n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd_v, n_head,      n_batch,   1] !! permuted !!
+    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
+    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
+    //
+    // broadcast:
+    //   n_head % n_head_kv == 0
+    //   n_head % ne32 == 0
+    //   ne3 % ne33 == 0
+    //
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
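Note: the mask's new ne32/ne33 dimensions mean per-head (n_head % ne32 == 0) and per-sequence (ne3 % ne33 == 0) masks can now be broadcast through flash attention, mirroring the ggml_soft_max_ext change above; passing ne32 == ne33 == 1 keeps the old single-mask behavior.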
@@ -1980,7 +2060,8 @@ extern "C" {
             struct ggml_tensor  * dt,
             struct ggml_tensor  * A,
             struct ggml_tensor  * B,
-            struct ggml_tensor  * C);
+            struct ggml_tensor  * C,
+            struct ggml_tensor  * ids);

     // partition into non-overlapping windows with padding if needed
     // example:
package/src/llama.cpp/ggml/src/CMakeLists.txt CHANGED
@@ -365,7 +365,6 @@ ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)
 ggml_add_backend(HIP)
-ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(MUSA)
 ggml_add_backend(RPC)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -5,7 +5,7 @@ function(ggml_add_cpu_backend_features cpu_name arch)
     # build, using set_source_files_properties() to set the arch flags is not possible
     set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
     add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
-    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
     set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -589,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (EMSCRIPTEN)
         set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
     endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+        # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
+        target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
+    endif()
 endfunction()
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -1193,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     }
 }

-static void ggml_compute_forward_mul_mat(
+void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {

@@ -1866,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D:
+            {
+                ggml_compute_forward_conv_2d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2168,6 +2172,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             case GGML_GLU_OP_REGLU:
             case GGML_GLU_OP_GEGLU:
             case GGML_GLU_OP_SWIGLU:
+            case GGML_GLU_OP_GEGLU_ERF:
+            case GGML_GLU_OP_GEGLU_QUICK:
                 {
                     n_tasks = n_threads;
                 } break;
@@ -2228,6 +2234,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D:
        case GGML_OP_CONV_2D_DW:
        case GGML_OP_CONV_TRANSPOSE_1D:
        case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2746,6 +2753,10 @@ struct ggml_cplan ggml_graph_plan(
                     GGML_ABORT("fatal error");
                 }
             } break;
+        case GGML_OP_CONV_2D:
+            {
+                cur = GGML_IM2COL_WORK_SIZE;
+            } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
                 const int64_t ne00 = node->src[0]->ne[0]; // W