@fugood/llama.node 1.4.3 → 1.4.4

package/lib/binding.ts CHANGED
@@ -565,6 +565,14 @@ export interface LlamaContext {
    */
   cancelRequest(requestId: number): void
 
+  /**
+   * Clear the KV and recurrent caches.
+   * This is faster than recreating the context and useful for preventing
+   * cache contamination between chat sessions.
+   * @param clearData If true, also clears the cache data (default: false)
+   */
+  clearCache(clearData?: boolean): void
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
package/lib/index.js CHANGED
@@ -195,6 +195,15 @@ class LlamaContextWrapper {
   decodeAudioTokens(tokens) {
     return this.ctx.decodeAudioTokens(tokens);
   }
+  /**
+   * Clear the KV and recurrent caches.
+   * This is faster than recreating the context and useful for preventing
+   * cache contamination between chat sessions.
+   * @param clearData If true, also clears the cache data (default: false)
+   */
+  clearCache(clearData) {
+    this.ctx.clearCache(clearData);
+  }
 }
 const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
package/lib/index.ts CHANGED
@@ -299,6 +299,16 @@ class LlamaContextWrapper {
   decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
     return this.ctx.decodeAudioTokens(tokens)
   }
+
+  /**
+   * Clear the KV and recurrent caches.
+   * This is faster than recreating the context and useful for preventing
+   * cache contamination between chat sessions.
+   * @param clearData If true, also clears the cache data (default: false)
+   */
+  clearCache(clearData?: boolean): void {
+    this.ctx.clearCache(clearData)
+  }
 }
 
 export const loadModel = async (
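
A minimal sketch of how the new clearCache method might be used to reset state between unrelated chat sessions. loadModel is part of this package, while the completion call, its options, and the model path are assumptions for illustration rather than part of this diff:

    import { loadModel } from '@fugood/llama.node'

    const ctx = await loadModel({ model: './model.gguf' }) // hypothetical options

    // First chat session (completion API assumed from the existing wrapper)
    await ctx.completion({ messages: [{ role: 'user', content: 'Hello!' }] })

    // Drop KV / recurrent cache state without recreating the context;
    // pass true to also clear the cache data (default: false).
    ctx.clearCache()

    // A second, unrelated session now starts from a clean cache
    await ctx.completion({ messages: [{ role: 'user', content: 'New topic.' }] })
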
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.3",
+  "version": "1.4.4",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.3",
-    "@fugood/node-llama-darwin-x64": "1.4.3",
-    "@fugood/node-llama-linux-arm64": "1.4.3",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.3",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.3",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.3",
-    "@fugood/node-llama-linux-x64": "1.4.3",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.3",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.3",
-    "@fugood/node-llama-win32-arm64": "1.4.3",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.3",
-    "@fugood/node-llama-win32-x64": "1.4.3",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.3",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.3"
+    "@fugood/node-llama-darwin-arm64": "1.4.4",
+    "@fugood/node-llama-darwin-x64": "1.4.4",
+    "@fugood/node-llama-linux-arm64": "1.4.4",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.4",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.4",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.4",
+    "@fugood/node-llama-linux-x64": "1.4.4",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.4",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.4",
+    "@fugood/node-llama-win32-arm64": "1.4.4",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.4",
+    "@fugood/node-llama-win32-x64": "1.4.4",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.4",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.4"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -200,6 +200,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::CancelRequest>(
           "cancelRequest",
+          static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::ClearCache>(
+          "clearCache",
           static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);
@@ -1505,3 +1508,24 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
   worker->Queue();
   return worker->Promise();
 }
+
+// clearCache(clearData?: boolean): void
+void LlamaContext::ClearCache(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (!_rn_ctx) {
+    Napi::TypeError::New(env, "Context is disposed").ThrowAsJavaScriptException();
+    return;
+  }
+  if (_rn_ctx->completion != nullptr && _rn_ctx->completion->is_predicting) {
+    Napi::TypeError::New(env, "Cannot clear cache while completion is in progress")
+        .ThrowAsJavaScriptException();
+    return;
+  }
+
+  bool clear_data = false;
+  if (info.Length() >= 1 && info[0].IsBoolean()) {
+    clear_data = info[0].ToBoolean().Value();
+  }
+
+  _rn_ctx->clearCache(clear_data);
+}
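
Because the native ClearCache above throws while a completion is still predicting (or after the context is disposed), a caller would let any in-flight completion settle before clearing. A hypothetical sketch of handling that error on the JS side:

    try {
      ctx.clearCache(true) // true also clears the cache data
    } catch (err) {
      // Thrown when a completion is still in progress or the context is
      // disposed, mirroring the TypeError raised by the native checks above.
      console.warn('clearCache rejected:', err)
    }
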
@@ -69,6 +69,9 @@ private:
   Napi::Value QueueRerank(const Napi::CallbackInfo &info);
   void CancelRequest(const Napi::CallbackInfo &info);
 
+  // Cache management
+  void ClearCache(const Napi::CallbackInfo &info);
+
   std::string _info;
   std::vector<std::string> _used_devices;
   Napi::Object _meta;
@@ -427,7 +427,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // model is required (except for server)
     // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
         throw std::invalid_argument("error: --model is required\n");
     }
 
@@ -708,6 +708,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.use_jinja = true;
     }
 
+    params.use_color = tty_can_use_colors();
+
     // load dynamic backends
     ggml_backend_load_all();
 
@@ -790,10 +792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
-        {"-co", "--color"},
-        string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
-        [](common_params & params) {
-            params.use_color = true;
+        {"-co", "--color"}, "[on|off|auto]",
+        "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
+        "'auto' enables colors when output is to a terminal",
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.use_color = true;
+            } else if (is_falsey(value)) {
+                params.use_color = false;
+            } else if (is_autoy(value)) {
+                params.use_color = tty_can_use_colors();
+            } else {
+                throw std::invalid_argument(
+                    string_format("error: unknown value for --color: '%s'\n", value.c_str()));
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
@@ -1022,7 +1034,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
             } else {
                 throw std::runtime_error(
-                    string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
             }
         }).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
@@ -2696,7 +2708,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
             } else {
                 throw std::invalid_argument(
-                    string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
             }
         }
     ).set_env("LLAMA_LOG_COLORS"));
@@ -786,11 +786,29 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
 #include <iostream>
 
 
+#ifdef _WIN32
+static std::wstring utf8_to_wstring(const std::string & str) {
+    if (str.empty()) {
+        return std::wstring();
+    }
+
+    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
+
+    if (size <= 0) {
+        return std::wstring();
+    }
+
+    std::wstring wstr(size, 0);
+    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
+
+    return wstr;
+}
+#endif
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    std::wstring wpath = converter.from_bytes(path);
+    std::wstring wpath = utf8_to_wstring(path);
 
     // if the path already exists, check whether it's a directory
     const DWORD attributes = GetFileAttributesW(wpath.c_str());
@@ -964,6 +982,32 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
     return files;
 }
 
+//
+// TTY utils
+//
+
+bool tty_can_use_colors() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
 
 //
 // Model utils
@@ -656,6 +656,13 @@ struct common_file_info {
 };
 std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
 
+//
+// TTY utils
+//
+
+// Auto-detect if colors can be enabled based on terminal and environment
+bool tty_can_use_colors();
+
 //
 // Model utils
 //
@@ -1,3 +1,4 @@
+#include "common.h"
 #include "log.h"
 
 #include <chrono>
@@ -26,30 +27,6 @@ void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }
 
-// Auto-detect if colors should be enabled based on terminal and environment
-static bool common_log_should_use_colors_auto() {
-    // Check NO_COLOR environment variable (https://no-color.org/)
-    if (const char * no_color = std::getenv("NO_COLOR")) {
-        if (no_color[0] != '\0') {
-            return false;
-        }
-    }
-
-    // Check TERM environment variable
-    if (const char * term = std::getenv("TERM")) {
-        if (std::strcmp(term, "dumb") == 0) {
-            return false;
-        }
-    }
-
-    // Check if stdout and stderr are connected to a terminal
-    // We check both because log messages can go to either
-    bool stdout_is_tty = isatty(fileno(stdout));
-    bool stderr_is_tty = isatty(fileno(stderr));
-
-    return stdout_is_tty || stderr_is_tty;
-}
-
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -391,7 +368,7 @@ struct common_log * common_log_main() {
     static std::once_flag init_flag;
     std::call_once(init_flag, [&]() {
         // Set default to auto-detect colors
-        log.set_colors(common_log_should_use_colors_auto());
+        log.set_colors(tty_can_use_colors());
     });
 
     return &log;
@@ -422,7 +399,7 @@ void common_log_set_file(struct common_log * log, const char * file) {
 
 void common_log_set_colors(struct common_log * log, log_colors colors) {
     if (colors == LOG_COLORS_AUTO) {
-        log->set_colors(common_log_should_use_colors_auto());
+        log->set_colors(tty_can_use_colors());
         return;
     }
 
@@ -253,6 +253,9 @@ option(GGML_HEXAGON "ggml: enable Hexagon backend"
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
 
+option(GGML_ZENDNN "ggml: use ZenDNN" OFF)
+option(ZENDNN_ROOT "ggml: path to ZenDNN installation" "")
+
 # extra artifacts
 option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
 option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
@@ -314,6 +317,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-sycl.h
     include/ggml-vulkan.h
     include/ggml-webgpu.h
+    include/ggml-zendnn.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -1,6 +1,5 @@
 #pragma once
 
-#include "ggml.h"
 #include "ggml-backend.h"
 
 #ifdef __cplusplus
@@ -8,7 +7,7 @@ extern "C" {
 #endif
 
 #define RPC_PROTO_MAJOR_VERSION 3
-#define RPC_PROTO_MINOR_VERSION 5
+#define RPC_PROTO_MINOR_VERSION 6
 #define RPC_PROTO_PATCH_VERSION 0
 #define GGML_RPC_MAX_SERVERS 16
 
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_zendnn_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_zendnn(ggml_backend_t backend);
+
+// number of threads used for zendnn operations
+GGML_BACKEND_API void ggml_backend_zendnn_set_n_threads(ggml_backend_t backend_zendnn, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zendnn_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
@@ -2196,6 +2196,15 @@ extern "C" {
             int p2,
             int p3);
 
+    // pad each dimension with values on the other side of the torus (looping around)
+    GGML_API struct ggml_tensor * ggml_pad_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int p0,
+            int p1,
+            int p2,
+            int p3);
+
     GGML_API struct ggml_tensor * ggml_pad_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -2209,6 +2218,19 @@ extern "C" {
             int rp3
             );
 
+    // pad each dimension with values on the other side of the torus (looping around)
+    GGML_API struct ggml_tensor * ggml_pad_ext_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int lp0,
+            int rp0,
+            int lp1,
+            int rp1,
+            int lp2,
+            int rp2,
+            int lp3,
+            int rp3);
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
@@ -440,6 +440,7 @@ ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 ggml_add_backend(Hexagon)
+ggml_add_backend(ZenDNN)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
@@ -505,7 +505,6 @@ void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     constexpr int blocklen = 8;
 
     assert(n % qk == 0);
-    assert(nr % 4 == 0);
     assert(nc % ncols_interleaved == 0);
 
     UNUSED(nb);
@@ -645,7 +644,6 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
     constexpr int blocklen = 8;
 
     assert(n % qk == 0);
-    assert(nr % 4 == 0);
     assert(nc % ncols_interleaved == 0);
 
     UNUSED(nb);