@fugood/llama.node 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/binding.js CHANGED
@@ -51,7 +51,7 @@ const getPlatformPackageName = (variant) => {
  };
  const loadPlatformPackage = (packageName) => __awaiter(void 0, void 0, void 0, function* () {
  try {
- return yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s)));
+ return (yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s))));
  }
  catch (error) {
  return null;
package/lib/binding.ts CHANGED
@@ -1,9 +1,9 @@
  export type MessagePart = {
- type: string,
- text?: string,
+ type: string
+ text?: string
  image_url?: {
  url?: string
- },
+ }
  input_audio?: {
  format: string
  data?: string
@@ -70,6 +70,12 @@ export type LlamaModelOptions = {
  * Number of layers to keep MoE weights on CPU
  */
  n_cpu_moe?: number
+ /**
+ * List of device names to use for offloading
+ * Device names can be obtained from getBackendDevicesInfo()
+ * Example: ['Metal', 'BLAS', 'CPU']
+ */
+ devices?: string[]
  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean
@@ -375,9 +381,13 @@ export type ToolCall = {
  }

  export interface LlamaContext {
- new (options: LlamaModelOptions, onProgress?: (progress: number) => void): LlamaContext
+ new (
+ options: LlamaModelOptions,
+ onProgress?: (progress: number) => void,
+ ): LlamaContext
  getSystemInfo(): string
  getModelInfo(): ModelInfo
+ getUsedDevices(): string[]
  getFormattedChat(
  messages: ChatMessage[],
  chat_template?: string,
@@ -400,8 +410,15 @@ export interface LlamaContext {
  stopCompletion(): void
  tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
  detokenize(tokens: number[]): Promise<string>
- embedding(text: string, params?: { embd_normalize?: number }): Promise<EmbeddingResult>
- rerank(query: string, documents: string[], params?: RerankParams): Promise<RerankResult[]>
+ embedding(
+ text: string,
+ params?: { embd_normalize?: number },
+ ): Promise<EmbeddingResult>
+ rerank(
+ query: string,
+ documents: string[],
+ params?: RerankParams,
+ ): Promise<RerankResult[]>
  saveSession(path: string): Promise<void>
  loadSession(path: string): Promise<void>
  release(): Promise<void>
@@ -440,7 +457,7 @@ export interface LlamaContext {
  * @param options Object containing path and optional n_batch
  * @returns boolean indicating if loading was successful
  */
- initVocoder(options: { path: string, n_batch?: number }): boolean
+ initVocoder(options: { path: string; n_batch?: number }): boolean

  /**
  * Unload the vocoder model
@@ -459,7 +476,10 @@ export interface LlamaContext {
  * @param text Text to complete
  * @returns Formatted audio completion
  */
- getFormattedAudioCompletion(speaker: string|null, text: string): {
+ getFormattedAudioCompletion(
+ speaker: string | null,
+ text: string,
+ ): {
  prompt: string
  grammar?: string
  }
@@ -476,7 +496,7 @@ export interface LlamaContext {
  * @param tokens Tokens to decode
  * @returns Promise resolving to decoded audio tokens
  */
- decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>
+ decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array>

  // Parallel decoding methods

@@ -485,7 +505,7 @@ export interface LlamaContext {
  * @param params Configuration for parallel mode
  * @returns boolean indicating if successful
  */
- enableParallelMode(params: { n_parallel?: number, n_batch?: number }): boolean
+ enableParallelMode(params: { n_parallel?: number; n_batch?: number }): boolean

  /**
  * Disable parallel decoding mode
@@ -563,9 +583,11 @@ const getPlatformPackageName = (variant?: LibVariant): string => {
  return `@fugood/node-llama-${platform}-${arch}${variantSuffix}`
  }

- const loadPlatformPackage = async (packageName: string): Promise<Module | null> => {
+ const loadPlatformPackage = async (
+ packageName: string,
+ ): Promise<Module | null> => {
  try {
- return await import(packageName) as Module
+ return (await import(packageName)) as Module
  } catch (error) {
  return null
  }
@@ -579,7 +601,9 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {

  module = await loadPlatformPackage(getPlatformPackageName())
  if (module) {
- console.warn(`Not found package for variant "${variant}", fallback to default`)
+ console.warn(
+ `Not found package for variant "${variant}", fallback to default`,
+ )
  return module
  }

@@ -588,7 +612,9 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
  return (await import('../build/Release/index.node')) as Module
  }

- export const isLibVariantAvailable = async (variant?: LibVariant): Promise<boolean> => {
+ export const isLibVariantAvailable = async (
+ variant?: LibVariant,
+ ): Promise<boolean> => {
  if (variant && variant !== 'default') {
  const module = await loadPlatformPackage(getPlatformPackageName(variant))
  return module != null
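The typings above add two device-related pieces: a `devices` option on `LlamaModelOptions` and a `getUsedDevices()` method on the context. A minimal sketch of combining them is shown below; it is not taken from this diff, and the `loadModel` entry point plus the `model`/`n_gpu_layers` option names are assumptions based on the package's existing API.

```ts
import { loadModel, getBackendDevicesInfo } from '@fugood/llama.node'

const main = async () => {
  // Enumerate the backend devices the native module knows about.
  const available = await getBackendDevicesInfo()
  console.log('available devices:', available)

  // Restrict offloading to specific devices by name (names follow the
  // doc comment's example: 'Metal', 'BLAS', 'CPU').
  const ctx = await loadModel({
    model: 'path/to/model.gguf', // hypothetical path
    n_gpu_layers: 99,
    devices: ['Metal', 'CPU'],
  })

  // Report which devices the loaded model actually uses.
  console.log('used devices:', ctx.getUsedDevices())

  await ctx.release()
}

main()
```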
package/lib/index.js CHANGED
@@ -76,6 +76,9 @@ class LlamaContextWrapper {
  getModelInfo() {
  return this.ctx.getModelInfo();
  }
+ getUsedDevices() {
+ return this.ctx.getUsedDevices();
+ }
  isJinjaSupported() {
  const { minja } = this.ctx.getModelInfo().chatTemplates;
  return !!(minja === null || minja === void 0 ? void 0 : minja.toolUse) || !!(minja === null || minja === void 0 ? void 0 : minja.default);
@@ -85,7 +88,7 @@ class LlamaContextWrapper {
  }
  getFormattedChat(messages, template, params) {
  var _a;
- const { messages: chat, has_media, media_paths, } = (0, utils_1.formatMediaChat)(messages);
+ const { messages: chat, has_media, media_paths } = (0, utils_1.formatMediaChat)(messages);
  const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
  let tmpl;
  if (template)
package/lib/index.ts CHANGED
@@ -94,6 +94,10 @@ class LlamaContextWrapper {
  return this.ctx.getModelInfo()
  }

+ getUsedDevices(): string[] {
+ return this.ctx.getUsedDevices()
+ }
+
  isJinjaSupported(): boolean {
  const { minja } = this.ctx.getModelInfo().chatTemplates
  return !!minja?.toolUse || !!minja?.default
@@ -118,11 +122,7 @@ class LlamaContextWrapper {
  chat_template_kwargs?: Record<string, string>
  },
  ): FormattedChatResult {
- const {
- messages: chat,
- has_media,
- media_paths,
- } = formatMediaChat(messages)
+ const { messages: chat, has_media, media_paths } = formatMediaChat(messages)

  const useJinja = this.isJinjaSupported() && params?.jinja
  let tmpl
@@ -169,8 +169,9 @@ class LlamaContextWrapper {
  options: LlamaCompletionOptions,
  callback?: (token: LlamaCompletionToken) => void,
  ): Promise<LlamaCompletionResult> {
- const { messages, media_paths = options.media_paths } =
- formatMediaChat(options.messages)
+ const { messages, media_paths = options.media_paths } = formatMediaChat(
+ options.messages,
+ )
  return this.ctx.completion(
  {
  ...options,
@@ -196,7 +197,10 @@ class LlamaContextWrapper {
  return this.ctx.detokenize(tokens)
  }

- embedding(text: string, params?: { embd_normalize?: number }): Promise<EmbeddingResult> {
+ embedding(
+ text: string,
+ params?: { embd_normalize?: number },
+ ): Promise<EmbeddingResult> {
  return this.ctx.embedding(text, params)
  }

@@ -329,7 +333,7 @@ export const loadLlamaModelInfo = async (
  }

  export const getBackendDevicesInfo = async (
- variant: LibVariant = 'default'
+ variant: LibVariant = 'default',
  ): Promise<import('./binding').BackendDeviceInfo[]> => {
  mods[variant] ??= await loadModule(variant)
  refreshNativeLogSetup()
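`getBackendDevicesInfo` remains the per-variant probe for device names that can be fed into the new `devices` option. The sketch below shows one way to derive such a list; the `name` field on `BackendDeviceInfo` and the `'vulkan'`/`'cuda'` variant values (matching the prebuilt package suffixes) are assumptions not confirmed by this diff.

```ts
import { getBackendDevicesInfo } from '@fugood/llama.node'

// Build a `devices` list for LlamaModelOptions, preferring anything that is
// not the plain CPU backend and falling back to CPU otherwise.
const pickDevices = async (
  variant: 'default' | 'vulkan' | 'cuda' = 'default', // assumed LibVariant values
): Promise<string[]> => {
  const infos = await getBackendDevicesInfo(variant)
  const names = infos
    .map((d) => (d as { name?: string }).name) // `name` field assumed
    .filter((n): n is string => typeof n === 'string')
  const accelerated = names.filter((n) => n !== 'CPU')
  return accelerated.length > 0 ? accelerated : ['CPU']
}
```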
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.3.4",
+ "version": "1.3.5",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.3.4",
- "@fugood/node-llama-linux-x64-vulkan": "1.3.4",
- "@fugood/node-llama-linux-x64-cuda": "1.3.4",
- "@fugood/node-llama-linux-arm64": "1.3.4",
- "@fugood/node-llama-linux-arm64-vulkan": "1.3.4",
- "@fugood/node-llama-linux-arm64-cuda": "1.3.4",
- "@fugood/node-llama-win32-x64": "1.3.4",
- "@fugood/node-llama-win32-x64-vulkan": "1.3.4",
- "@fugood/node-llama-win32-x64-cuda": "1.3.4",
- "@fugood/node-llama-win32-arm64": "1.3.4",
- "@fugood/node-llama-win32-arm64-vulkan": "1.3.4",
- "@fugood/node-llama-darwin-x64": "1.3.4",
- "@fugood/node-llama-darwin-arm64": "1.3.4"
+ "@fugood/node-llama-linux-x64": "1.3.5",
+ "@fugood/node-llama-linux-x64-vulkan": "1.3.5",
+ "@fugood/node-llama-linux-x64-cuda": "1.3.5",
+ "@fugood/node-llama-linux-arm64": "1.3.5",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.3.5",
+ "@fugood/node-llama-linux-arm64-cuda": "1.3.5",
+ "@fugood/node-llama-win32-x64": "1.3.5",
+ "@fugood/node-llama-win32-x64-vulkan": "1.3.5",
+ "@fugood/node-llama-win32-x64-cuda": "1.3.5",
+ "@fugood/node-llama-win32-arm64": "1.3.5",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.3.5",
+ "@fugood/node-llama-darwin-x64": "1.3.5",
+ "@fugood/node-llama-darwin-arm64": "1.3.5"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -1,8 +1,8 @@
  diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
- index 706fa32ee..248459903 100644
+ index bb168e835..cfc0e2c2e 100644
  --- a/src/llama.cpp/common/CMakeLists.txt
  +++ b/src/llama.cpp/common/CMakeLists.txt
- @@ -141,9 +141,16 @@ if (LLAMA_LLGUIDANCE)
+ @@ -143,9 +143,16 @@ if (LLAMA_LLGUIDANCE)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()

@@ -21,7 +21,7 @@ index 706fa32ee..248459903 100644

  #
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 938872e82..6364f173f 100644
+ index 6fa05a604..87dfa7a8b 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -51,7 +51,7 @@ index 938872e82..6364f173f 100644
  struct templates_params {
  json messages;
  json tools;
- @@ -811,7 +798,7 @@ static std::string apply(
+ @@ -817,7 +804,7 @@ static std::string apply(
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -61,7 +61,7 @@ index 938872e82..6364f173f 100644
  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
- index 50efb0d4e..f471a84c7 100644
+ index 754c411e2..71241a6cc 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
  @@ -9,7 +9,18 @@
@@ -85,10 +85,10 @@ index 50efb0d4e..f471a84c7 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 4dc95dcba..ea0ea86c0 100644
+ index f3cc55247..65398844f 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1155,6 +1155,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1162,6 +1162,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -97,10 +97,10 @@ index 4dc95dcba..ea0ea86c0 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index f42c083fa..c573cc812 100644
+ index de5b404dd..d30d252c9 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -274,6 +274,7 @@ struct lr_opt {
+ @@ -281,6 +281,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
@@ -109,7 +109,7 @@ index f42c083fa..c573cc812 100644
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- index e52e050a8..c1000c162 100644
+ index d0cab0bcb..48d532838 100644
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -105,6 +105,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  InstanceMethod<&LlamaContext::GetModelInfo>(
  "getModelInfo",
  static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::GetUsedDevices>(
+ "getUsedDevices",
+ static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::GetFormattedChat>(
  "getFormattedChat",
  static_cast<napi_property_attributes>(napi_enumerable)),
@@ -306,6 +309,19 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  llama_backend_init();
  llama_numa_init(params.numa);

+ // Parse devices array
+ if (options.Has("devices") && options.Get("devices").IsArray()) {
+ auto devices_array = options.Get("devices").As<Napi::Array>();
+ for (size_t i = 0; i < devices_array.Length(); i++) {
+ auto device_name = devices_array.Get(i).ToString().Utf8Value();
+ auto * dev = ggml_backend_dev_by_name(device_name.c_str());
+ if (dev) {
+ params.devices.push_back(dev);
+ }
+ // Skip invalid device names silently
+ }
+ }
+
  std::vector<common_adapter_lora_info> lora;
  auto lora_path = get_option<std::string>(options, "lora", "");
  auto lora_scaled = get_option<float>(options, "lora_scaled", 1.0f);
@@ -378,6 +394,17 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  }
  _rn_ctx->attachThreadpoolsIfAvailable();

+ // Collect used devices from the loaded model
+ if (_rn_ctx->llama_init.model) {
+ const auto &model_devices = _rn_ctx->llama_init.model->devices;
+ for (auto dev : model_devices) {
+ const char *dev_name = ggml_backend_dev_name(dev);
+ if (dev_name != nullptr) {
+ _used_devices.push_back(std::string(dev_name));
+ }
+ }
+ }
+
  // Release progress callback after model is loaded
  if (has_progress_callback) {
  _progress_tsfn.Release();
@@ -583,6 +610,15 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
  return details;
  }

+ // getUsedDevices(): string[]
+ Napi::Value LlamaContext::GetUsedDevices(const Napi::CallbackInfo &info) {
+ Napi::Env env = info.Env();
+ Napi::Array devices = Napi::Array::New(env, _used_devices.size());
+ for (size_t i = 0; i < _used_devices.size(); i++) {
+ devices[i] = Napi::String::New(env, _used_devices[i]);
+ }
+ return devices;
+ }


  // getFormattedChat(
@@ -31,6 +31,7 @@ public:
  private:
  Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
  Napi::Value GetModelInfo(const Napi::CallbackInfo &info);
+ Napi::Value GetUsedDevices(const Napi::CallbackInfo &info);
  Napi::Value GetFormattedChat(const Napi::CallbackInfo &info);
  Napi::Value Completion(const Napi::CallbackInfo &info);
  void StopCompletion(const Napi::CallbackInfo &info);
@@ -69,6 +70,7 @@ private:
  void CancelRequest(const Napi::CallbackInfo &info);

  std::string _info;
+ std::vector<std::string> _used_devices;
  Napi::Object _meta;
  LlamaCompletionWorker *_wip = nullptr;

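On the native side, device names that `ggml_backend_dev_by_name` does not recognize are skipped silently, and `getUsedDevices()` reports only the devices the loaded model actually ended up on. A caller that cares can compare the request against the result, as in this sketch; it reuses the `loadModel` entry point and model path assumed in the earlier example.

```ts
import { loadModel } from '@fugood/llama.node'

const checkDevices = async () => {
  const requested = ['Metal'] // device name taken from the option's doc comment
  const ctx = await loadModel({
    model: 'path/to/model.gguf', // hypothetical path
    devices: requested,
  })

  // Unrecognized names are dropped without error in the native constructor,
  // so verify which offload devices are actually in use after loading.
  const used = ctx.getUsedDevices()
  const ignored = requested.filter((name) => !used.includes(name))
  if (ignored.length > 0) {
    console.warn(`requested devices not in use: ${ignored.join(', ')}`)
  }
  await ctx.release()
}

checkDevices()
```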
@@ -50,6 +50,8 @@ add_library(${TARGET} STATIC
  base64.hpp
  chat-parser.cpp
  chat-parser.h
+ chat-parser-xml-toolcall.h
+ chat-parser-xml-toolcall.cpp
  chat.cpp
  chat.h
  common.cpp