@fugood/llama.node 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/CMakeLists.txt +21 -1
  2. package/lib/binding.js +1 -1
  3. package/lib/binding.ts +47 -15
  4. package/lib/index.js +26 -2
  5. package/lib/index.ts +42 -10
  6. package/package.json +15 -14
  7. package/scripts/llama.cpp.patch +31 -10
  8. package/src/LlamaContext.cpp +46 -0
  9. package/src/LlamaContext.h +2 -0
  10. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  11. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  12. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  13. package/src/llama.cpp/common/chat-parser.h +10 -0
  14. package/src/llama.cpp/common/chat.cpp +461 -87
  15. package/src/llama.cpp/common/chat.h +6 -0
  16. package/src/llama.cpp/common/common.cpp +8 -1
  17. package/src/llama.cpp/common/common.h +12 -5
  18. package/src/llama.cpp/common/json-partial.cpp +19 -2
  19. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -0
  20. package/src/llama.cpp/common/json-schema-to-grammar.h +2 -0
  21. package/src/llama.cpp/common/sampling.cpp +60 -6
  22. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -38
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
  24. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +15 -5
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -3
  26. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +16 -14
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -48
  28. package/src/llama.cpp/src/llama-grammar.cpp +17 -9
  29. package/src/llama.cpp/src/llama-impl.cpp +3 -3
  30. package/src/llama.cpp/src/llama-sampling.cpp +3 -6
  31. package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/CMakeLists.txt CHANGED
@@ -44,7 +44,8 @@ else()
   endif()
 
   if (TO_PACKAGE)
-    set(PLATFORM_BINARY_DIR ${CMAKE_SOURCE_DIR}/packages/node-llama-${PLATFORM}-${ARCH}${VARIANT})
+    set(PACKAGE_NAME "node-llama-${PLATFORM}-${ARCH}${VARIANT}")
+    set(PLATFORM_BINARY_DIR ${CMAKE_SOURCE_DIR}/packages/${PACKAGE_NAME})
   else()
     set(PLATFORM_BINARY_DIR ${CMAKE_SOURCE_DIR}/build/Release)
   endif()
@@ -188,6 +189,13 @@ if (NOT MSVC AND CMAKE_SYSTEM_NAME STREQUAL "Windows")
   set(CMAKE_JS_LIB win_dynamic_load)
 endif()
 
+if (TO_PACKAGE AND GGML_HEXAGON)
+  set(NODE_RPATH "node_modules/@fugood/${PACKAGE_NAME}")
+  set(ELECTRON_ASAR_RPATH "resources/app.asar.unpacked/node_modules/@fugood/${PACKAGE_NAME}")
+  set(ELECTRON_RES_RPATH "resources/node_modules/@fugood/${PACKAGE_NAME}")
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath,${NODE_RPATH} -Wl,-rpath,${ELECTRON_ASAR_RPATH} -Wl,-rpath,${ELECTRON_RES_RPATH}")
+endif()
+
 add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
 set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
 target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB} llama ggml common mtmd ${CMAKE_THREAD_LIBS_INIT})
@@ -237,3 +245,15 @@ if (GGML_CLBLAST AND TO_PACKAGE)
   )
 endif()
 endif()
+
+if (GGML_HEXAGON)
+  get_target_property(HTP_LIBS_DIR ggml-hexagon BINARY_DIR)
+  add_custom_command(
+    TARGET copy_assets
+    COMMAND ${CMAKE_COMMAND} -E copy ${HTP_LIBS_DIR}/libggml-htp-v73.so ${PLATFORM_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy ${HTP_LIBS_DIR}/libggml-htp-v75.so ${PLATFORM_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy ${HTP_LIBS_DIR}/libggml-htp-v79.so ${PLATFORM_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy ${HTP_LIBS_DIR}/libggml-htp-v81.so ${PLATFORM_BINARY_DIR}
+    COMMENT "Copying HTP libraries to bin folder"
+  )
+endif()
package/lib/binding.js CHANGED
@@ -51,7 +51,7 @@ const getPlatformPackageName = (variant) => {
 };
 const loadPlatformPackage = (packageName) => __awaiter(void 0, void 0, void 0, function* () {
     try {
-        return yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s)));
+        return (yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s))));
     }
     catch (error) {
         return null;
package/lib/binding.ts CHANGED
@@ -1,9 +1,9 @@
 export type MessagePart = {
-  type: string,
-  text?: string,
+  type: string
+  text?: string
   image_url?: {
     url?: string
-  },
+  }
   input_audio?: {
     format: string
     data?: string
@@ -25,6 +25,12 @@ export type LlamaModelOptions = {
   n_ctx?: number
   n_batch?: number
   n_ubatch?: number
+  /**
+   * CPU affinity mask
+   * Example: '0xfc'
+   */
+  cpu_mask?: string
+  cpu_strict?: boolean
   /**
    * Number of parallel sequences to support (sets n_seq_max).
    * This determines the maximum number of parallel slots that can be used.
@@ -70,6 +76,12 @@ export type LlamaModelOptions = {
    * Number of layers to keep MoE weights on CPU
    */
   n_cpu_moe?: number
+  /**
+   * List of device names to use for offloading
+   * Device names can be obtained from getBackendDevicesInfo()
+   * Example: ['Metal', 'BLAS', 'CPU']
+   */
+  devices?: string[]
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean
@@ -375,9 +387,13 @@ export type ToolCall = {
 }
 
 export interface LlamaContext {
-  new (options: LlamaModelOptions, onProgress?: (progress: number) => void): LlamaContext
+  new (
+    options: LlamaModelOptions,
+    onProgress?: (progress: number) => void,
+  ): LlamaContext
   getSystemInfo(): string
   getModelInfo(): ModelInfo
+  getUsedDevices(): string[]
   getFormattedChat(
     messages: ChatMessage[],
     chat_template?: string,
@@ -400,8 +416,15 @@ export interface LlamaContext {
   stopCompletion(): void
   tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
   detokenize(tokens: number[]): Promise<string>
-  embedding(text: string, params?: { embd_normalize?: number }): Promise<EmbeddingResult>
-  rerank(query: string, documents: string[], params?: RerankParams): Promise<RerankResult[]>
+  embedding(
+    text: string,
+    params?: { embd_normalize?: number },
+  ): Promise<EmbeddingResult>
+  rerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+  ): Promise<RerankResult[]>
   saveSession(path: string): Promise<void>
   loadSession(path: string): Promise<void>
   release(): Promise<void>
@@ -440,7 +463,7 @@ export interface LlamaContext {
    * @param options Object containing path and optional n_batch
    * @returns boolean indicating if loading was successful
    */
-  initVocoder(options: { path: string, n_batch?: number }): boolean
+  initVocoder(options: { path: string; n_batch?: number }): boolean
 
   /**
    * Unload the vocoder model
@@ -459,7 +482,10 @@ export interface LlamaContext {
    * @param text Text to complete
    * @returns Formatted audio completion
    */
-  getFormattedAudioCompletion(speaker: string|null, text: string): {
+  getFormattedAudioCompletion(
+    speaker: string | null,
+    text: string,
+  ): {
     prompt: string
     grammar?: string
   }
@@ -476,7 +502,7 @@ export interface LlamaContext {
    * @param tokens Tokens to decode
    * @returns Promise resolving to decoded audio tokens
    */
-  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>
+  decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array>
 
   // Parallel decoding methods
 
@@ -485,7 +511,7 @@ export interface LlamaContext {
    * @param params Configuration for parallel mode
    * @returns boolean indicating if successful
    */
-  enableParallelMode(params: { n_parallel?: number, n_batch?: number }): boolean
+  enableParallelMode(params: { n_parallel?: number; n_batch?: number }): boolean
 
   /**
    * Disable parallel decoding mode
@@ -554,7 +580,7 @@ export interface Module {
   LlamaContext: LlamaContext
 }
 
-export type LibVariant = 'default' | 'vulkan' | 'cuda'
+export type LibVariant = 'default' | 'vulkan' | 'cuda' | 'snapdragon'
 
 const getPlatformPackageName = (variant?: LibVariant): string => {
   const platform = process.platform
@@ -563,9 +589,11 @@ const getPlatformPackageName = (variant?: LibVariant): string => {
   return `@fugood/node-llama-${platform}-${arch}${variantSuffix}`
 }
 
-const loadPlatformPackage = async (packageName: string): Promise<Module | null> => {
+const loadPlatformPackage = async (
+  packageName: string,
+): Promise<Module | null> => {
   try {
-    return await import(packageName) as Module
+    return (await import(packageName)) as Module
   } catch (error) {
     return null
   }
@@ -579,7 +607,9 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
 
   module = await loadPlatformPackage(getPlatformPackageName())
   if (module) {
-    console.warn(`Not found package for variant "${variant}", fallback to default`)
+    console.warn(
+      `Not found package for variant "${variant}", fallback to default`,
+    )
     return module
   }
 
@@ -588,7 +618,9 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
   return (await import('../build/Release/index.node')) as Module
 }
 
-export const isLibVariantAvailable = async (variant?: LibVariant): Promise<boolean> => {
+export const isLibVariantAvailable = async (
+  variant?: LibVariant,
+): Promise<boolean> => {
   if (variant && variant !== 'default') {
     const module = await loadPlatformPackage(getPlatformPackageName(variant))
     return module != null
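
Taken together, the binding.ts changes surface three new model options (cpu_mask, cpu_strict, devices) and a 'snapdragon' entry in LibVariant. A minimal sketch of how they could be passed through loadModel, assuming the existing `model` path field and top-level await; the path and mask value are placeholders, and the option names come from the updated typings above:

    import { loadModel } from '@fugood/llama.node'

    const ctx = await loadModel({
      model: './models/example.gguf', // placeholder path
      n_ctx: 4096,
      cpu_mask: '0xfc',               // CPU affinity mask, as in the typings example
      cpu_strict: true,               // request strict placement on the masked cores
      devices: ['Metal', 'CPU'],      // names as reported by getBackendDevicesInfo()
    })
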
package/lib/index.js CHANGED
@@ -76,6 +76,9 @@ class LlamaContextWrapper {
     getModelInfo() {
         return this.ctx.getModelInfo();
     }
+    getUsedDevices() {
+        return this.ctx.getUsedDevices();
+    }
     isJinjaSupported() {
         const { minja } = this.ctx.getModelInfo().chatTemplates;
         return !!(minja === null || minja === void 0 ? void 0 : minja.toolUse) || !!(minja === null || minja === void 0 ? void 0 : minja.default);
@@ -85,7 +88,7 @@ class LlamaContextWrapper {
     }
     getFormattedChat(messages, template, params) {
         var _a;
-        const { messages: chat, has_media, media_paths, } = (0, utils_1.formatMediaChat)(messages);
+        const { messages: chat, has_media, media_paths } = (0, utils_1.formatMediaChat)(messages);
         const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
         let tmpl;
         if (template)
@@ -198,7 +201,28 @@ const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, fun
     const variant = (_a = options.lib_variant) !== null && _a !== void 0 ? _a : 'default';
     (_b = mods[variant]) !== null && _b !== void 0 ? _b : (mods[variant] = yield (0, binding_1.loadModule)(options.lib_variant));
     refreshNativeLogSetup();
-    const nativeCtx = new mods[variant].LlamaContext(options, onProgress);
+    const { devices } = options;
+    let filteredDevs = [];
+    if (Array.isArray(devices)) {
+        filteredDevs = [...devices];
+        // Handle HTP* to use all HTP devices on Hexagon
+        if (variant === 'snapdragon' && devices.includes('HTP*')) {
+            const backendDevices = yield (0, exports.getBackendDevicesInfo)(variant);
+            const htpDevices = backendDevices
+                .filter((d) => d.deviceName.startsWith('HTP'))
+                .map((d) => d.deviceName);
+            filteredDevs = filteredDevs.reduce((acc, dev) => {
+                if (dev.startsWith('HTP*')) {
+                    acc.push(...htpDevices);
+                }
+                else if (!dev.startsWith('HTP')) {
+                    acc.push(dev);
+                }
+                return acc;
+            }, []);
+        }
+    }
+    const nativeCtx = new mods[variant].LlamaContext(Object.assign(Object.assign({}, options), { devices: filteredDevs.length > 0 ? filteredDevs : undefined }), onProgress);
     return new LlamaContextWrapper(nativeCtx);
 });
 exports.loadModel = loadModel;
package/lib/index.ts CHANGED
@@ -94,6 +94,10 @@ class LlamaContextWrapper {
     return this.ctx.getModelInfo()
   }
 
+  getUsedDevices(): string[] {
+    return this.ctx.getUsedDevices()
+  }
+
   isJinjaSupported(): boolean {
     const { minja } = this.ctx.getModelInfo().chatTemplates
     return !!minja?.toolUse || !!minja?.default
@@ -118,11 +122,7 @@ class LlamaContextWrapper {
       chat_template_kwargs?: Record<string, string>
     },
   ): FormattedChatResult {
-    const {
-      messages: chat,
-      has_media,
-      media_paths,
-    } = formatMediaChat(messages)
+    const { messages: chat, has_media, media_paths } = formatMediaChat(messages)
 
     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
@@ -169,8 +169,9 @@ class LlamaContextWrapper {
     options: LlamaCompletionOptions,
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult> {
-    const { messages, media_paths = options.media_paths } =
-      formatMediaChat(options.messages)
+    const { messages, media_paths = options.media_paths } = formatMediaChat(
+      options.messages,
+    )
     return this.ctx.completion(
       {
         ...options,
@@ -196,7 +197,10 @@ class LlamaContextWrapper {
     return this.ctx.detokenize(tokens)
   }
 
-  embedding(text: string, params?: { embd_normalize?: number }): Promise<EmbeddingResult> {
+  embedding(
+    text: string,
+    params?: { embd_normalize?: number },
+  ): Promise<EmbeddingResult> {
     return this.ctx.embedding(text, params)
   }
 
@@ -305,7 +309,35 @@ export const loadModel = async (
   mods[variant] ??= await loadModule(options.lib_variant)
   refreshNativeLogSetup()
 
-  const nativeCtx = new mods[variant].LlamaContext(options, onProgress)
+  const { devices } = options
+  let filteredDevs: Array<string> = []
+  if (Array.isArray(devices)) {
+    filteredDevs = [...devices]
+
+    // Handle HTP* to use all HTP devices on Hexagon
+    if (variant === 'snapdragon' && devices.includes('HTP*')) {
+      const backendDevices = await getBackendDevicesInfo(variant)
+      const htpDevices = backendDevices
+        .filter((d) => d.deviceName.startsWith('HTP'))
+        .map((d) => d.deviceName)
+      filteredDevs = filteredDevs.reduce((acc, dev) => {
+        if (dev.startsWith('HTP*')) {
+          acc.push(...htpDevices)
+        } else if (!dev.startsWith('HTP')) {
+          acc.push(dev)
+        }
+        return acc
+      }, [] as Array<string>)
+    }
+  }
+
+  const nativeCtx = new mods[variant].LlamaContext(
+    {
+      ...options,
+      devices: filteredDevs.length > 0 ? filteredDevs : undefined,
+    },
+    onProgress,
+  )
   return new LlamaContextWrapper(nativeCtx)
 }
 
@@ -329,7 +361,7 @@ export const loadLlamaModelInfo = async (
 }
 
 export const getBackendDevicesInfo = async (
-  variant: LibVariant = 'default'
+  variant: LibVariant = 'default',
 ): Promise<import('./binding').BackendDeviceInfo[]> => {
   mods[variant] ??= await loadModule(variant)
   refreshNativeLogSetup()
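
The new filtering block in loadModel (mirrored in the compiled lib/index.js above) expands an 'HTP*' wildcard into every HTP device reported by getBackendDevicesInfo() when the snapdragon variant is selected, and drops explicit HTP names that the wildcard already covers. A hedged usage sketch; the device names and model path are illustrative:

    import { loadModel, getBackendDevicesInfo } from '@fugood/llama.node'

    // List what the snapdragon build exposes, e.g. HTP devices plus CPU.
    const devices = await getBackendDevicesInfo('snapdragon')
    console.log(devices.map((d) => d.deviceName))

    // 'HTP*' is rewritten internally to every device name starting with 'HTP'.
    const ctx = await loadModel({
      model: './models/example.gguf', // placeholder path
      lib_variant: 'snapdragon',
      devices: ['HTP*'],
    })
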
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.4",
+  "version": "1.3.6",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.4",
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.4",
-    "@fugood/node-llama-linux-x64-cuda": "1.3.4",
-    "@fugood/node-llama-linux-arm64": "1.3.4",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.3.4",
-    "@fugood/node-llama-linux-arm64-cuda": "1.3.4",
-    "@fugood/node-llama-win32-x64": "1.3.4",
-    "@fugood/node-llama-win32-x64-vulkan": "1.3.4",
-    "@fugood/node-llama-win32-x64-cuda": "1.3.4",
-    "@fugood/node-llama-win32-arm64": "1.3.4",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.3.4",
-    "@fugood/node-llama-darwin-x64": "1.3.4",
-    "@fugood/node-llama-darwin-arm64": "1.3.4"
+    "@fugood/node-llama-linux-x64": "1.3.6",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.6",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.6",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.3.6",
+    "@fugood/node-llama-linux-arm64": "1.3.6",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.6",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.6",
+    "@fugood/node-llama-win32-x64": "1.3.6",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.6",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.6",
+    "@fugood/node-llama-win32-arm64": "1.3.6",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.6",
+    "@fugood/node-llama-darwin-x64": "1.3.6",
+    "@fugood/node-llama-darwin-arm64": "1.3.6"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
-index 706fa32ee..248459903 100644
+index bb168e835..cfc0e2c2e 100644
 --- a/src/llama.cpp/common/CMakeLists.txt
 +++ b/src/llama.cpp/common/CMakeLists.txt
-@@ -141,9 +141,16 @@ if (LLAMA_LLGUIDANCE)
+@@ -143,9 +143,16 @@ if (LLAMA_LLGUIDANCE)
      set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()
 
@@ -21,7 +21,7 @@ index 706fa32ee..248459903 100644
 
  #
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index 938872e82..6364f173f 100644
+index 6fa05a604..87dfa7a8b 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -51,7 +51,7 @@ index 938872e82..6364f173f 100644
  struct templates_params {
      json messages;
      json tools;
-@@ -811,7 +798,7 @@ static std::string apply(
+@@ -817,7 +804,7 @@ static std::string apply(
      tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -61,7 +61,7 @@ index 938872e82..6364f173f 100644
  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index 50efb0d4e..f471a84c7 100644
+index 754c411e2..71241a6cc 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -9,7 +9,18 @@
@@ -85,10 +85,10 @@ index 50efb0d4e..f471a84c7 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index 4dc95dcba..ea0ea86c0 100644
+index f3cc55247..65398844f 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -1155,6 +1155,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+@@ -1162,6 +1162,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
      mparams.n_gpu_layers = params.n_gpu_layers;
  }
 
@@ -97,10 +97,10 @@ index 4dc95dcba..ea0ea86c0 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index f42c083fa..c573cc812 100644
+index de5b404dd..d30d252c9 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -274,6 +274,7 @@ struct lr_opt {
+@@ -281,6 +281,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
  struct common_params {
@@ -109,7 +109,7 @@ index f42c083fa..c573cc812 100644
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index e52e050a8..c1000c162 100644
+index d0cab0bcb..48d532838 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -121,6 +121,27 @@ index e52e050a8..c1000c162 100644
  else()
      check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
      if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+index cabd301ad..31eec134c 100644
+--- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
++++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+@@ -9,6 +9,7 @@
+ #include <chrono>
+ #include <mutex>
+ #include <string>
++#include <stdexcept>
+
+ #ifdef _WIN32
+ # include <sal.h>
+@@ -3682,6 +3683,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+         } catch (std::exception const &exc) {
+             GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
+             devices[i].context = nullptr;
++            opt_ndev = i;
++            break;
+         }
+     }
+ }
 diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
 index de01336cd..29b1a043d 100644
 --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
package/src/LlamaContext.cpp CHANGED
@@ -105,6 +105,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
       InstanceMethod<&LlamaContext::GetModelInfo>(
           "getModelInfo",
           static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::GetUsedDevices>(
+          "getUsedDevices",
+          static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::GetFormattedChat>(
           "getFormattedChat",
           static_cast<napi_property_attributes>(napi_enumerable)),
@@ -303,9 +306,32 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
     params.tensor_buft_overrides.push_back({nullptr, nullptr});
   }
 
+  auto cpu_mask = get_option<std::string>(options, "cpu_mask", "");
+  if (!cpu_mask.empty()) {
+    params.cpuparams.mask_valid = true;
+    if (!parse_cpu_mask(cpu_mask, params.cpuparams.cpumask)) {
+      Napi::TypeError::New(env, "Invalid cpu_mask").ThrowAsJavaScriptException();
+    }
+  }
+
+  params.cpuparams.strict_cpu = get_option<bool>(options, "cpu_strict", false);
+
   llama_backend_init();
   llama_numa_init(params.numa);
 
+  // Parse devices array
+  if (options.Has("devices") && options.Get("devices").IsArray()) {
+    auto devices_array = options.Get("devices").As<Napi::Array>();
+    for (size_t i = 0; i < devices_array.Length(); i++) {
+      auto device_name = devices_array.Get(i).ToString().Utf8Value();
+      auto * dev = ggml_backend_dev_by_name(device_name.c_str());
+      if (dev) {
+        params.devices.push_back(dev);
+      }
+      // Skip invalid device names silently
+    }
+  }
+
   std::vector<common_adapter_lora_info> lora;
   auto lora_path = get_option<std::string>(options, "lora", "");
   auto lora_scaled = get_option<float>(options, "lora_scaled", 1.0f);
@@ -378,6 +404,17 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   }
   _rn_ctx->attachThreadpoolsIfAvailable();
 
+  // Collect used devices from the loaded model
+  if (_rn_ctx->llama_init.model) {
+    const auto &model_devices = _rn_ctx->llama_init.model->devices;
+    for (auto dev : model_devices) {
+      const char *dev_name = ggml_backend_dev_name(dev);
+      if (dev_name != nullptr) {
+        _used_devices.push_back(std::string(dev_name));
+      }
+    }
+  }
+
   // Release progress callback after model is loaded
   if (has_progress_callback) {
     _progress_tsfn.Release();
@@ -583,6 +620,15 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
   return details;
 }
 
+// getUsedDevices(): string[]
+Napi::Value LlamaContext::GetUsedDevices(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  Napi::Array devices = Napi::Array::New(env, _used_devices.size());
+  for (size_t i = 0; i < _used_devices.size(); i++) {
+    devices[i] = Napi::String::New(env, _used_devices[i]);
+  }
+  return devices;
+}
 
 
 // getFormattedChat(
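
On the JavaScript side, the native getUsedDevices() added here returns the device names collected from the loaded model, which makes it easy to verify where the weights actually landed. A small sketch, continuing from a context created with loadModel as in the earlier examples:

    // Confirm the model was actually placed on an HTP backend.
    const used = ctx.getUsedDevices()
    if (!used.some((name) => name.startsWith('HTP'))) {
      console.warn('No HTP device in use; model loaded on:', used.join(', '))
    }
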
package/src/LlamaContext.h CHANGED
@@ -31,6 +31,7 @@ public:
 private:
   Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
   Napi::Value GetModelInfo(const Napi::CallbackInfo &info);
+  Napi::Value GetUsedDevices(const Napi::CallbackInfo &info);
   Napi::Value GetFormattedChat(const Napi::CallbackInfo &info);
   Napi::Value Completion(const Napi::CallbackInfo &info);
   void StopCompletion(const Napi::CallbackInfo &info);
@@ -69,6 +70,7 @@ private:
   void CancelRequest(const Napi::CallbackInfo &info);
 
   std::string _info;
+  std::vector<std::string> _used_devices;
   Napi::Object _meta;
   LlamaCompletionWorker *_wip = nullptr;
 
package/src/llama.cpp/common/CMakeLists.txt CHANGED
@@ -50,6 +50,8 @@ add_library(${TARGET} STATIC
     base64.hpp
     chat-parser.cpp
     chat-parser.h
+    chat-parser-xml-toolcall.h
+    chat-parser-xml-toolcall.cpp
     chat.cpp
     chat.h
     common.cpp