@fugood/llama.node 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +21 -1
- package/lib/binding.js +1 -1
- package/lib/binding.ts +47 -15
- package/lib/index.js +26 -2
- package/lib/index.ts +42 -10
- package/package.json +15 -14
- package/scripts/llama.cpp.patch +31 -10
- package/src/LlamaContext.cpp +46 -0
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/src/llama.cpp/common/chat-parser.h +10 -0
- package/src/llama.cpp/common/chat.cpp +461 -87
- package/src/llama.cpp/common/chat.h +6 -0
- package/src/llama.cpp/common/common.cpp +8 -1
- package/src/llama.cpp/common/common.h +12 -5
- package/src/llama.cpp/common/json-partial.cpp +19 -2
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -0
- package/src/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/src/llama.cpp/common/sampling.cpp +60 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +15 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +16 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -48
- package/src/llama.cpp/src/llama-grammar.cpp +17 -9
- package/src/llama.cpp/src/llama-impl.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +3 -6
- package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/CMakeLists.txt
CHANGED
@@ -44,7 +44,8 @@ else()
 endif()
 
 if (TO_PACKAGE)
-  set(
+  set(PACKAGE_NAME "node-llama-${PLATFORM}-${ARCH}${VARIANT}")
+  set(PLATFORM_BINARY_DIR ${CMAKE_SOURCE_DIR}/packages/${PACKAGE_NAME})
 else()
   set(PLATFORM_BINARY_DIR ${CMAKE_SOURCE_DIR}/build/Release)
 endif()
@@ -188,6 +189,13 @@ if (NOT MSVC AND CMAKE_SYSTEM_NAME STREQUAL "Windows")
   set(CMAKE_JS_LIB win_dynamic_load)
 endif()
 
+if (TO_PACKAGE AND GGML_HEXAGON)
+  set(NODE_RPATH "node_modules/@fugood/${PACKAGE_NAME}")
+  set(ELECTRON_ASAR_RPATH "resources/app.asar.unpacked/node_modules/@fugood/${PACKAGE_NAME}")
+  set(ELECTRON_RES_RPATH "resources/node_modules/@fugood/${PACKAGE_NAME}")
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath,${NODE_RPATH} -Wl,-rpath,${ELECTRON_ASAR_RPATH} -Wl,-rpath,${ELECTRON_RES_RPATH}")
+endif()
+
 add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
 set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
 target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB} llama ggml common mtmd ${CMAKE_THREAD_LIBS_INIT})
@@ -237,3 +245,15 @@ if (GGML_CLBLAST AND TO_PACKAGE)
   )
 endif()
 endif()
+
+if (GGML_HEXAGON)
+  get_target_property(HTP_LIBS_DIR ggml-hexagon BINARY_DIR)
+  add_custom_command(
+    TARGET copy_assets
+    COMMAND ${CMAKE_COMMAND} -E copy ${HTP_LIBS_DIR}/libggml-htp-v73.so ${PLATFORM_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy ${HTP_LIBS_DIR}/libggml-htp-v75.so ${PLATFORM_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy ${HTP_LIBS_DIR}/libggml-htp-v79.so ${PLATFORM_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy ${HTP_LIBS_DIR}/libggml-htp-v81.so ${PLATFORM_BINARY_DIR}
+    COMMENT "Copying HTP libraries to bin folder"
+  )
+endif()
package/lib/binding.js
CHANGED
@@ -51,7 +51,7 @@ const getPlatformPackageName = (variant) => {
 };
 const loadPlatformPackage = (packageName) => __awaiter(void 0, void 0, void 0, function* () {
     try {
-        return yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s)));
+        return (yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s))));
     }
     catch (error) {
         return null;
package/lib/binding.ts
CHANGED
@@ -1,9 +1,9 @@
 export type MessagePart = {
-type: string
-text?: string
+  type: string
+  text?: string
   image_url?: {
     url?: string
-}
+  }
   input_audio?: {
     format: string
     data?: string
@@ -25,6 +25,12 @@ export type LlamaModelOptions = {
   n_ctx?: number
   n_batch?: number
   n_ubatch?: number
+  /**
+   * CPU affinity mask
+   * Example: '0xfc'
+   */
+  cpu_mask?: string
+  cpu_strict?: boolean
   /**
   * Number of parallel sequences to support (sets n_seq_max).
   * This determines the maximum number of parallel slots that can be used.
@@ -70,6 +76,12 @@ export type LlamaModelOptions = {
   * Number of layers to keep MoE weights on CPU
   */
   n_cpu_moe?: number
+  /**
+   * List of device names to use for offloading
+   * Device names can be obtained from getBackendDevicesInfo()
+   * Example: ['Metal', 'BLAS', 'CPU']
+   */
+  devices?: string[]
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean
@@ -375,9 +387,13 @@ export type ToolCall = {
 }
 
 export interface LlamaContext {
-  new (
+  new (
+    options: LlamaModelOptions,
+    onProgress?: (progress: number) => void,
+  ): LlamaContext
   getSystemInfo(): string
   getModelInfo(): ModelInfo
+  getUsedDevices(): string[]
   getFormattedChat(
     messages: ChatMessage[],
     chat_template?: string,
@@ -400,8 +416,15 @@
   stopCompletion(): void
   tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
   detokenize(tokens: number[]): Promise<string>
-  embedding(
-
+  embedding(
+    text: string,
+    params?: { embd_normalize?: number },
+  ): Promise<EmbeddingResult>
+  rerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+  ): Promise<RerankResult[]>
   saveSession(path: string): Promise<void>
   loadSession(path: string): Promise<void>
   release(): Promise<void>
@@ -440,7 +463,7 @@
   * @param options Object containing path and optional n_batch
   * @returns boolean indicating if loading was successful
   */
-  initVocoder(options: { path: string
+  initVocoder(options: { path: string; n_batch?: number }): boolean
 
   /**
   * Unload the vocoder model
@@ -459,7 +482,10 @@
   * @param text Text to complete
   * @returns Formatted audio completion
   */
-  getFormattedAudioCompletion(
+  getFormattedAudioCompletion(
+    speaker: string | null,
+    text: string,
+  ): {
     prompt: string
     grammar?: string
   }
@@ -476,7 +502,7 @@
   * @param tokens Tokens to decode
   * @returns Promise resolving to decoded audio tokens
   */
-  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>
+  decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array>
 
   // Parallel decoding methods
 
@@ -485,7 +511,7 @@
   * @param params Configuration for parallel mode
   * @returns boolean indicating if successful
   */
-  enableParallelMode(params: { n_parallel?: number
+  enableParallelMode(params: { n_parallel?: number; n_batch?: number }): boolean
 
   /**
   * Disable parallel decoding mode
@@ -554,7 +580,7 @@ export interface Module {
   LlamaContext: LlamaContext
 }
 
-export type LibVariant = 'default' | 'vulkan' | 'cuda'
+export type LibVariant = 'default' | 'vulkan' | 'cuda' | 'snapdragon'
 
 const getPlatformPackageName = (variant?: LibVariant): string => {
   const platform = process.platform
@@ -563,9 +589,11 @@ const getPlatformPackageName = (variant?: LibVariant): string => {
   return `@fugood/node-llama-${platform}-${arch}${variantSuffix}`
 }
 
-const loadPlatformPackage = async (
+const loadPlatformPackage = async (
+  packageName: string,
+): Promise<Module | null> => {
   try {
-    return await import(packageName) as Module
+    return (await import(packageName)) as Module
   } catch (error) {
     return null
   }
@@ -579,7 +607,9 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
 
   module = await loadPlatformPackage(getPlatformPackageName())
   if (module) {
-    console.warn(
+    console.warn(
+      `Not found package for variant "${variant}", fallback to default`,
+    )
     return module
   }
 
@@ -588,7 +618,9 @@
   return (await import('../build/Release/index.node')) as Module
 }
 
-export const isLibVariantAvailable = async (
+export const isLibVariantAvailable = async (
+  variant?: LibVariant,
+): Promise<boolean> => {
   if (variant && variant !== 'default') {
     const module = await loadPlatformPackage(getPlatformPackageName(variant))
     return module != null
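Taken together, the typing changes above add cpu_mask, cpu_strict and devices to LlamaModelOptions, add getUsedDevices() to the LlamaContext interface, split the embedding/rerank signatures, and extend LibVariant with 'snapdragon'. A minimal usage sketch of the new options follows; the `model` path option name and the file path are assumptions of this example, not shown in the diff:

import { loadModel } from '@fugood/llama.node'

// Sketch only: `model` as the GGUF path option is assumed for illustration.
const ctx = await loadModel({
  model: './model.gguf',
  cpu_mask: '0xfc',          // CPU affinity mask, per the new doc comment
  cpu_strict: true,
  devices: ['Metal', 'CPU'], // names as reported by getBackendDevicesInfo()
})
console.log(ctx.getUsedDevices()) // devices the loaded model actually resolved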
package/lib/index.js
CHANGED
@@ -76,6 +76,9 @@ class LlamaContextWrapper {
     getModelInfo() {
         return this.ctx.getModelInfo();
     }
+    getUsedDevices() {
+        return this.ctx.getUsedDevices();
+    }
     isJinjaSupported() {
         const { minja } = this.ctx.getModelInfo().chatTemplates;
         return !!(minja === null || minja === void 0 ? void 0 : minja.toolUse) || !!(minja === null || minja === void 0 ? void 0 : minja.default);
@@ -85,7 +88,7 @@ class LlamaContextWrapper {
     }
     getFormattedChat(messages, template, params) {
         var _a;
-        const { messages: chat, has_media, media_paths
+        const { messages: chat, has_media, media_paths } = (0, utils_1.formatMediaChat)(messages);
         const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
         let tmpl;
         if (template)
@@ -198,7 +201,28 @@ const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, fun
     const variant = (_a = options.lib_variant) !== null && _a !== void 0 ? _a : 'default';
     (_b = mods[variant]) !== null && _b !== void 0 ? _b : (mods[variant] = yield (0, binding_1.loadModule)(options.lib_variant));
     refreshNativeLogSetup();
-    const
+    const { devices } = options;
+    let filteredDevs = [];
+    if (Array.isArray(devices)) {
+        filteredDevs = [...devices];
+        // Handle HTP* to use all HTP devices on Hexagon
+        if (variant === 'snapdragon' && devices.includes('HTP*')) {
+            const backendDevices = yield (0, exports.getBackendDevicesInfo)(variant);
+            const htpDevices = backendDevices
+                .filter((d) => d.deviceName.startsWith('HTP'))
+                .map((d) => d.deviceName);
+            filteredDevs = filteredDevs.reduce((acc, dev) => {
+                if (dev.startsWith('HTP*')) {
+                    acc.push(...htpDevices);
+                }
+                else if (!dev.startsWith('HTP')) {
+                    acc.push(dev);
+                }
+                return acc;
+            }, []);
+        }
+    }
+    const nativeCtx = new mods[variant].LlamaContext(Object.assign(Object.assign({}, options), { devices: filteredDevs.length > 0 ? filteredDevs : undefined }), onProgress);
     return new LlamaContextWrapper(nativeCtx);
 });
 exports.loadModel = loadModel;
package/lib/index.ts
CHANGED
@@ -94,6 +94,10 @@ class LlamaContextWrapper {
     return this.ctx.getModelInfo()
   }
 
+  getUsedDevices(): string[] {
+    return this.ctx.getUsedDevices()
+  }
+
   isJinjaSupported(): boolean {
     const { minja } = this.ctx.getModelInfo().chatTemplates
     return !!minja?.toolUse || !!minja?.default
@@ -118,11 +122,7 @@ class LlamaContextWrapper {
     chat_template_kwargs?: Record<string, string>
   },
   ): FormattedChatResult {
-    const {
-      messages: chat,
-      has_media,
-      media_paths,
-    } = formatMediaChat(messages)
+    const { messages: chat, has_media, media_paths } = formatMediaChat(messages)
 
     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
@@ -169,8 +169,9 @@
     options: LlamaCompletionOptions,
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult> {
-    const { messages, media_paths = options.media_paths } =
-
+    const { messages, media_paths = options.media_paths } = formatMediaChat(
+      options.messages,
+    )
     return this.ctx.completion(
       {
         ...options,
@@ -196,7 +197,10 @@
     return this.ctx.detokenize(tokens)
   }
 
-  embedding(
+  embedding(
+    text: string,
+    params?: { embd_normalize?: number },
+  ): Promise<EmbeddingResult> {
     return this.ctx.embedding(text, params)
   }
 
@@ -305,7 +309,35 @@ export const loadModel = async (
   mods[variant] ??= await loadModule(options.lib_variant)
   refreshNativeLogSetup()
 
-  const
+  const { devices } = options
+  let filteredDevs: Array<string> = []
+  if (Array.isArray(devices)) {
+    filteredDevs = [...devices]
+
+    // Handle HTP* to use all HTP devices on Hexagon
+    if (variant === 'snapdragon' && devices.includes('HTP*')) {
+      const backendDevices = await getBackendDevicesInfo(variant)
+      const htpDevices = backendDevices
+        .filter((d) => d.deviceName.startsWith('HTP'))
+        .map((d) => d.deviceName)
+      filteredDevs = filteredDevs.reduce((acc, dev) => {
+        if (dev.startsWith('HTP*')) {
+          acc.push(...htpDevices)
+        } else if (!dev.startsWith('HTP')) {
+          acc.push(dev)
+        }
+        return acc
+      }, [] as Array<string>)
+    }
+  }
+
+  const nativeCtx = new mods[variant].LlamaContext(
+    {
+      ...options,
+      devices: filteredDevs.length > 0 ? filteredDevs : undefined,
+    },
+    onProgress,
+  )
   return new LlamaContextWrapper(nativeCtx)
 }
 
@@ -329,7 +361,7 @@ export const loadLlamaModelInfo = async (
 }
 
 export const getBackendDevicesInfo = async (
-  variant: LibVariant = 'default'
+  variant: LibVariant = 'default',
 ): Promise<import('./binding').BackendDeviceInfo[]> => {
   mods[variant] ??= await loadModule(variant)
   refreshNativeLogSetup()
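As the new block in loadModel above shows, when lib_variant is 'snapdragon' a 'HTP*' entry in devices is expanded into every backend device whose name starts with 'HTP' (as reported by getBackendDevicesInfo), and explicit HTP entries are dropped in its favour. A hedged sketch of that wildcard in use; the `model` option name is assumed and exact HTP device names are backend-defined:

import { loadModel, getBackendDevicesInfo } from '@fugood/llama.node'

// List what the snapdragon backend exposes; entries carry a deviceName field.
const devs = await getBackendDevicesInfo('snapdragon')
console.log(devs.map((d) => d.deviceName))

// 'HTP*' is replaced by all HTP-prefixed devices before the native context is created.
const ctx = await loadModel({
  model: './model.gguf', // placeholder path; option name assumed
  lib_variant: 'snapdragon',
  devices: ['HTP*', 'CPU'],
})
console.log(ctx.getUsedDevices())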
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.4",
+  "version": "1.3.6",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.
-    "@fugood/node-llama-linux-x64-cuda": "1.3.
-    "@fugood/node-llama-linux-arm64": "1.3.
-    "@fugood/node-llama-linux-arm64
-    "@fugood/node-llama-linux-arm64-
-    "@fugood/node-llama-
-    "@fugood/node-llama-win32-x64
-    "@fugood/node-llama-win32-x64-
-    "@fugood/node-llama-win32-
-    "@fugood/node-llama-win32-arm64
-    "@fugood/node-llama-
-    "@fugood/node-llama-darwin-
+    "@fugood/node-llama-linux-x64": "1.3.6",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.6",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.6",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.3.6",
+    "@fugood/node-llama-linux-arm64": "1.3.6",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.6",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.6",
+    "@fugood/node-llama-win32-x64": "1.3.6",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.6",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.6",
+    "@fugood/node-llama-win32-arm64": "1.3.6",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.6",
+    "@fugood/node-llama-darwin-x64": "1.3.6",
+    "@fugood/node-llama-darwin-arm64": "1.3.6"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
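The new linux-arm64 snapdragon prebuild ships as one more optional dependency, so it is only installed where the platform matches. A sketch of probing for it at runtime with isLibVariantAvailable from lib/binding.ts above (assuming the helper is re-exported from the package entry point; if not, import it from the binding module directly):

import { loadModel, isLibVariantAvailable } from '@fugood/llama.node'

// Fall back to the default build when the snapdragon package is not installed.
const variant = (await isLibVariantAvailable('snapdragon')) ? 'snapdragon' : 'default'
const ctx = await loadModel({ model: './model.gguf', lib_variant: variant }) // `model` option name assumed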
package/scripts/llama.cpp.patch
CHANGED
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
-index
+index bb168e835..cfc0e2c2e 100644
 --- a/src/llama.cpp/common/CMakeLists.txt
 +++ b/src/llama.cpp/common/CMakeLists.txt
-@@ -
+@@ -143,9 +143,16 @@ if (LLAMA_LLGUIDANCE)
 set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()
 
@@ -21,7 +21,7 @@ index 706fa32ee..248459903 100644
 
 #
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 6fa05a604..87dfa7a8b 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -51,7 +51,7 @@ index 938872e82..6364f173f 100644
 struct templates_params {
 json messages;
 json tools;
-@@ -
+@@ -817,7 +804,7 @@ static std::string apply(
 tmpl_inputs.extra_context.merge_patch(*additional_context);
 }
 // TODO: add flag to control date/time, if only for testing purposes.
@@ -61,7 +61,7 @@ index 938872e82..6364f173f 100644
 minja::chat_template_options tmpl_opts;
 // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index
+index 754c411e2..71241a6cc 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -9,7 +9,18 @@
@@ -85,10 +85,10 @@ index 50efb0d4e..f471a84c7 100644
 struct common_chat_tool_call {
 std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index f3cc55247..65398844f 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1162,6 +1162,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 mparams.n_gpu_layers = params.n_gpu_layers;
 }
 
@@ -97,10 +97,10 @@ index 4dc95dcba..ea0ea86c0 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index de5b404dd..d30d252c9 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -281,6 +281,7 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
 struct common_params {
@@ -109,7 +109,7 @@ index f42c083fa..c573cc812 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index d0cab0bcb..48d532838 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -121,6 +121,27 @@ index e52e050a8..c1000c162 100644
 else()
 check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
 if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+index cabd301ad..31eec134c 100644
+--- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
++++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+@@ -9,6 +9,7 @@
+ #include <chrono>
+ #include <mutex>
+ #include <string>
++#include <stdexcept>
+ 
+ #ifdef _WIN32
+ # include <sal.h>
+@@ -3682,6 +3683,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+ } catch (std::exception const &exc) {
+ GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
+ devices[i].context = nullptr;
++ opt_ndev = i;
++ break;
+ }
+ }
+ }
 diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
 index de01336cd..29b1a043d 100644
 --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
package/src/LlamaContext.cpp
CHANGED
@@ -105,6 +105,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
       InstanceMethod<&LlamaContext::GetModelInfo>(
           "getModelInfo",
           static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::GetUsedDevices>(
+          "getUsedDevices",
+          static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::GetFormattedChat>(
           "getFormattedChat",
           static_cast<napi_property_attributes>(napi_enumerable)),
@@ -303,9 +306,32 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
     params.tensor_buft_overrides.push_back({nullptr, nullptr});
   }
 
+  auto cpu_mask = get_option<std::string>(options, "cpu_mask", "");
+  if (!cpu_mask.empty()) {
+    params.cpuparams.mask_valid = true;
+    if (!parse_cpu_mask(cpu_mask, params.cpuparams.cpumask)) {
+      Napi::TypeError::New(env, "Invalid cpu_mask").ThrowAsJavaScriptException();
+    }
+  }
+
+  params.cpuparams.strict_cpu = get_option<bool>(options, "cpu_strict", false);
+
   llama_backend_init();
   llama_numa_init(params.numa);
 
+  // Parse devices array
+  if (options.Has("devices") && options.Get("devices").IsArray()) {
+    auto devices_array = options.Get("devices").As<Napi::Array>();
+    for (size_t i = 0; i < devices_array.Length(); i++) {
+      auto device_name = devices_array.Get(i).ToString().Utf8Value();
+      auto * dev = ggml_backend_dev_by_name(device_name.c_str());
+      if (dev) {
+        params.devices.push_back(dev);
+      }
+      // Skip invalid device names silently
+    }
+  }
+
   std::vector<common_adapter_lora_info> lora;
   auto lora_path = get_option<std::string>(options, "lora", "");
   auto lora_scaled = get_option<float>(options, "lora_scaled", 1.0f);
@@ -378,6 +404,17 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   }
   _rn_ctx->attachThreadpoolsIfAvailable();
 
+  // Collect used devices from the loaded model
+  if (_rn_ctx->llama_init.model) {
+    const auto &model_devices = _rn_ctx->llama_init.model->devices;
+    for (auto dev : model_devices) {
+      const char *dev_name = ggml_backend_dev_name(dev);
+      if (dev_name != nullptr) {
+        _used_devices.push_back(std::string(dev_name));
+      }
+    }
+  }
+
   // Release progress callback after model is loaded
   if (has_progress_callback) {
     _progress_tsfn.Release();
@@ -583,6 +620,15 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
   return details;
 }
 
+// getUsedDevices(): string[]
+Napi::Value LlamaContext::GetUsedDevices(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  Napi::Array devices = Napi::Array::New(env, _used_devices.size());
+  for (size_t i = 0; i < _used_devices.size(); i++) {
+    devices[i] = Napi::String::New(env, _used_devices[i]);
+  }
+  return devices;
+}
 
 
 // getFormattedChat(
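On the native side, an invalid cpu_mask makes the constructor raise a TypeError ("Invalid cpu_mask"), unknown entries in devices are skipped silently, and getUsedDevices() reports only the devices the loaded model actually resolved. A sketch of the behavior as observed from JavaScript (paths and device names are placeholders; the `model` option name is assumed):

import { loadModel } from '@fugood/llama.node'

try {
  const ctx = await loadModel({
    model: './model.gguf',            // placeholder; option name assumed
    cpu_mask: '0xfc',                 // a malformed mask would reject this call
    devices: ['NoSuchDevice', 'CPU'], // unknown names are dropped, not errors
  })
  console.log(ctx.getUsedDevices())   // only devices that were actually used
} catch (err) {
  console.error('Failed to load model:', err)
}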
package/src/LlamaContext.h
CHANGED
@@ -31,6 +31,7 @@ public:
 private:
   Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
   Napi::Value GetModelInfo(const Napi::CallbackInfo &info);
+  Napi::Value GetUsedDevices(const Napi::CallbackInfo &info);
   Napi::Value GetFormattedChat(const Napi::CallbackInfo &info);
   Napi::Value Completion(const Napi::CallbackInfo &info);
   void StopCompletion(const Napi::CallbackInfo &info);
@@ -69,6 +70,7 @@ private:
   void CancelRequest(const Napi::CallbackInfo &info);
 
   std::string _info;
+  std::vector<std::string> _used_devices;
   Napi::Object _meta;
   LlamaCompletionWorker *_wip = nullptr;