@fugood/llama.node 1.3.3 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -3
- package/lib/binding.js +1 -1
- package/lib/binding.ts +40 -14
- package/lib/index.js +4 -1
- package/lib/index.ts +13 -9
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +10 -10
- package/src/LlamaCompletionWorker.cpp +33 -33
- package/src/LlamaContext.cpp +53 -16
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/src/llama.cpp/common/chat-parser.h +10 -0
- package/src/llama.cpp/common/chat.cpp +461 -87
- package/src/llama.cpp/common/chat.h +6 -0
- package/src/llama.cpp/common/common.cpp +8 -1
- package/src/llama.cpp/common/common.h +12 -5
- package/src/llama.cpp/common/json-partial.cpp +19 -2
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -0
- package/src/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/src/llama.cpp/common/sampling.cpp +60 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +15 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +16 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -48
- package/src/llama.cpp/src/llama-grammar.cpp +17 -9
- package/src/llama.cpp/src/llama-impl.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +3 -6
- package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/CMakeLists.txt
CHANGED

@@ -120,16 +120,20 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT DEFINED GGML_OPENMP OR GGML_O
 endif()

 set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
-
-set(
-
+set(LLAMA_BUILD_TOOLS OFF CACHE BOOL "Build tools")
+set(LLAMA_BUILD_TESTS OFF CACHE BOOL "Build tests")
+set(LLAMA_BUILD_SERVER OFF CACHE BOOL "Build server")
+set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "Build examples")
 set(LLAMA_CURL OFF CACHE BOOL "Build curl")

+set(LLAMA_INSTALL_VERSION "0.0.0") # TODO: Set the version number (0.0.<BUILD_NUMBER>)
+
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")

 add_definitions(-DGGML_MAX_NAME=80)

 add_subdirectory("src/llama.cpp")
+add_subdirectory("src/llama.cpp/tools/mtmd")

 include_directories(
 ${CMAKE_JS_INC}
package/lib/binding.js
CHANGED

@@ -51,7 +51,7 @@ const getPlatformPackageName = (variant) => {
 };
 const loadPlatformPackage = (packageName) => __awaiter(void 0, void 0, void 0, function* () {
 try {
-return yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s)));
+return (yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s))));
 }
 catch (error) {
 return null;
package/lib/binding.ts
CHANGED

@@ -1,9 +1,9 @@
 export type MessagePart = {
-type: string
-text?: string
+type: string
+text?: string
 image_url?: {
 url?: string
-}
+}
 input_audio?: {
 format: string
 data?: string
@@ -70,6 +70,12 @@ export type LlamaModelOptions = {
 * Number of layers to keep MoE weights on CPU
 */
 n_cpu_moe?: number
+/**
+* List of device names to use for offloading
+* Device names can be obtained from getBackendDevicesInfo()
+* Example: ['Metal', 'BLAS', 'CPU']
+*/
+devices?: string[]
 use_mlock?: boolean
 use_mmap?: boolean
 vocab_only?: boolean
@@ -375,9 +381,13 @@ export type ToolCall = {
 }

 export interface LlamaContext {
-new (
+new (
+options: LlamaModelOptions,
+onProgress?: (progress: number) => void,
+): LlamaContext
 getSystemInfo(): string
 getModelInfo(): ModelInfo
+getUsedDevices(): string[]
 getFormattedChat(
 messages: ChatMessage[],
 chat_template?: string,
@@ -400,8 +410,15 @@ export interface LlamaContext {
 stopCompletion(): void
 tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
 detokenize(tokens: number[]): Promise<string>
-embedding(
-
+embedding(
+text: string,
+params?: { embd_normalize?: number },
+): Promise<EmbeddingResult>
+rerank(
+query: string,
+documents: string[],
+params?: RerankParams,
+): Promise<RerankResult[]>
 saveSession(path: string): Promise<void>
 loadSession(path: string): Promise<void>
 release(): Promise<void>
@@ -440,7 +457,7 @@ export interface LlamaContext {
 * @param options Object containing path and optional n_batch
 * @returns boolean indicating if loading was successful
 */
-initVocoder(options: { path: string
+initVocoder(options: { path: string; n_batch?: number }): boolean

 /**
 * Unload the vocoder model
@@ -459,7 +476,10 @@ export interface LlamaContext {
 * @param text Text to complete
 * @returns Formatted audio completion
 */
-getFormattedAudioCompletion(
+getFormattedAudioCompletion(
+speaker: string | null,
+text: string,
+): {
 prompt: string
 grammar?: string
 }
@@ -476,7 +496,7 @@ export interface LlamaContext {
 * @param tokens Tokens to decode
 * @returns Promise resolving to decoded audio tokens
 */
-decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>
+decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array>

 // Parallel decoding methods

@@ -485,7 +505,7 @@ export interface LlamaContext {
 * @param params Configuration for parallel mode
 * @returns boolean indicating if successful
 */
-enableParallelMode(params: { n_parallel?: number
+enableParallelMode(params: { n_parallel?: number; n_batch?: number }): boolean

 /**
 * Disable parallel decoding mode
@@ -563,9 +583,11 @@ const getPlatformPackageName = (variant?: LibVariant): string => {
 return `@fugood/node-llama-${platform}-${arch}${variantSuffix}`
 }

-const loadPlatformPackage = async (
+const loadPlatformPackage = async (
+packageName: string,
+): Promise<Module | null> => {
 try {
-return await import(packageName) as Module
+return (await import(packageName)) as Module
 } catch (error) {
 return null
 }
@@ -579,7 +601,9 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {

 module = await loadPlatformPackage(getPlatformPackageName())
 if (module) {
-console.warn(
+console.warn(
+`Not found package for variant "${variant}", fallback to default`,
+)
 return module
 }

@@ -588,7 +612,9 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
 return (await import('../build/Release/index.node')) as Module
 }

-export const isLibVariantAvailable = async (
+export const isLibVariantAvailable = async (
+variant?: LibVariant,
+): Promise<boolean> => {
 if (variant && variant !== 'default') {
 const module = await loadPlatformPackage(getPlatformPackageName(variant))
 return module != null
package/lib/index.js
CHANGED

@@ -76,6 +76,9 @@ class LlamaContextWrapper {
 getModelInfo() {
 return this.ctx.getModelInfo();
 }
+getUsedDevices() {
+return this.ctx.getUsedDevices();
+}
 isJinjaSupported() {
 const { minja } = this.ctx.getModelInfo().chatTemplates;
 return !!(minja === null || minja === void 0 ? void 0 : minja.toolUse) || !!(minja === null || minja === void 0 ? void 0 : minja.default);
@@ -85,7 +88,7 @@ class LlamaContextWrapper {
 }
 getFormattedChat(messages, template, params) {
 var _a;
-const { messages: chat, has_media, media_paths
+const { messages: chat, has_media, media_paths } = (0, utils_1.formatMediaChat)(messages);
 const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
 let tmpl;
 if (template)
package/lib/index.ts
CHANGED

@@ -94,6 +94,10 @@ class LlamaContextWrapper {
 return this.ctx.getModelInfo()
 }

+getUsedDevices(): string[] {
+return this.ctx.getUsedDevices()
+}
+
 isJinjaSupported(): boolean {
 const { minja } = this.ctx.getModelInfo().chatTemplates
 return !!minja?.toolUse || !!minja?.default
@@ -118,11 +122,7 @@ class LlamaContextWrapper {
 chat_template_kwargs?: Record<string, string>
 },
 ): FormattedChatResult {
-const {
-messages: chat,
-has_media,
-media_paths,
-} = formatMediaChat(messages)
+const { messages: chat, has_media, media_paths } = formatMediaChat(messages)

 const useJinja = this.isJinjaSupported() && params?.jinja
 let tmpl
@@ -169,8 +169,9 @@ class LlamaContextWrapper {
 options: LlamaCompletionOptions,
 callback?: (token: LlamaCompletionToken) => void,
 ): Promise<LlamaCompletionResult> {
-const { messages, media_paths = options.media_paths } =
-
+const { messages, media_paths = options.media_paths } = formatMediaChat(
+options.messages,
+)
 return this.ctx.completion(
 {
 ...options,
@@ -196,7 +197,10 @@ class LlamaContextWrapper {
 return this.ctx.detokenize(tokens)
 }

-embedding(
+embedding(
+text: string,
+params?: { embd_normalize?: number },
+): Promise<EmbeddingResult> {
 return this.ctx.embedding(text, params)
 }

@@ -329,7 +333,7 @@ export const loadLlamaModelInfo = async (
 }

 export const getBackendDevicesInfo = async (
-variant: LibVariant = 'default'
+variant: LibVariant = 'default',
 ): Promise<import('./binding').BackendDeviceInfo[]> => {
 mods[variant] ??= await loadModule(variant)
 refreshNativeLogSetup()
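The `getBackendDevicesInfo()` export touched above (only a trailing-comma change here) is the documented source of the names accepted by the new `devices` option. A hedged sketch of its use, assuming it is re-exported from the package root:

```ts
// Hedged sketch: assumes getBackendDevicesInfo is available from the package
// root; only the function name, its LibVariant parameter, and its
// BackendDeviceInfo[] return type appear in this diff.
import { getBackendDevicesInfo } from '@fugood/llama.node'

async function listDevices() {
  const devices = await getBackendDevicesInfo('default')
  // Inspect the available backends before choosing names for the new
  // `devices` model option (e.g. ['Metal', 'BLAS', 'CPU']).
  console.log(devices)
}
```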
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
 "name": "@fugood/llama.node",
 "access": "public",
-"version": "1.3.3",
+"version": "1.3.5",
 "description": "An another Node binding of llama.cpp",
 "main": "lib/index.js",
 "scripts": {
@@ -72,19 +72,19 @@
 "CMakeLists.txt"
 ],
 "optionalDependencies": {
-"@fugood/node-llama-linux-x64": "1.3.3",
-"@fugood/node-llama-linux-x64-vulkan": "1.3.3",
-"@fugood/node-llama-linux-x64-cuda": "1.3.3",
-"@fugood/node-llama-linux-arm64": "1.3.3",
-"@fugood/node-llama-linux-arm64-vulkan": "1.3.3",
-"@fugood/node-llama-linux-arm64-cuda": "1.3.3",
-"@fugood/node-llama-win32-x64": "1.3.3",
-"@fugood/node-llama-win32-x64-vulkan": "1.3.3",
-"@fugood/node-llama-win32-x64-cuda": "1.3.3",
-"@fugood/node-llama-win32-arm64": "1.3.3",
-"@fugood/node-llama-win32-arm64-vulkan": "1.3.3",
-"@fugood/node-llama-darwin-x64": "1.3.3",
-"@fugood/node-llama-darwin-arm64": "1.3.3"
+"@fugood/node-llama-linux-x64": "1.3.5",
+"@fugood/node-llama-linux-x64-vulkan": "1.3.5",
+"@fugood/node-llama-linux-x64-cuda": "1.3.5",
+"@fugood/node-llama-linux-arm64": "1.3.5",
+"@fugood/node-llama-linux-arm64-vulkan": "1.3.5",
+"@fugood/node-llama-linux-arm64-cuda": "1.3.5",
+"@fugood/node-llama-win32-x64": "1.3.5",
+"@fugood/node-llama-win32-x64-vulkan": "1.3.5",
+"@fugood/node-llama-win32-x64-cuda": "1.3.5",
+"@fugood/node-llama-win32-arm64": "1.3.5",
+"@fugood/node-llama-win32-arm64-vulkan": "1.3.5",
+"@fugood/node-llama-darwin-x64": "1.3.5",
+"@fugood/node-llama-darwin-arm64": "1.3.5"
 },
 "devDependencies": {
 "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED

@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
-index
+index bb168e835..cfc0e2c2e 100644
 --- a/src/llama.cpp/common/CMakeLists.txt
 +++ b/src/llama.cpp/common/CMakeLists.txt
-@@ -
+@@ -143,9 +143,16 @@ if (LLAMA_LLGUIDANCE)
 set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()

@@ -21,7 +21,7 @@ index 706fa32ee..248459903 100644

 #
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 6fa05a604..87dfa7a8b 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -51,7 +51,7 @@ index 938872e82..6364f173f 100644
 struct templates_params {
 json messages;
 json tools;
-@@ -
+@@ -817,7 +804,7 @@ static std::string apply(
 tmpl_inputs.extra_context.merge_patch(*additional_context);
 }
 // TODO: add flag to control date/time, if only for testing purposes.
@@ -61,7 +61,7 @@ index 938872e82..6364f173f 100644
 minja::chat_template_options tmpl_opts;
 // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index
+index 754c411e2..71241a6cc 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -9,7 +9,18 @@
@@ -85,10 +85,10 @@ index 50efb0d4e..f471a84c7 100644
 struct common_chat_tool_call {
 std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index f3cc55247..65398844f 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1162,6 +1162,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 mparams.n_gpu_layers = params.n_gpu_layers;
 }

@@ -97,10 +97,10 @@ index 4dc95dcba..ea0ea86c0 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index de5b404dd..d30d252c9 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -281,6 +281,7 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

 struct common_params {
@@ -109,7 +109,7 @@ index f42c083fa..c573cc812 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index d0cab0bcb..48d532838 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
package/src/LlamaCompletionWorker.cpp
CHANGED

@@ -9,10 +9,10 @@ Napi::Array TokenProbsToArray(Napi::Env env, llama_context* ctx, const std::vect
 for (size_t i = 0; i < probs.size(); i++) {
 const auto &prob = probs[i];
 Napi::Object token_obj = Napi::Object::New(env);
-
+
 std::string token_str = common_token_to_piece(ctx, prob.tok);
 token_obj.Set("content", Napi::String::New(env, token_str));
-
+
 Napi::Array token_probs = Napi::Array::New(env);
 for (size_t j = 0; j < prob.probs.size(); j++) {
 const auto &p = prob.probs[j];
@@ -83,10 +83,10 @@ void LlamaCompletionWorker::Execute() {
 }

 auto completion = _rn_ctx->completion;
-
+
 // Prepare completion context
 completion->rewind();
-
+
 // Set up parameters
 _rn_ctx->params.prompt = _params.prompt;
 _rn_ctx->params.sampling = _params.sampling;
@@ -95,50 +95,50 @@ void LlamaCompletionWorker::Execute() {
 _rn_ctx->params.n_ctx = _params.n_ctx;
 _rn_ctx->params.n_batch = _params.n_batch;
 _rn_ctx->params.ctx_shift = _params.ctx_shift;
-
+
 // Set prefill text
 completion->prefill_text = _prefill_text;
-
+
 // Set up TTS guide tokens if enabled
 if (_has_vocoder && _rn_ctx->tts_wrapper != nullptr) {
 _rn_ctx->tts_wrapper->guide_tokens = _guide_tokens;
 _rn_ctx->tts_wrapper->next_token_uses_guide_token = true;
 }
-
+
 // Initialize sampling
 if (!completion->initSampling()) {
 SetError("Failed to initialize sampling");
 return;
 }
-
+
 // Load prompt (handles both text-only and multimodal)
 completion->loadPrompt(_media_paths);
-
+
 // Check if context is full after loading prompt
 if (completion->context_full) {
 _result.context_full = true;
 return;
 }
-
+
 // Begin completion with chat format and reasoning settings
 completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open);
-
+
 // Main completion loop
 int token_count = 0;
 const int max_tokens = _params.n_predict < 0 ? std::numeric_limits<int>::max() : _params.n_predict;
 while (completion->has_next_token && !_interrupted && token_count < max_tokens) {
 // Get next token using rn-llama completion
 rnllama::completion_token_output token_output = completion->doCompletion();
-
+
 if (token_output.tok == -1) {
 break;
 }
-
+
 token_count++;
-
+
 std::string token_text = common_token_to_piece(_rn_ctx->ctx, token_output.tok);
 _result.text += token_text;
-
+
 // Check for stopping strings after adding the token
 if (!_stop_words.empty()) {
 size_t stop_pos = completion->findStoppingStrings(_result.text, token_text.size(), rnllama::STOP_FULL);
@@ -148,7 +148,7 @@ void LlamaCompletionWorker::Execute() {
 break;
 }
 }
-
+
 // Handle streaming callback
 if (_has_callback && !completion->incomplete) {
 struct TokenData {
@@ -160,9 +160,9 @@ void LlamaCompletionWorker::Execute() {
 std::vector<rnllama::completion_token_output> completion_probabilities;
 llama_context* ctx;
 };
-
+
 auto partial_output = completion->parseChatOutput(true);
-
+
 // Extract completion probabilities if n_probs > 0, similar to iOS implementation
 std::vector<rnllama::completion_token_output> probs_output;
 if (_rn_ctx->params.sampling.n_probs > 0) {
@@ -171,23 +171,23 @@ void LlamaCompletionWorker::Execute() {
 size_t probs_stop_pos = std::min(_sent_token_probs_index + to_send_toks.size(), completion->generated_token_probs.size());
 if (probs_pos < probs_stop_pos) {
 probs_output = std::vector<rnllama::completion_token_output>(
-completion->generated_token_probs.begin() + probs_pos,
+completion->generated_token_probs.begin() + probs_pos,
 completion->generated_token_probs.begin() + probs_stop_pos
 );
 }
 _sent_token_probs_index = probs_stop_pos;
 }
-
+
 TokenData *token_data = new TokenData{
-token_text,
-partial_output.content,
-partial_output.reasoning_content,
-partial_output.tool_calls,
+token_text,
+partial_output.content,
+partial_output.reasoning_content,
+partial_output.tool_calls,
 partial_output.accumulated_text,
 probs_output,
 _rn_ctx->ctx
 };
-
+
 _tsfn.BlockingCall(token_data, [](Napi::Env env, Napi::Function jsCallback,
 TokenData *data) {
 auto obj = Napi::Object::New(env);
@@ -216,25 +216,25 @@ void LlamaCompletionWorker::Execute() {
 obj.Set("tool_calls", tool_calls);
 }
 obj.Set("accumulated_text", Napi::String::New(env, data->accumulated_text));
-
+
 // Add completion_probabilities if available
 if (!data->completion_probabilities.empty()) {
 obj.Set("completion_probabilities", TokenProbsToArray(env, data->ctx, data->completion_probabilities));
 }
-
+
 delete data;
 jsCallback.Call({obj});
 });
 }
 }
-
+
 // Check stopping conditions
 if (token_count >= max_tokens) {
 _result.stopped_limited = true;
 } else if (!completion->has_next_token && completion->n_remain == 0) {
 _result.stopped_limited = true;
 }
-
+
 // Set completion results from rn-llama completion context
 // tokens_evaluated should include both prompt tokens and generated tokens that were processed
 _result.tokens_evaluated = completion->num_prompt_tokens + completion->num_tokens_predicted;
@@ -245,20 +245,20 @@ void LlamaCompletionWorker::Execute() {
 _result.stopped_words = completion->stopped_word;
 _result.stopping_word = completion->stopping_word;
 _result.stopped_limited = completion->stopped_limit;
-
+
 // Get audio tokens if TTS is enabled
 if (_has_vocoder && _rn_ctx->tts_wrapper != nullptr) {
 _result.audio_tokens = _rn_ctx->tts_wrapper->audio_tokens;
 }
-
+common_perf_print(_rn_ctx->ctx, _rn_ctx->completion->ctx_sampling);
 // End completion
 completion->endCompletion();
-
+
 } catch (const std::exception &e) {
 SetError(e.what());
 return;
 }
-
+
 if (_onComplete) {
 _onComplete();
 }