@fugood/llama.node 1.4.2 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -1
- package/lib/binding.js +3 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +9 -0
- package/lib/index.ts +10 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -11
- package/src/LlamaContext.cpp +24 -0
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/CMakeLists.txt +21 -6
- package/src/llama.cpp/common/CMakeLists.txt +6 -0
- package/src/llama.cpp/common/arg.cpp +83 -22
- package/src/llama.cpp/common/chat-parser.cpp +40 -0
- package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
- package/src/llama.cpp/common/chat-peg-parser.h +105 -0
- package/src/llama.cpp/common/chat.cpp +40 -29
- package/src/llama.cpp/common/chat.h +10 -1
- package/src/llama.cpp/common/common.cpp +70 -7
- package/src/llama.cpp/common/common.h +23 -5
- package/src/llama.cpp/common/download.cpp +18 -8
- package/src/llama.cpp/common/download.h +3 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +18 -27
- package/src/llama.cpp/common/log.h +19 -12
- package/src/llama.cpp/common/peg-parser.cpp +1712 -0
- package/src/llama.cpp/common/peg-parser.h +459 -0
- package/src/llama.cpp/common/unicode.cpp +64 -0
- package/src/llama.cpp/common/unicode.h +22 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +52 -48
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
- package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +98 -12
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +30 -1
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-graph.cpp +3 -6
- package/src/llama.cpp/src/llama-hparams.h +2 -2
- package/src/llama.cpp/src/llama-impl.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +54 -6
- package/src/llama.cpp/src/llama-quant.cpp +0 -29
- package/src/llama.cpp/src/llama-vocab.cpp +1 -2
- package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
- package/src/llama.cpp/src/models/mistral3.cpp +160 -0
- package/src/llama.cpp/src/models/models.h +4 -0
- package/src/llama.cpp/src/unicode.cpp +2 -2
package/CMakeLists.txt
CHANGED
package/lib/binding.js
CHANGED
@@ -64,6 +64,9 @@ const loadModule = (variant) => __awaiter(void 0, void 0, void 0, function* () {
         /* no-op */
       }
     }
+    const nDev = process.env.GGML_HEXAGON_NDEV;
+    if (!nDev)
+      process.env.GGML_HEXAGON_NDEV = '16';
   }
   let module = yield loadPlatformPackage(packageName);
   if (module) {
package/lib/binding.ts
CHANGED
@@ -565,6 +565,14 @@ export interface LlamaContext {
   */
  cancelRequest(requestId: number): void

+  /**
+   * Clear the KV and recurrent caches.
+   * This is faster than recreating the context and useful for preventing
+   * cache contamination between chat sessions.
+   * @param clearData If true, also clears the cache data (default: false)
+   */
+  clearCache(clearData?: boolean): void
+
  // static
  loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
  toggleNativeLog(

@@ -616,6 +624,8 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
       /* no-op */
     }
   }
+  const nDev = process.env.GGML_HEXAGON_NDEV
+  if (!nDev) process.env.GGML_HEXAGON_NDEV = '16'
 }

 let module = await loadPlatformPackage(packageName)
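Note on the loadModule change above: GGML_HEXAGON_NDEV (read by the Hexagon/Snapdragon backend) now defaults to '16' when it is unset before the platform package is loaded. A minimal sketch of overriding that default from application code; the import path and the 'snapdragon' variant string are assumptions, not taken from this diff:

    import { loadModule } from '@fugood/llama.node/lib/binding' // hypothetical import path

    // Set the variable before loadModule runs; the new code only fills it in when unset.
    process.env.GGML_HEXAGON_NDEV = '8'

    const mod = await loadModule('snapdragon') // variant name is an assumption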
package/lib/index.js
CHANGED
@@ -195,6 +195,15 @@ class LlamaContextWrapper {
     decodeAudioTokens(tokens) {
         return this.ctx.decodeAudioTokens(tokens);
     }
+    /**
+     * Clear the KV and recurrent caches.
+     * This is faster than recreating the context and useful for preventing
+     * cache contamination between chat sessions.
+     * @param clearData If true, also clears the cache data (default: false)
+     */
+    clearCache(clearData) {
+        this.ctx.clearCache(clearData);
+    }
 }
 const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
package/lib/index.ts
CHANGED
@@ -299,6 +299,16 @@ class LlamaContextWrapper {
   decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
     return this.ctx.decodeAudioTokens(tokens)
   }
+
+  /**
+   * Clear the KV and recurrent caches.
+   * This is faster than recreating the context and useful for preventing
+   * cache contamination between chat sessions.
+   * @param clearData If true, also clears the cache data (default: false)
+   */
+  clearCache(clearData?: boolean): void {
+    this.ctx.clearCache(clearData)
+  }
 }

 export const loadModel = async (
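The clearCache wrapper added above forwards directly to the native binding; per the LlamaContext.cpp change later in this diff, the native side throws if the context is disposed or while a completion is still predicting, so it should be called between requests. A rough usage sketch, assuming loadModel (exported from lib/index) resolves to this context wrapper and takes a model path option whose exact name is an assumption:

    import { loadModel } from '@fugood/llama.node'

    const ctx = await loadModel({ model: './model.gguf' }) // option name is an assumption

    // ...finish a chat session, then reset state before the next one...
    ctx.clearCache()     // clear the KV and recurrent caches
    ctx.clearCache(true) // additionally clear the cache data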
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.2",
+  "version": "1.4.4",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {

@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.2",
-    "@fugood/node-llama-darwin-x64": "1.4.2",
-    "@fugood/node-llama-linux-arm64": "1.4.2",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.2",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.2",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.2",
-    "@fugood/node-llama-linux-x64": "1.4.2",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.2",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.2",
-    "@fugood/node-llama-win32-arm64": "1.4.2",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.2",
-    "@fugood/node-llama-win32-x64": "1.4.2",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.2",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.2"
+    "@fugood/node-llama-darwin-arm64": "1.4.4",
+    "@fugood/node-llama-darwin-x64": "1.4.4",
+    "@fugood/node-llama-linux-arm64": "1.4.4",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.4",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.4",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.4",
+    "@fugood/node-llama-linux-x64": "1.4.4",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.4",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.4",
+    "@fugood/node-llama-win32-arm64": "1.4.4",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.4",
+    "@fugood/node-llama-win32-x64": "1.4.4",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.4",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.4"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
-index
+index 377b26846..1873b5206 100644
 --- a/src/llama.cpp/common/CMakeLists.txt
 +++ b/src/llama.cpp/common/CMakeLists.txt
-@@ -
+@@ -149,9 +149,16 @@ if (LLAMA_LLGUIDANCE)
      set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()

@@ -20,8 +20,22 @@ index bb168e835..cfc0e2c2e 100644


 #
+diff --git a/src/llama.cpp/common/chat-peg-parser.cpp b/src/llama.cpp/common/chat-peg-parser.cpp
+index 74a7b6a46..7b7a1bd50 100644
+--- a/src/llama.cpp/common/chat-peg-parser.cpp
++++ b/src/llama.cpp/common/chat-peg-parser.cpp
+@@ -1,9 +1,5 @@
+ #include "chat-peg-parser.h"
+
+-#include <nlohmann/json.hpp>
+-
+-using json = nlohmann::json;
+-
+ static std::string_view trim_trailing_space(std::string_view sv) {
+     while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+         sv.remove_suffix(1);
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 41a5bb42d..da5cf4b94 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@

@@ -34,7 +48,7 @@ index b4a0f985e..2383d2ea9 100644
 #include <algorithm>
 #include <cstdio>
 #include <cctype>
-@@ -
+@@ -134,16 +131,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
      return diffs;
 }

@@ -51,7 +65,7 @@ index b4a0f985e..2383d2ea9 100644
 struct templates_params {
     json messages;
     json tools;
-@@ -
+@@ -720,7 +707,7 @@ static std::string apply(
         tmpl_inputs.extra_context.merge_patch(*additional_context);
     }
     // TODO: add flag to control date/time, if only for testing purposes.

@@ -61,10 +75,10 @@ index b4a0f985e..2383d2ea9 100644
     minja::chat_template_options tmpl_opts;
     // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index
+index 6085510a4..263076ce2 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
-@@ -
+@@ -10,7 +10,18 @@
 #include <vector>
 #include <map>

@@ -85,10 +99,10 @@ index 754c411e2..71241a6cc 100644
 struct common_chat_tool_call {
     std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index f07af1d86..1b10c7b13 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1236,6 +1236,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.n_gpu_layers = params.n_gpu_layers;
     }

@@ -97,10 +111,10 @@ index 0d7fd9a93..6bf3cc7ab 100644
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 179113a4d..78aa24bc3 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -302,6 +302,7 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

 struct common_params {
package/src/LlamaContext.cpp
CHANGED
@@ -200,6 +200,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::CancelRequest>(
           "cancelRequest",
+          static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::ClearCache>(
+          "clearCache",
           static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);

@@ -1505,3 +1508,24 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
   worker->Queue();
   return worker->Promise();
 }
+
+// clearCache(clearData?: boolean): void
+void LlamaContext::ClearCache(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (!_rn_ctx) {
+    Napi::TypeError::New(env, "Context is disposed").ThrowAsJavaScriptException();
+    return;
+  }
+  if (_rn_ctx->completion != nullptr && _rn_ctx->completion->is_predicting) {
+    Napi::TypeError::New(env, "Cannot clear cache while completion is in progress")
+        .ThrowAsJavaScriptException();
+    return;
+  }
+
+  bool clear_data = false;
+  if (info.Length() >= 1 && info[0].IsBoolean()) {
+    clear_data = info[0].ToBoolean().Value();
+  }
+
+  _rn_ctx->clearCache(clear_data);
+}
package/src/LlamaContext.h
CHANGED
@@ -69,6 +69,9 @@ private:
   Napi::Value QueueRerank(const Napi::CallbackInfo &info);
   void CancelRequest(const Napi::CallbackInfo &info);

+  // Cache management
+  void ClearCache(const Napi::CallbackInfo &info);
+
   std::string _info;
   std::vector<std::string> _used_devices;
   Napi::Object _meta;
package/src/llama.cpp/CMakeLists.txt
CHANGED

@@ -33,10 +33,24 @@ endif()

 option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)

+option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
 if (EMSCRIPTEN)
     set(BUILD_SHARED_LIBS_DEFAULT OFF)

-
+    # Use 64-bit memory to support backend_get_memory queries
+    # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+    if (LLAMA_WASM_MEM64)
+        add_compile_options("-sMEMORY64=1")
+        add_link_options("-sMEMORY64=1")
+    endif()
+    add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+    option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+    if (LLAMA_BUILD_HTML)
+        set(CMAKE_EXECUTABLE_SUFFIX ".html")
+    endif()
 else()
     if (MINGW)
         set(BUILD_SHARED_LIBS_DEFAULT OFF)

@@ -58,6 +72,12 @@ if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

+if (LLAMA_STANDALONE)
+    # enable parallel builds for msbuild
+    list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
+    list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
+endif()
+
 if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
     set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
 else()

@@ -179,11 +199,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()

-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
 #
 # build the library
 #
package/src/llama.cpp/common/CMakeLists.txt
CHANGED

@@ -52,6 +52,8 @@ add_library(${TARGET} STATIC
     chat-parser.h
     chat-parser-xml-toolcall.h
     chat-parser-xml-toolcall.cpp
+    chat-peg-parser.cpp
+    chat-peg-parser.h
     chat.cpp
     chat.h
     common.cpp

@@ -69,12 +71,16 @@ add_library(${TARGET} STATIC
     log.h
     ngram-cache.cpp
     ngram-cache.h
+    peg-parser.cpp
+    peg-parser.h
     regex-partial.cpp
     regex-partial.h
     sampling.cpp
     sampling.h
     speculative.cpp
     speculative.h
+    unicode.cpp
+    unicode.h
 )

 if (BUILD_SHARED_LIBS)
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -30,6 +30,7 @@
 #include <thread> // for hardware_concurrency
 #include <vector>

+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)

@@ -41,6 +42,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif
+
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

 using json = nlohmann::ordered_json;

@@ -212,13 +215,13 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default,
         bool offline) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
         if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
             model.path = common_docker_resolve_model(model.docker_repo);
+            model.name = model.docker_repo; // set name for consistency
         } else if (!model.hf_repo.empty()) {
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {

@@ -227,7 +230,8 @@ static handle_model_result common_params_handle_model(
                 if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                     exit(1); // built without CURL, error message already printed
                 }
-                model.
+                model.name = model.hf_repo; // repo name with tag
+                model.hf_repo = auto_detected.repo; // repo name without tag
                 model.hf_file = auto_detected.ggufFile;
                 if (!auto_detected.mmprojFile.empty()) {
                     result.found_mmproj = true;

@@ -257,8 +261,6 @@ static handle_model_result common_params_handle_model(
             model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }

-        } else if (model.path.empty()) {
-            model.path = model_path_default;
         }
     }

@@ -405,7 +407,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token,
+        auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {

@@ -415,12 +417,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         // only download mmproj if the current example is using it
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token,
+                common_params_handle_model(params.mmproj, params.hf_token, params.offline);
                 break;
             }
         }
-        common_params_handle_model(params.speculative.model, params.hf_token,
-        common_params_handle_model(params.vocoder.model, params.hf_token,
+        common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
+        common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+    }
+
+    // model is required (except for server)
+    // TODO @ngxson : maybe show a list of available models in CLI in this case
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+        throw std::invalid_argument("error: --model is required\n");
     }

     if (params.escape) {

@@ -700,6 +708,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.use_jinja = true;
     }

+    params.use_color = tty_can_use_colors();
+
     // load dynamic backends
     ggml_backend_load_all();

@@ -782,10 +792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
-       {"-co", "--color"},
-
-
-
+       {"-co", "--color"}, "[on|off|auto]",
+       "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
+       "'auto' enables colors when output is to a terminal",
+       [](common_params & params, const std::string & value) {
+           if (is_truthy(value)) {
+               params.use_color = true;
+           } else if (is_falsey(value)) {
+               params.use_color = false;
+           } else if (is_autoy(value)) {
+               params.use_color = tty_can_use_colors();
+           } else {
+               throw std::invalid_argument(
+                   string_format("error: unknown value for --color: '%s'\n", value.c_str()));
+           }
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(

@@ -1014,7 +1034,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
            } else {
                throw std::runtime_error(
-                   string_format("error:
+                   string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
            }
        }).set_env("LLAMA_ARG_FLASH_ATTN"));
    add_opt(common_arg(

@@ -1221,7 +1241,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
-   ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+   ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(

@@ -2090,11 +2110,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"-m", "--model"}, "FNAME",
        ex == LLAMA_EXAMPLE_EXPORT_LORA
-           ?
-           :
-           "model path (default: `models/$filename` with filename from `--hf-file` "
-           "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
-       ),
+           ? "model path from which to load base model"
+           : "model path to load",
        [](common_params & params, const std::string & value) {
            params.model.path = value;
        }

@@ -2486,12 +2503,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "path to save slot kv cache (default: disabled)",
        [](common_params & params, const std::string & value) {
            params.slot_save_path = value;
+           if (!fs_is_directory(params.slot_save_path)) {
+               throw std::invalid_argument("not a directory: " + value);
+           }
            // if doesn't end with DIRECTORY_SEPARATOR, add it
            if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
                params.slot_save_path += DIRECTORY_SEPARATOR;
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+   add_opt(common_arg(
+       {"--media-path"}, "PATH",
+       "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
+       [](common_params & params, const std::string & value) {
+           params.media_path = value;
+           if (!fs_is_directory(params.media_path)) {
+               throw std::invalid_argument("not a directory: " + value);
+           }
+           // if doesn't end with DIRECTORY_SEPARATOR, add it
+           if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
+               params.media_path += DIRECTORY_SEPARATOR;
+           }
+       }
+   ).set_examples({LLAMA_EXAMPLE_SERVER}));
+   add_opt(common_arg(
+       {"--models-dir"}, "PATH",
+       "directory containing models for the router server (default: disabled)",
+       [](common_params & params, const std::string & value) {
+           params.models_dir = value;
+       }
+   ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+   add_opt(common_arg(
+       {"--models-max"}, "N",
+       string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
+       [](common_params & params, int value) {
+           params.models_max = value;
+       }
+   ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+   add_opt(common_arg(
+       {"--no-models-autoload"},
+       "disables automatic loading of models (default: enabled)",
+       [](common_params & params) {
+           params.models_autoload = false;
+       }
+   ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
    add_opt(common_arg(
        {"--jinja"},
        string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),

@@ -2639,7 +2694,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params &, const std::string & value) {
            common_log_set_file(common_log_main(), value.c_str());
        }
-   ));
+   ).set_env("LLAMA_LOG_FILE"));
    add_opt(common_arg(
        {"--log-colors"}, "[on|off|auto]",
        "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"

@@ -2653,7 +2708,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
            } else {
                throw std::invalid_argument(
-                   string_format("error:
+                   string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
            }
        }
    ).set_env("LLAMA_LOG_COLORS"));

@@ -2674,7 +2729,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_OFFLINE"));
    add_opt(common_arg(
        {"-lv", "--verbosity", "--log-verbosity"}, "N",
-       "Set the verbosity threshold. Messages with a higher verbosity will be ignored."
+       string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
+                     " - 0: generic output\n"
+                     " - 1: error\n"
+                     " - 2: warning\n"
+                     " - 3: info\n"
+                     " - 4: debug\n"
+                     "(default: %d)\n", params.verbosity),
        [](common_params & params, int value) {
            params.verbosity = value;
            common_log_set_verbosity_thold(value);
package/src/llama.cpp/common/chat-parser.cpp
CHANGED

@@ -1,6 +1,8 @@
 #include "chat-parser.h"
+#include "chat-peg-parser.h"
 #include "common.h"
 #include "log.h"
+#include "peg-parser.h"
 #include "regex-partial.h"

 #include <algorithm>

@@ -1483,6 +1485,11 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
 }

 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+    if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
+        syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
+        syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+        return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
+    }
     common_chat_msg_parser builder(input, is_partial, syntax);
     try {
         common_chat_parse(builder);

@@ -1500,3 +1507,36 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     }
     return msg;
 }
+
+common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+    if (parser.empty()) {
+        throw std::runtime_error("Failed to parse due to missing parser definition.");
+    }
+
+    LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(syntax.format), input.c_str());
+
+    common_peg_parse_context ctx(input, is_partial);
+    auto result = parser.parse(ctx);
+    if (result.fail()) {
+        throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end));
+    }
+
+    common_chat_msg msg;
+    msg.role = "assistant";
+
+    if (syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE) {
+        auto mapper = common_chat_peg_native_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+    } else if (syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+        auto mapper = common_chat_peg_constructed_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+    } else {
+        // Generic mapper
+        auto mapper = common_chat_peg_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+    }
+    if (!is_partial) {
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    }
+    return msg;
+}