@fugood/llama.node 1.2.0-rc.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +16 -15
- package/src/llama.cpp/CMakeLists.txt +7 -0
- package/src/llama.cpp/common/arg.cpp +141 -21
- package/src/llama.cpp/common/chat.cpp +139 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.h +23 -8
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +28 -7
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +10 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -1
- package/src/llama.cpp/src/llama-arch.cpp +44 -10
- package/src/llama.cpp/src/llama-arch.h +9 -0
- package/src/llama.cpp/src/llama-chat.cpp +17 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +13 -11
- package/src/llama.cpp/src/llama-graph.cpp +6 -5
- package/src/llama.cpp/src/llama-hparams.h +14 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
- package/src/llama.cpp/src/llama-kv-cache.h +8 -0
- package/src/llama.cpp/src/llama-model.cpp +386 -140
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-quant.cpp +6 -4
- package/src/llama.cpp/src/llama-vocab.cpp +13 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +53 -10
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.0-rc.0",
+  "version": "1.2.1",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-arm64": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-x64": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-arm64": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-darwin-x64": "1.2.0-rc.0",
-    "@fugood/node-llama-darwin-arm64": "1.2.0-rc.0"
+    "@fugood/node-llama-linux-x64": "1.2.1",
+    "@fugood/node-llama-linux-x64-vulkan": "1.2.1",
+    "@fugood/node-llama-linux-x64-cuda": "1.2.1",
+    "@fugood/node-llama-linux-arm64": "1.2.1",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.2.1",
+    "@fugood/node-llama-linux-arm64-cuda": "1.2.1",
+    "@fugood/node-llama-win32-x64": "1.2.1",
+    "@fugood/node-llama-win32-x64-vulkan": "1.2.1",
+    "@fugood/node-llama-win32-x64-cuda": "1.2.1",
+    "@fugood/node-llama-win32-arm64": "1.2.1",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.2.1",
+    "@fugood/node-llama-darwin-x64": "1.2.1",
+    "@fugood/node-llama-darwin-arm64": "1.2.1"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -118,7 +118,8 @@
       "**/*.test.ts"
     ],
     "testPathIgnorePatterns": [
-      "<rootDir>/src/llama.rn/"
+      "<rootDir>/src/llama.rn/",
+      "<rootDir>/src/llama.cpp/"
     ]
   },
   "prettier": {
package/src/llama.cpp/CMakeLists.txt
CHANGED
@@ -58,6 +58,12 @@ if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()
 
+if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
+    set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
+else()
+    set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
+endif()
+
 #
 # option list
 #
@@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
+option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
 
 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
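The net effect for packagers: `LLAMA_TOOLS_INSTALL` is a new switch gating installation of the llama.cpp tools. It defaults to `OFF` on iOS and to `${LLAMA_STANDALONE}` everywhere else, and can be overridden at configure time with `-DLLAMA_TOOLS_INSTALL=ON` or `OFF`.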
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -745,6 +745,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 
 #endif // LLAMA_USE_CURL
 
+//
+// Docker registry functions
+//
+
+static std::string common_docker_get_token(const std::string & repo) {
+    std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
+
+    common_remote_params params;
+    auto res = common_remote_get_content(url, params);
+
+    if (res.first != 200) {
+        throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
+    }
+
+    std::string response_str(res.second.begin(), res.second.end());
+    nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
+
+    if (!response.contains("token")) {
+        throw std::runtime_error("Docker registry token response missing 'token' field");
+    }
+
+    return response["token"].get<std::string>();
+}
+
+static std::string common_docker_resolve_model(const std::string & docker) {
+    // Parse ai/smollm2:135M-Q4_K_M
+    size_t colon_pos = docker.find(':');
+    std::string repo, tag;
+    if (colon_pos != std::string::npos) {
+        repo = docker.substr(0, colon_pos);
+        tag = docker.substr(colon_pos + 1);
+    } else {
+        repo = docker;
+        tag = "latest";
+    }
+
+    // ai/ is the default
+    size_t slash_pos = docker.find('/');
+    if (slash_pos == std::string::npos) {
+        repo.insert(0, "ai/");
+    }
+
+    LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
+    try {
+        // --- helper: digest validation ---
+        auto validate_oci_digest = [](const std::string & digest) -> std::string {
+            // Expected: algo:hex ; start with sha256 (64 hex chars)
+            // You can extend this map if supporting other algorithms in future.
+            static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
+            std::smatch m;
+            if (!std::regex_match(digest, m, re)) {
+                throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
+            }
+            // normalize hex to lowercase
+            std::string normalized = digest;
+            std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
+                return std::tolower(c);
+            });
+            return normalized;
+        };
+
+        std::string token = common_docker_get_token(repo); // Get authentication token
+
+        // Get manifest
+        const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
+        std::string manifest_url = url_prefix + "/manifests/" + tag;
+        common_remote_params manifest_params;
+        manifest_params.headers.push_back("Authorization: Bearer " + token);
+        manifest_params.headers.push_back(
+            "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+        auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
+        if (manifest_res.first != 200) {
+            throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
+        }
+
+        std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
+        nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
+        std::string gguf_digest; // Find the GGUF layer
+        if (manifest.contains("layers")) {
+            for (const auto & layer : manifest["layers"]) {
+                if (layer.contains("mediaType")) {
+                    std::string media_type = layer["mediaType"].get<std::string>();
+                    if (media_type == "application/vnd.docker.ai.gguf.v3" ||
+                        media_type.find("gguf") != std::string::npos) {
+                        gguf_digest = layer["digest"].get<std::string>();
+                        break;
+                    }
+                }
+            }
+        }
+
+        if (gguf_digest.empty()) {
+            throw std::runtime_error("No GGUF layer found in Docker manifest");
+        }
+
+        // Validate & normalize digest
+        gguf_digest = validate_oci_digest(gguf_digest);
+        LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
+
+        // Prepare local filename
+        std::string model_filename = repo;
+        std::replace(model_filename.begin(), model_filename.end(), '/', '_');
+        model_filename += "_" + tag + ".gguf";
+        std::string local_path = fs_get_cache_file(model_filename);
+
+        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
+        if (!common_download_file_single(blob_url, local_path, token, false)) {
+            throw std::runtime_error("Failed to download Docker Model");
+        }
+
+        LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
+        return local_path;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
+        throw;
+    }
+}
+
 //
 // utils
 //
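The resolver accepts references of the form `[<repo>/]<model>[:tag]` and normalizes them before talking to the registry. A minimal standalone sketch of just that parsing step (the `parse_docker_ref` helper is illustrative, not part of the patch):

```cpp
#include <string>
#include <utility>

// Illustrative helper mirroring the parsing in common_docker_resolve_model:
// "[<repo>/]<model>[:tag]" -> { repo, tag }, defaulting the namespace to
// "ai/" and the tag to "latest".
static std::pair<std::string, std::string> parse_docker_ref(const std::string & docker) {
    const size_t colon_pos = docker.find(':');
    std::string repo = colon_pos == std::string::npos ? docker : docker.substr(0, colon_pos);
    std::string tag  = colon_pos == std::string::npos ? "latest" : docker.substr(colon_pos + 1);
    if (docker.find('/') == std::string::npos) {
        repo.insert(0, "ai/"); // bare model name -> default "ai/" namespace
    }
    return { repo, tag };
}

// parse_docker_ref("smollm2")                -> { "ai/smollm2", "latest" }
// parse_docker_ref("ai/smollm2:135M-Q4_K_M") -> { "ai/smollm2", "135M-Q4_K_M" }
```

From there the flow is: anonymous pull token, manifest fetch (accepting both Docker and OCI manifest media types), first layer whose `mediaType` mentions `gguf`, digest validation, then a digest-addressed blob download into the common cache directory.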
@@ -795,7 +913,9 @@ static handle_model_result common_params_handle_model(
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
-        if (!model.hf_repo.empty()) {
+        if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
+            model.path = common_docker_resolve_model(model.docker_repo);
+        } else if (!model.hf_repo.empty()) {
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
@@ -1184,7 +1304,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
     } else {
         for (const auto & device : dev_names) {
             auto * dev = ggml_backend_dev_by_name(device.c_str());
-            if (!dev || ggml_backend_dev_type(dev)
+            if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                 throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
             }
             devices.push_back(dev);
@@ -1194,7 +1314,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
     return devices;
 }
 
-static void add_rpc_devices(std::string servers) {
+static void add_rpc_devices(const std::string & servers) {
     auto rpc_servers = string_split<std::string>(servers, ',');
     if (rpc_servers.empty()) {
         throw std::invalid_argument("no RPC servers specified");
@@ -1584,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -2396,24 +2516,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--list-devices"},
         "print list of available devices and exit",
         [](common_params &) {
-            std::vector<ggml_backend_dev_t>
-            std::vector<ggml_backend_dev_t> all_devices;
+            std::vector<ggml_backend_dev_t> devices;
             for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                 auto * dev = ggml_backend_dev_get(i);
-                if (ggml_backend_dev_type(dev)
-
-                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
-                        rpc_devices.push_back(dev);
-                    } else {
-                        all_devices.push_back(dev);
-                    }
-                }
+                if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                    devices.push_back(dev);
                 }
             }
-            // insert RPC devices in front
-            all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
             printf("Available devices:\n");
-            for (
-                auto * dev = all_devices[i];
+            for (auto * dev : devices) {
                 size_t free, total;
                 ggml_backend_dev_memory(dev, &free, &total);
                 printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
@@ -2437,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back(
+            params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
@@ -2450,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             for (int i = 0; i < value; ++i) {
                 // keep strings alive and avoid leaking memory by storing them in a static vector
                 static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(
+                buft_overrides.push_back(llm_ffn_exps_block_regex(i));
                 params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
@@ -2459,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe-draft", "-cmoed"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
-            params.speculative.tensor_buft_overrides.push_back(
+            params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
@@ -2471,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             for (int i = 0; i < value; ++i) {
                 static std::list<std::string> buft_overrides_draft;
-                buft_overrides_draft.push_back(
+                buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
                 params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
@@ -2636,6 +2747,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.url = value;
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
+    add_opt(common_arg(
+        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
+        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
+        "example: gemma3\n"
+        "(default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.model.docker_repo = value;
+        }
+    ).set_env("LLAMA_ARG_DOCKER_REPO"));
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
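In practice the new flag mirrors `-hf`/`--hf-repo`: passing something like `-dr gemma3` (or setting `LLAMA_ARG_DOCKER_REPO`) resolves to `ai/gemma3:latest` per the defaults above, downloads the GGUF layer into the common cache directory, and then proceeds as if `--model` had pointed at the cached file.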
package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -618,6 +618,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
@@ -685,11 +686,13 @@ static void parse_json_tool_calls(
     size_t from = std::string::npos;
     auto first = true;
     while (true) {
+        auto start_pos = builder.pos();
         auto res = function_regex_start_only && first
             ? builder.try_consume_regex(*function_regex_start_only)
             : function_regex
                 ? builder.try_find_regex(*function_regex, from)
                 : std::nullopt;
+
         if (res) {
             std::string name;
             if (get_function_name) {
@@ -724,6 +727,8 @@ static void parse_json_tool_calls(
                 return;
             }
             throw common_chat_msg_partial_exception("incomplete tool call");
+        } else {
+            builder.move_to(start_pos);
         }
         break;
     }
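The `start_pos`/`move_to` pair added above makes the failed-match path of `parse_json_tool_calls` restore the parser cursor rather than trusting the regex probe to leave it where it started. The discipline, reduced to a self-contained sketch (the `cursor` type is hypothetical, standing in for `common_chat_msg_parser`):

```cpp
#include <cstdio>
#include <string>

// Hypothetical cursor standing in for common_chat_msg_parser.
struct cursor {
    const std::string & input;
    size_t pos = 0;

    // Scan forward for `lit`; on success leave the cursor just past it.
    bool try_find(const std::string & lit) {
        const size_t start_pos = pos;   // save before the attempt
        const size_t hit = input.find(lit, pos);
        if (hit != std::string::npos) {
            pos = hit + lit.size();     // success: consume through the match
            return true;
        }
        pos = start_pos;                // failure: rewind explicitly instead of
        return false;                   // assuming the probe left `pos` untouched
    }
};

int main() {
    const std::string s = "plain text, no tool call here";
    cursor c{s};
    if (!c.try_find("<|tool▁call▁begin|>")) {
        std::printf("no match, pos still %zu\n", c.pos); // pos == 0
    }
    return 0;
}
```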
@@ -1374,6 +1379,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for DeepSeek V3.1 template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    auto prompt = apply(tmpl, inputs,
+        /* messages_override= */ inputs.messages,
+        /* tools_override= */ std::nullopt,
+        additional_context);
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    if (string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
+                    "\" " + builder.add_schema(name + "-args", parameters) + " "
+                    "\"<|tool▁call▁end|>\""));
+            });
+            // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+            // so we accept common variants (then it's all constrained)
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
+                "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+                "\"<|tool▁calls▁end|>\""
+                " space");
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<|tool▁calls▁begin|>",
+                "<|tool▁call▁begin|>",
+                "<|tool▁sep|>",
+                "<|tool▁call▁end|>",
+                "<|tool▁calls▁end|>",
+            };
+        });
+    }
+    return data;
+}
+
 static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     if (!builder.syntax().parse_tool_calls) {
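Concretely, the grammar above constrains a V3.1 tool call to roughly `<|tool▁calls▁begin|><|tool▁call▁begin|>NAME<|tool▁sep|>{...json args...}<|tool▁call▁end|><|tool▁calls▁end|>`, with the leading `<|tool▁call▁begin|>` optional, several spellings of the opening tag accepted (the Distill Qwen comment explains why), and an optional leading `</think>` when thinking was forced open. `grammar_lazy` keeps the grammar dormant until one of the trigger patterns fires, except when tool choice is required.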
@@ -1395,6 +1465,66 @@
         tool_calls_end);
 }
 
+static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");
+
+    static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
+    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
+    static const common_regex tool_calls_end("<|tool▁calls▁end|>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    parse_json_tool_calls(
+        builder,
+        /* block_open= */ tool_calls_begin,
+        /* function_regex_start_only= */ std::nullopt,
+        function_regex,
+        close_regex,
+        tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_deepseek_v3_1_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
+            common_chat_parse_deepseek_v3_1_content(builder);
+        }
+    }
+}
+
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
@@ -2351,6 +2481,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         }
     }
 
+    // DeepSeek V3.1: detect based on specific patterns in the template
+    if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+        params.json_schema.is_null()) {
+        return common_chat_params_init_deepseek_v3_1(tmpl, params);
+    }
+
     // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
     if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_deepseek_r1(tmpl, params);
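Ordering matters here: the V3.1 check is placed ahead of the existing R1 check because a V3.1 template presumably also contains the `<|tool▁calls▁begin|>` marker that the R1 branch keys on; matching the `message['prefix'] ... and thinking` fragment first keeps the newer template from falling through to `common_chat_params_init_deepseek_r1`.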
@@ -2523,6 +2659,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
             common_chat_parse_deepseek_r1(builder);
             break;
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+            common_chat_parse_deepseek_v3_1(builder);
+            break;
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
             common_chat_parse_functionary_v3_2(builder);
             break;
package/src/llama.cpp/common/chat.h
CHANGED
@@ -118,6 +118,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_GRANITE,
package/src/llama.cpp/common/common.h
CHANGED
@@ -193,10 +193,11 @@ struct common_params_sampling {
 };
 
 struct common_params_model {
-    std::string path
-    std::string url
-    std::string hf_repo
-    std::string hf_file
+    std::string path        = ""; // model local path // NOLINT
+    std::string url         = ""; // model url to download // NOLINT
+    std::string hf_repo     = ""; // HF repo // NOLINT
+    std::string hf_file     = ""; // HF file // NOLINT
+    std::string docker_repo = ""; // Docker repo // NOLINT
 };
 
 struct common_params_speculative {
@@ -288,9 +289,9 @@
     float rope_freq_base  = 0.0f;  // RoPE base frequency
     float rope_freq_scale = 0.0f;  // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
-    float yarn_attn_factor =
-    float yarn_beta_fast =
-    float yarn_beta_slow =
+    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast   = -1.0f; // YaRN low correction dim
+    float yarn_beta_slow   = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
 
     // offload params
@@ -453,7 +454,7 @@
 
     std::string slot_save_path;
 
-    float slot_prompt_similarity = 0.
+    float slot_prompt_similarity = 0.1f;
 
     // batched-bench params
     bool is_pp_shared = false;
@@ -734,6 +735,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
|
|
734
735
|
|
|
735
736
|
}
|
|
736
737
|
|
|
738
|
+
//
|
|
739
|
+
// MoE utils
|
|
740
|
+
//
|
|
741
|
+
|
|
742
|
+
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
|
|
743
|
+
|
|
744
|
+
static std::string llm_ffn_exps_block_regex(int idx) {
|
|
745
|
+
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
|
|
746
|
+
}
|
|
747
|
+
|
|
748
|
+
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
|
|
749
|
+
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
|
|
750
|
+
}
|
|
751
|
+
|
|
737
752
|
//
|
|
738
753
|
// training utils
|
|
739
754
|
//
|
|
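These helpers centralize the override strings that `--cpu-moe` and `--n-cpu-moe` in `arg.cpp` now consume. A small sketch of what they expand to (using plain `snprintf` in place of common's `string_format`):

```cpp
#include <cstdio>
#include <string>

// Mirrors llm_ffn_exps_block_regex for illustration: build the per-layer
// pattern from the shared expert-tensor regex.
static std::string ffn_exps_block_regex(int idx) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), "blk\\.%d%s", idx, "\\.ffn_(up|down|gate)_exps");
    return buf;
}

int main() {
    // Prints: blk\.3\.ffn_(up|down|gate)_exps
    // That pattern matches tensors such as "blk.3.ffn_up_exps", so --n-cpu-moe 4
    // pins the expert weights of layers 0..3 to the CPU buffer type, while
    // llm_ffn_exps_cpu_override() pins the expert tensors of every layer.
    std::printf("%s\n", ffn_exps_block_regex(3).c_str());
    return 0;
}
```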
package/src/llama.cpp/common/json-schema-to-grammar.cpp
CHANGED
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };
 
 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES
-
-
-    for (const auto &p : PRIMITIVE_RULES)
-    for (const auto &p : STRING_FORMAT_RULES)
-
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
 
@@ -843,9 +844,10 @@ public:
                 _build_object_rule(
                     properties, required, name,
                     schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
+            std::map<std::string, size_t> enum_values;
             std::string hybrid_name = name;
             std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                 if (comp_schema.contains("$ref")) {
@@ -857,6 +859,14 @@ public:
                             required.insert(prop.key());
                         }
                     }
+                } else if (comp_schema.contains("enum")) {
+                    for (const auto & v : comp_schema["enum"]) {
+                        const auto rule = _generate_constant_rule(v);
+                        if (enum_values.find(rule) == enum_values.end()) {
+                            enum_values[rule] = 0;
+                        }
+                        enum_values[rule] += 1;
+                    }
                 } else {
                     // todo warning
                 }
@@ -870,6 +880,17 @@ public:
                     add_component(t, true);
                 }
             }
+            if (!enum_values.empty()) {
+                std::vector<std::string> enum_intersection;
+                for (const auto & p : enum_values) {
+                    if (p.second == schema["allOf"].size()) {
+                        enum_intersection.push_back(p.first);
+                    }
+                }
+                if (!enum_intersection.empty()) {
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                }
+            }
             return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
         } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
             json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
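The `allOf` branch now special-cases components that are bare `enum`s: each constant rule is counted per component, and only constants seen in every component survive. A reduced sketch of the counting (plain strings stand in for `_generate_constant_rule` output):

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // Two allOf components, e.g. {"enum": ["a","b"]} and {"enum": ["b","c"]};
    // the strings below stand in for the constant rules each value compiles to.
    const std::vector<std::vector<std::string>> all_of = {
        { "\"a\"", "\"b\"" },
        { "\"b\"", "\"c\"" },
    };

    std::map<std::string, size_t> enum_values;
    for (const auto & comp : all_of) {
        for (const auto & rule : comp) {
            enum_values[rule] += 1; // operator[] value-initializes new keys to 0
        }
    }

    std::vector<std::string> enum_intersection;
    for (const auto & p : enum_values) {
        if (p.second == all_of.size()) {
            enum_intersection.push_back(p.first); // only "b" appears in both
        }
    }

    for (const auto & r : enum_intersection) {
        std::printf("%s\n", r.c_str()); // prints "b"; the rule becomes ("b") space
    }
    return 0;
}
```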
@@ -190,7 +190,6 @@ option(GGML_WEBGPU "ggml: use WebGPU"
|
|
|
190
190
|
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
|
|
191
191
|
option(GGML_ZDNN "ggml: use zDNN" OFF)
|
|
192
192
|
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
|
193
|
-
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
|
194
193
|
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
|
195
194
|
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
|
|
196
195
|
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
|