@fugood/llama.node 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +16 -15
- package/src/llama.cpp/CMakeLists.txt +7 -0
- package/src/llama.cpp/common/arg.cpp +141 -21
- package/src/llama.cpp/common/common.h +23 -8
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +7 -6
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -1
- package/src/llama.cpp/src/llama-arch.cpp +43 -10
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +17 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +8 -8
- package/src/llama.cpp/src/llama-graph.cpp +3 -3
- package/src/llama.cpp/src/llama-hparams.h +13 -3
- package/src/llama.cpp/src/llama-model.cpp +328 -44
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-quant.cpp +3 -1
- package/src/llama.cpp/src/llama-vocab.cpp +13 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.0",
+  "version": "1.2.1",
  "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.0",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.0",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.0",
-    "@fugood/node-llama-linux-arm64": "1.2.0",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.0",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.0",
-    "@fugood/node-llama-win32-x64": "1.2.0",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.0",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.0",
-    "@fugood/node-llama-win32-arm64": "1.2.0",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.0",
-    "@fugood/node-llama-darwin-x64": "1.2.0",
-    "@fugood/node-llama-darwin-arm64": "1.2.0"
+    "@fugood/node-llama-linux-x64": "1.2.1",
+    "@fugood/node-llama-linux-x64-vulkan": "1.2.1",
+    "@fugood/node-llama-linux-x64-cuda": "1.2.1",
+    "@fugood/node-llama-linux-arm64": "1.2.1",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.2.1",
+    "@fugood/node-llama-linux-arm64-cuda": "1.2.1",
+    "@fugood/node-llama-win32-x64": "1.2.1",
+    "@fugood/node-llama-win32-x64-vulkan": "1.2.1",
+    "@fugood/node-llama-win32-x64-cuda": "1.2.1",
+    "@fugood/node-llama-win32-arm64": "1.2.1",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.2.1",
+    "@fugood/node-llama-darwin-x64": "1.2.1",
+    "@fugood/node-llama-darwin-arm64": "1.2.1"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -118,7 +118,8 @@
       "**/*.test.ts"
     ],
     "testPathIgnorePatterns": [
-      "<rootDir>/src/llama.rn/"
+      "<rootDir>/src/llama.rn/",
+      "<rootDir>/src/llama.cpp/"
     ]
   },
   "prettier": {
package/src/llama.cpp/CMakeLists.txt
CHANGED
@@ -58,6 +58,12 @@ if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()
 
+if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
+    set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
+else()
+    set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
+endif()
+
 #
 # option list
 #
@@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
+option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
 
 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
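For context, the new option appears to gate only the install step for the tools, not whether they are built; a configure-time sketch (hypothetical flag values, standalone build assumed):

cmake -B build -DLLAMA_BUILD_TOOLS=ON -DLLAMA_TOOLS_INSTALL=OFF
cmake --build build
cmake --install build   # tools are built but skipped at install time

On iOS the default flips to OFF via LLAMA_TOOLS_INSTALL_DEFAULT, which avoids installing command-line tools into an app bundle.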
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -745,6 +745,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 
 #endif // LLAMA_USE_CURL
 
+//
+// Docker registry functions
+//
+
+static std::string common_docker_get_token(const std::string & repo) {
+    std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
+
+    common_remote_params params;
+    auto res = common_remote_get_content(url, params);
+
+    if (res.first != 200) {
+        throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
+    }
+
+    std::string response_str(res.second.begin(), res.second.end());
+    nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
+
+    if (!response.contains("token")) {
+        throw std::runtime_error("Docker registry token response missing 'token' field");
+    }
+
+    return response["token"].get<std::string>();
+}
+
+static std::string common_docker_resolve_model(const std::string & docker) {
+    // Parse ai/smollm2:135M-Q4_K_M
+    size_t colon_pos = docker.find(':');
+    std::string repo, tag;
+    if (colon_pos != std::string::npos) {
+        repo = docker.substr(0, colon_pos);
+        tag = docker.substr(colon_pos + 1);
+    } else {
+        repo = docker;
+        tag = "latest";
+    }
+
+    // ai/ is the default
+    size_t slash_pos = docker.find('/');
+    if (slash_pos == std::string::npos) {
+        repo.insert(0, "ai/");
+    }
+
+    LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
+    try {
+        // --- helper: digest validation ---
+        auto validate_oci_digest = [](const std::string & digest) -> std::string {
+            // Expected: algo:hex ; start with sha256 (64 hex chars)
+            // You can extend this map if supporting other algorithms in future.
+            static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
+            std::smatch m;
+            if (!std::regex_match(digest, m, re)) {
+                throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
+            }
+            // normalize hex to lowercase
+            std::string normalized = digest;
+            std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
+                return std::tolower(c);
+            });
+            return normalized;
+        };
+
+        std::string token = common_docker_get_token(repo); // Get authentication token
+
+        // Get manifest
+        const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
+        std::string manifest_url = url_prefix + "/manifests/" + tag;
+        common_remote_params manifest_params;
+        manifest_params.headers.push_back("Authorization: Bearer " + token);
+        manifest_params.headers.push_back(
+            "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+        auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
+        if (manifest_res.first != 200) {
+            throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
+        }
+
+        std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
+        nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
+        std::string gguf_digest; // Find the GGUF layer
+        if (manifest.contains("layers")) {
+            for (const auto & layer : manifest["layers"]) {
+                if (layer.contains("mediaType")) {
+                    std::string media_type = layer["mediaType"].get<std::string>();
+                    if (media_type == "application/vnd.docker.ai.gguf.v3" ||
+                        media_type.find("gguf") != std::string::npos) {
+                        gguf_digest = layer["digest"].get<std::string>();
+                        break;
+                    }
+                }
+            }
+        }
+
+        if (gguf_digest.empty()) {
+            throw std::runtime_error("No GGUF layer found in Docker manifest");
+        }
+
+        // Validate & normalize digest
+        gguf_digest = validate_oci_digest(gguf_digest);
+        LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
+
+        // Prepare local filename
+        std::string model_filename = repo;
+        std::replace(model_filename.begin(), model_filename.end(), '/', '_');
+        model_filename += "_" + tag + ".gguf";
+        std::string local_path = fs_get_cache_file(model_filename);
+
+        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
+        if (!common_download_file_single(blob_url, local_path, token, false)) {
+            throw std::runtime_error("Failed to download Docker Model");
+        }
+
+        LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
+        return local_path;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
+        throw;
+    }
+}
+
 //
 // utils
 //
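As a reference for the flow above: common_docker_resolve_model splits the model reference into repo and tag before any registry traffic. A minimal standalone sketch of just that parsing step (the helper name parse_docker_ref is hypothetical, not part of this patch):

#include <string>
#include <utility>

// Hypothetical helper mirroring the parsing in common_docker_resolve_model:
//   "smollm2"                -> { "ai/smollm2", "latest" }
//   "ai/smollm2:135M-Q4_K_M" -> { "ai/smollm2", "135M-Q4_K_M" }
static std::pair<std::string, std::string> parse_docker_ref(const std::string & docker) {
    const size_t colon_pos = docker.find(':');
    std::string repo = colon_pos == std::string::npos ? docker : docker.substr(0, colon_pos);
    std::string tag  = colon_pos == std::string::npos ? "latest" : docker.substr(colon_pos + 1);
    if (docker.find('/') == std::string::npos) {
        repo.insert(0, "ai/"); // "ai/" is the default namespace
    }
    return { repo, tag };
}

The real function then exchanges an anonymous pull token, fetches the OCI manifest, picks the first layer whose mediaType mentions "gguf", validates the sha256 digest, and streams the blob into the local cache directory.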
@@ -795,7 +913,9 @@ static handle_model_result common_params_handle_model(
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
-        if (!model.hf_repo.empty()) {
+        if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
+            model.path = common_docker_resolve_model(model.docker_repo);
+        } else if (!model.hf_repo.empty()) {
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
@@ -1184,7 +1304,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
     } else {
         for (const auto & device : dev_names) {
             auto * dev = ggml_backend_dev_by_name(device.c_str());
-            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+            if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                 throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
             }
             devices.push_back(dev);
@@ -1194,7 +1314,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
     return devices;
 }
 
-static void add_rpc_devices(std::string servers) {
+static void add_rpc_devices(const std::string & servers) {
     auto rpc_servers = string_split<std::string>(servers, ',');
     if (rpc_servers.empty()) {
         throw std::invalid_argument("no RPC servers specified");
@@ -1584,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -2396,24 +2516,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--list-devices"},
         "print list of available devices and exit",
         [](common_params &) {
-            std::vector<ggml_backend_dev_t> rpc_devices;
-            std::vector<ggml_backend_dev_t> all_devices;
+            std::vector<ggml_backend_dev_t> devices;
             for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                 auto * dev = ggml_backend_dev_get(i);
-                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
-                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
-                        rpc_devices.push_back(dev);
-                    } else {
-                        all_devices.push_back(dev);
-                    }
+                if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                    devices.push_back(dev);
                 }
             }
-            // insert RPC devices in front
-            all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
             printf("Available devices:\n");
-            for (size_t i = 0; i < all_devices.size(); ++i) {
-                auto * dev = all_devices[i];
+            for (auto * dev : devices) {
                 size_t free, total;
                 ggml_backend_dev_memory(dev, &free, &total);
                 printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
@@ -2437,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
@@ -2450,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             for (int i = 0; i < value; ++i) {
                 // keep strings alive and avoid leaking memory by storing them in a static vector
                 static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                buft_overrides.push_back(llm_ffn_exps_block_regex(i));
                 params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
@@ -2459,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe-draft", "-cmoed"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
-            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+            params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
@@ -2471,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             for (int i = 0; i < value; ++i) {
                 static std::list<std::string> buft_overrides_draft;
-                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
                 params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
@@ -2636,6 +2747,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.url = value;
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
+    add_opt(common_arg(
+        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
+        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
+        "example: gemma3\n"
+        "(default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.model.docker_repo = value;
+        }
+    ).set_env("LLAMA_ARG_DOCKER_REPO"));
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
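A usage sketch for the new flag (hypothetical invocations; which binaries accept it depends on how the common argument set is wired into each tool):

llama-server -dr gemma3                          # resolves to ai/gemma3:latest
llama-cli --docker-repo ai/smollm2:135M-Q4_K_M -p "Hello"
LLAMA_ARG_DOCKER_REPO=gemma3 llama-server        # same thing via the environment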
package/src/llama.cpp/common/common.h
CHANGED
@@ -193,10 +193,11 @@ struct common_params_sampling {
 };
 
 struct common_params_model {
-    std::string path = ""; // model local path // NOLINT
-    std::string url = ""; // model url to download // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
+    std::string path = ""; // model local path // NOLINT
+    std::string url = ""; // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+    std::string docker_repo = ""; // Docker repo // NOLINT
 };
 
 struct common_params_speculative {
@@ -288,9 +289,9 @@ struct common_params {
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
-    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
-    float yarn_beta_fast = 32.0f; // YaRN low correction dim
-    float yarn_beta_slow = 1.0f; // YaRN high correction dim
+    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast = -1.0f; // YaRN low correction dim
+    float yarn_beta_slow = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
 
     // offload params
@@ -453,7 +454,7 @@ struct common_params {
 
     std::string slot_save_path;
 
-    float slot_prompt_similarity = 0.5f;
+    float slot_prompt_similarity = 0.1f;
 
     // batched-bench params
     bool is_pp_shared = false;
@@ -734,6 +735,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
 
+//
+// MoE utils
+//
+
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+
+static std::string llm_ffn_exps_block_regex(int idx) {
+    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
+}
+
+static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
+    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
+}
+
 //
 // training utils
 //
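To make the helpers concrete, a small self-contained check of the patterns they produce (illustrative only; it re-creates the format string locally rather than linking against common.h):

#include <cstdio>

int main() {
    const char * ffn_exps = "\\.ffn_(up|down|gate)_exps"; // mirrors LLM_FFN_EXPS_REGEX
    // --cpu-moe pins every tensor matching the bare pattern to the CPU buffer type:
    printf("%s\n", ffn_exps);             // \.ffn_(up|down|gate)_exps
    // --n-cpu-moe N uses the per-block form for blocks 0..N-1, e.g. block 3:
    printf("blk\\.%d%s\n", 3, ffn_exps);  // blk\.3\.ffn_(up|down|gate)_exps
    return 0;
}

Centralizing the regex in one constant is what lets the four --cpu-moe* options above drop their duplicated string literals.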
package/src/llama.cpp/common/json-schema-to-grammar.cpp
CHANGED
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };
 
 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES;
-    if (RESERVED_NAMES.empty()) {
-        RESERVED_NAMES.insert("root");
-        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
-    }
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
 
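The replacement builds the set once via an immediately-invoked lambda, which gets C++11's thread-safe static initialization for free instead of mutating a static on every first call. The same idiom in isolation (a generic sketch, not from the patch):

#include <string>
#include <unordered_set>

static bool is_keyword(const std::string & name) {
    // initialized exactly once, thread-safe since C++11; no mutable static state
    static const std::unordered_set<std::string> KEYWORDS = [] {
        std::unordered_set<std::string> s;
        for (const char * k : { "root", "boolean", "number", "string" }) {
            s.insert(k);
        }
        return s;
    }();
    return KEYWORDS.count(name) > 0;
}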
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED
@@ -190,7 +190,6 @@ option(GGML_WEBGPU "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
 option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
-option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
 option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
 option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
package/src/llama.cpp/ggml/include/ggml.h
CHANGED
@@ -284,19 +284,19 @@ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexc
 //  GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
 //
 #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer)->array[0]; \
+    const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
     GGML_UNUSED(prefix##0);
 #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer)->array[1]; \
+    const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
     GGML_UNUSED(prefix##1);
 #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer)->array[2]; \
+    const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
     GGML_UNUSED(prefix##2);
 #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer)->array[3]; \
+    const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
     GGML_UNUSED(prefix##3);
 
 #define GGML_TENSOR_UNARY_OP_LOCALS \
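The effect is a per-field null guard: expanding the macro against a possibly-null tensor pointer no longer dereferences it. Roughly, the first local produced by GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) changes as sketched below:

// before: const int64_t ne00 = (src0)->ne[0];               // dereferences src0 unconditionally
// after:  const int64_t ne00 = (src0) ? (src0)->ne[0] : 0;  // a NULL operand yields 0 instead of UB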
package/src/llama.cpp/src/llama-arch.cpp
CHANGED
@@ -96,6 +96,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
     { LLM_ARCH_LLADA, "llada" },
+    { LLM_ARCH_LLADA_MOE, "llada-moe" },
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -139,6 +140,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
+    { LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
     { LLM_KV_SWIN_NORM, "%s.swin_norm" },
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
@@ -169,19 +171,25 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+    { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
+    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 
-    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
-    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
-    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
-    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
-    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
-    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
-    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
-    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
-    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
-    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
+    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
+    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+    { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
+    { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
 
     { LLM_KV_SPLIT_NO, "split.no" },
     { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -398,12 +406,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
         { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
         { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
         { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
         { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
         { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
         { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
         { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
     },
@@ -2136,6 +2148,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_SEED_OSS,
         {
@@ -2416,6 +2448,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
             return true;
         default:
             return false;
package/src/llama.cpp/src/llama-arch.h
CHANGED
@@ -100,6 +100,7 @@ enum llm_arch {
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
+    LLM_ARCH_LLADA_MOE,
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_UNKNOWN,
 };
@@ -143,6 +144,7 @@ enum llm_kv {
    LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_DECODER_BLOCK_COUNT,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+    LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_SWIN_NORM,
     LLM_KV_RESCALE_EVERY_N_LAYERS,
@@ -173,6 +175,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_OUTPUT_SCALE,
+    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
@@ -186,6 +190,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
     LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+    LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
+    LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
 
     LLM_KV_SPLIT_NO,
     LLM_KV_SPLIT_COUNT,
package/src/llama.cpp/src/llama-chat.cpp
CHANGED
@@ -70,6 +70,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
+    { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -204,6 +205,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     } else if (tmpl_contains("<seed:bos>")) {
         return LLM_CHAT_TEMPLATE_SEED_OSS;
+    } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
+        return LLM_CHAT_TEMPLATE_GROK_2;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -763,6 +766,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<seed:bos>assistant\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "System: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "user") {
+                ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << "<|separator|>\n\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
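For illustration, a short conversation rendered through the grok-2 branch above (with add_ass set) would produce something like:

System: You are Grok.<|separator|>

Human: Hi!<|separator|>

Assistant: Hello.<|separator|>

Assistant:

Note that system and user content is trimmed before rendering while assistant content is passed through verbatim, matching the code.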