@fugood/llama.node 1.2.0-rc.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +16 -15
  2. package/src/llama.cpp/CMakeLists.txt +7 -0
  3. package/src/llama.cpp/common/arg.cpp +141 -21
  4. package/src/llama.cpp/common/chat.cpp +139 -0
  5. package/src/llama.cpp/common/chat.h +1 -0
  6. package/src/llama.cpp/common/common.h +23 -8
  7. package/src/llama.cpp/common/json-schema-to-grammar.cpp +28 -7
  8. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  9. package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
  10. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  11. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -6
  12. package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
  13. package/src/llama.cpp/ggml/include/ggml.h +10 -5
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -1
  21. package/src/llama.cpp/src/llama-arch.cpp +44 -10
  22. package/src/llama.cpp/src/llama-arch.h +9 -0
  23. package/src/llama.cpp/src/llama-chat.cpp +17 -0
  24. package/src/llama.cpp/src/llama-chat.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +13 -11
  26. package/src/llama.cpp/src/llama-graph.cpp +6 -5
  27. package/src/llama.cpp/src/llama-hparams.h +14 -3
  28. package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
  29. package/src/llama.cpp/src/llama-kv-cache.h +8 -0
  30. package/src/llama.cpp/src/llama-model.cpp +386 -140
  31. package/src/llama.cpp/src/llama-model.h +3 -0
  32. package/src/llama.cpp/src/llama-quant.cpp +6 -4
  33. package/src/llama.cpp/src/llama-vocab.cpp +13 -1
  34. package/src/llama.cpp/src/llama-vocab.h +1 -0
  35. package/src/llama.cpp/src/llama.cpp +53 -10
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.2.0-rc.0",
+ "version": "1.2.1",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.2.0-rc.0",
- "@fugood/node-llama-linux-x64-vulkan": "1.2.0-rc.0",
- "@fugood/node-llama-linux-x64-cuda": "1.2.0-rc.0",
- "@fugood/node-llama-linux-arm64": "1.2.0-rc.0",
- "@fugood/node-llama-linux-arm64-vulkan": "1.2.0-rc.0",
- "@fugood/node-llama-linux-arm64-cuda": "1.2.0-rc.0",
- "@fugood/node-llama-win32-x64": "1.2.0-rc.0",
- "@fugood/node-llama-win32-x64-vulkan": "1.2.0-rc.0",
- "@fugood/node-llama-win32-x64-cuda": "1.2.0-rc.0",
- "@fugood/node-llama-win32-arm64": "1.2.0-rc.0",
- "@fugood/node-llama-win32-arm64-vulkan": "1.2.0-rc.0",
- "@fugood/node-llama-darwin-x64": "1.2.0-rc.0",
- "@fugood/node-llama-darwin-arm64": "1.2.0-rc.0"
+ "@fugood/node-llama-linux-x64": "1.2.1",
+ "@fugood/node-llama-linux-x64-vulkan": "1.2.1",
+ "@fugood/node-llama-linux-x64-cuda": "1.2.1",
+ "@fugood/node-llama-linux-arm64": "1.2.1",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.2.1",
+ "@fugood/node-llama-linux-arm64-cuda": "1.2.1",
+ "@fugood/node-llama-win32-x64": "1.2.1",
+ "@fugood/node-llama-win32-x64-vulkan": "1.2.1",
+ "@fugood/node-llama-win32-x64-cuda": "1.2.1",
+ "@fugood/node-llama-win32-arm64": "1.2.1",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.2.1",
+ "@fugood/node-llama-darwin-x64": "1.2.1",
+ "@fugood/node-llama-darwin-arm64": "1.2.1"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -118,7 +118,8 @@
  "**/*.test.ts"
  ],
  "testPathIgnorePatterns": [
- "<rootDir>/src/llama.rn/"
+ "<rootDir>/src/llama.rn/",
+ "<rootDir>/src/llama.cpp/"
  ]
  },
  "prettier": {
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -58,6 +58,12 @@ if (MSVC)
  add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
  endif()

+ if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
+ set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
+ else()
+ set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
+ endif()
+
  #
  # option list
  #
@@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
  option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
  option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
  option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
+ option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})

  # 3rd party libs
  option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -745,6 +745,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &

  #endif // LLAMA_USE_CURL

+ //
+ // Docker registry functions
+ //
+
+ static std::string common_docker_get_token(const std::string & repo) {
+ std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
+
+ common_remote_params params;
+ auto res = common_remote_get_content(url, params);
+
+ if (res.first != 200) {
+ throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
+ }
+
+ std::string response_str(res.second.begin(), res.second.end());
+ nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
+
+ if (!response.contains("token")) {
+ throw std::runtime_error("Docker registry token response missing 'token' field");
+ }
+
+ return response["token"].get<std::string>();
+ }
+
+ static std::string common_docker_resolve_model(const std::string & docker) {
+ // Parse ai/smollm2:135M-Q4_K_M
+ size_t colon_pos = docker.find(':');
+ std::string repo, tag;
+ if (colon_pos != std::string::npos) {
+ repo = docker.substr(0, colon_pos);
+ tag = docker.substr(colon_pos + 1);
+ } else {
+ repo = docker;
+ tag = "latest";
+ }
+
+ // ai/ is the default
+ size_t slash_pos = docker.find('/');
+ if (slash_pos == std::string::npos) {
+ repo.insert(0, "ai/");
+ }
+
+ LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
+ try {
+ // --- helper: digest validation ---
+ auto validate_oci_digest = [](const std::string & digest) -> std::string {
+ // Expected: algo:hex ; start with sha256 (64 hex chars)
+ // You can extend this map if supporting other algorithms in future.
+ static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
+ std::smatch m;
+ if (!std::regex_match(digest, m, re)) {
+ throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
+ }
+ // normalize hex to lowercase
+ std::string normalized = digest;
+ std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
+ return std::tolower(c);
+ });
+ return normalized;
+ };
+
+ std::string token = common_docker_get_token(repo); // Get authentication token
+
+ // Get manifest
+ const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
+ std::string manifest_url = url_prefix + "/manifests/" + tag;
+ common_remote_params manifest_params;
+ manifest_params.headers.push_back("Authorization: Bearer " + token);
+ manifest_params.headers.push_back(
+ "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+ auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
+ if (manifest_res.first != 200) {
+ throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
+ }
+
+ std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
+ nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
+ std::string gguf_digest; // Find the GGUF layer
+ if (manifest.contains("layers")) {
+ for (const auto & layer : manifest["layers"]) {
+ if (layer.contains("mediaType")) {
+ std::string media_type = layer["mediaType"].get<std::string>();
+ if (media_type == "application/vnd.docker.ai.gguf.v3" ||
+ media_type.find("gguf") != std::string::npos) {
+ gguf_digest = layer["digest"].get<std::string>();
+ break;
+ }
+ }
+ }
+ }
+
+ if (gguf_digest.empty()) {
+ throw std::runtime_error("No GGUF layer found in Docker manifest");
+ }
+
+ // Validate & normalize digest
+ gguf_digest = validate_oci_digest(gguf_digest);
+ LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
+
+ // Prepare local filename
+ std::string model_filename = repo;
+ std::replace(model_filename.begin(), model_filename.end(), '/', '_');
+ model_filename += "_" + tag + ".gguf";
+ std::string local_path = fs_get_cache_file(model_filename);
+
+ const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
+ if (!common_download_file_single(blob_url, local_path, token, false)) {
+ throw std::runtime_error("Failed to download Docker Model");
+ }
+
+ LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
+ return local_path;
+ } catch (const std::exception & e) {
+ LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
+ throw;
+ }
+ }
+
  //
  // utils
  //
@@ -795,7 +913,9 @@ static handle_model_result common_params_handle_model(
  handle_model_result result;
  // handle pre-fill default model path and url based on hf_repo and hf_file
  {
- if (!model.hf_repo.empty()) {
+ if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
+ model.path = common_docker_resolve_model(model.docker_repo);
+ } else if (!model.hf_repo.empty()) {
  // short-hand to avoid specifying --hf-file -> default it to --model
  if (model.hf_file.empty()) {
  if (model.path.empty()) {
@@ -1184,7 +1304,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
  } else {
  for (const auto & device : dev_names) {
  auto * dev = ggml_backend_dev_by_name(device.c_str());
- if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+ if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
  throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
  }
  devices.push_back(dev);
@@ -1194,7 +1314,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
  return devices;
  }

- static void add_rpc_devices(std::string servers) {
+ static void add_rpc_devices(const std::string & servers) {
  auto rpc_servers = string_split<std::string>(servers, ',');
  if (rpc_servers.empty()) {
  throw std::invalid_argument("no RPC servers specified");
@@ -1584,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.system_prompt = value;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
  add_opt(common_arg(
  {"--no-perf"},
  string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -2396,24 +2516,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--list-devices"},
  "print list of available devices and exit",
  [](common_params &) {
- std::vector<ggml_backend_dev_t> rpc_devices;
- std::vector<ggml_backend_dev_t> all_devices;
+ std::vector<ggml_backend_dev_t> devices;
  for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  auto * dev = ggml_backend_dev_get(i);
- if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
- ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
- if (ggml_backend_reg_name(reg) == std::string("RPC")) {
- rpc_devices.push_back(dev);
- } else {
- all_devices.push_back(dev);
- }
+ if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+ devices.push_back(dev);
  }
  }
- // insert RPC devices in front
- all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
  printf("Available devices:\n");
- for (size_t i = 0; i < all_devices.size(); ++i) {
- auto * dev = all_devices[i];
+ for (auto * dev : devices) {
  size_t free, total;
  ggml_backend_dev_memory(dev, &free, &total);
  printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
@@ -2437,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--cpu-moe", "-cmoe"},
  "keep all Mixture of Experts (MoE) weights in the CPU",
  [](common_params & params) {
- params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+ params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
  }
  ).set_env("LLAMA_ARG_CPU_MOE"));
  add_opt(common_arg(
@@ -2450,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  for (int i = 0; i < value; ++i) {
  // keep strings alive and avoid leaking memory by storing them in a static vector
  static std::list<std::string> buft_overrides;
- buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+ buft_overrides.push_back(llm_ffn_exps_block_regex(i));
  params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
  }
  }
@@ -2459,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--cpu-moe-draft", "-cmoed"},
  "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
  [](common_params & params) {
- params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+ params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
  add_opt(common_arg(
@@ -2471,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  for (int i = 0; i < value; ++i) {
  static std::list<std::string> buft_overrides_draft;
- buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+ buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
  params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
  }
  }
@@ -2636,6 +2747,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.url = value;
  }
  ).set_env("LLAMA_ARG_MODEL_URL"));
+ add_opt(common_arg(
+ { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
+ "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
+ "example: gemma3\n"
+ "(default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.model.docker_repo = value;
+ }
+ ).set_env("LLAMA_ARG_DOCKER_REPO"));
  add_opt(common_arg(
  {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
  "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -618,6 +618,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+ case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
  case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
  case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
  case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
@@ -685,11 +686,13 @@ static void parse_json_tool_calls(
  size_t from = std::string::npos;
  auto first = true;
  while (true) {
+ auto start_pos = builder.pos();
  auto res = function_regex_start_only && first
  ? builder.try_consume_regex(*function_regex_start_only)
  : function_regex
  ? builder.try_find_regex(*function_regex, from)
  : std::nullopt;
+
  if (res) {
  std::string name;
  if (get_function_name) {
@@ -724,6 +727,8 @@ static void parse_json_tool_calls(
  return;
  }
  throw common_chat_msg_partial_exception("incomplete tool call");
+ } else {
+ builder.move_to(start_pos);
  }
  break;
  }
@@ -1374,6 +1379,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
  }
  return data;
  }
+
+ static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // Pass thinking context for DeepSeek V3.1 template
+ json additional_context = {
+ {"thinking", inputs.enable_thinking},
+ };
+
+ auto prompt = apply(tmpl, inputs,
+ /* messages_override= */ inputs.messages,
+ /* tools_override= */ std::nullopt,
+ additional_context);
+ data.prompt = prompt;
+ data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+ if (string_ends_with(data.prompt, "<think>")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ std::vector<std::string> tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ tool_rules.push_back(builder.add_rule(name + "-call",
+ "( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
+ "\" " + builder.add_schema(name + "-args", parameters) + " "
+ "\"<|tool▁call▁end|>\""));
+ });
+ // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+ // so we accept common variants (then it's all constrained)
+ builder.add_rule("root",
+ std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+ "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
+ "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+ "\"<|tool▁calls▁end|>\""
+ " space");
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ // If thinking_forced_open, then we capture the </think> tag in the grammar,
+ // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+ std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+ "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+ });
+ data.preserved_tokens = {
+ "<think>",
+ "</think>",
+ "<|tool▁calls▁begin|>",
+ "<|tool▁call▁begin|>",
+ "<|tool▁sep|>",
+ "<|tool▁call▁end|>",
+ "<|tool▁calls▁end|>",
+ };
+ });
+ }
+ return data;
+ }
+
  static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
  builder.try_parse_reasoning("<think>", "</think>");
  if (!builder.syntax().parse_tool_calls) {
@@ -1395,6 +1465,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
  tool_calls_end);
  }

+ static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+ static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");
+
+ static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
+ static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
+ static const common_regex tool_calls_end("<|tool▁calls▁end|>");
+
+ if (!builder.syntax().parse_tool_calls) {
+ LOG_DBG("%s: not parse_tool_calls\n", __func__);
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+ parse_json_tool_calls(
+ builder,
+ /* block_open= */ tool_calls_begin,
+ /* function_regex_start_only= */ std::nullopt,
+ function_regex,
+ close_regex,
+ tool_calls_end);
+ }
+
+ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+ // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+ // First try to parse using the standard reasoning parsing method
+ LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+ auto start_pos = builder.pos();
+ auto found_end_think = builder.try_find_literal("</think>");
+ builder.move_to(start_pos);
+
+ if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+ LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+ common_chat_parse_deepseek_v3_1_content(builder);
+ } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+ // If reasoning was parsed successfully, the remaining content is regular content
+ LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+ // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
+ common_chat_parse_deepseek_v3_1_content(builder);
+ } else {
+ if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+ LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+ common_chat_parse_deepseek_v3_1_content(builder);
+ return;
+ }
+ // If no reasoning tags found, check if we should treat everything as reasoning
+ if (builder.syntax().thinking_forced_open) {
+ // If thinking is forced open but no tags found, treat everything as reasoning
+ LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+ builder.add_reasoning_content(builder.consume_rest());
+ } else {
+ LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+ // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
+ common_chat_parse_deepseek_v3_1_content(builder);
+ }
+ }
+ }
+
  static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
  auto prompt = apply(tmpl, inputs);
@@ -2351,6 +2481,12 @@ static common_chat_params common_chat_templates_apply_jinja(
  }
  }

+ // DeepSeek V3.1: detect based on specific patterns in the template
+ if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+ params.json_schema.is_null()) {
+ return common_chat_params_init_deepseek_v3_1(tmpl, params);
+ }
+
  // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
  if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
  return common_chat_params_init_deepseek_r1(tmpl, params);
@@ -2523,6 +2659,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
  common_chat_parse_deepseek_r1(builder);
  break;
+ case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+ common_chat_parse_deepseek_v3_1(builder);
+ break;
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
  common_chat_parse_functionary_v3_2(builder);
  break;
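Note on the DeepSeek V3.1 handling added above: when reasoning tags are present, the parser first extracts the "<think>…</think>" span as reasoning and then hands the remainder (which may contain tool calls) to common_chat_parse_deepseek_v3_1_content. The sketch below is illustrative only and not the library API; it shows just the high-level reasoning/content split, with split_reasoning as a made-up name, and ignores partial output and tool-call extraction:

    // Rough sketch: split "<think>reasoning</think>content" into its two parts.
    #include <iostream>
    #include <string>
    #include <utility>

    static std::pair<std::string, std::string> split_reasoning(const std::string & out) {
        const std::string open  = "<think>";
        const std::string close = "</think>";
        const size_t start = out.compare(0, open.size(), open) == 0 ? open.size() : 0;
        const size_t end   = out.find(close, start);
        if (end == std::string::npos) {
            return { "", out }; // no closing tag: treat everything as content
        }
        return { out.substr(start, end - start), out.substr(end + close.size()) };
    }

    int main() {
        const auto [reasoning, content] = split_reasoning("<think>check the docs</think>Here is the answer.");
        std::cout << "reasoning: " << reasoning << "\n";
        std::cout << "content:   " << content << "\n";
    }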
package/src/llama.cpp/common/chat.h CHANGED
@@ -118,6 +118,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
  COMMON_CHAT_FORMAT_HERMES_2_PRO,
  COMMON_CHAT_FORMAT_COMMAND_R7B,
  COMMON_CHAT_FORMAT_GRANITE,
package/src/llama.cpp/common/common.h CHANGED
@@ -193,10 +193,11 @@ struct common_params_sampling {
  };

  struct common_params_model {
- std::string path = ""; // model local path // NOLINT
- std::string url = ""; // model url to download // NOLINT
- std::string hf_repo = ""; // HF repo // NOLINT
- std::string hf_file = ""; // HF file // NOLINT
+ std::string path = ""; // model local path // NOLINT
+ std::string url = ""; // model url to download // NOLINT
+ std::string hf_repo = ""; // HF repo // NOLINT
+ std::string hf_file = ""; // HF file // NOLINT
+ std::string docker_repo = ""; // Docker repo // NOLINT
  };

  struct common_params_speculative {
@@ -288,9 +289,9 @@ struct common_params {
  float rope_freq_base = 0.0f; // RoPE base frequency
  float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
  float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
- float yarn_beta_fast = 32.0f; // YaRN low correction dim
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
+ float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
+ float yarn_beta_fast = -1.0f; // YaRN low correction dim
+ float yarn_beta_slow = -1.0f; // YaRN high correction dim
  int32_t yarn_orig_ctx = 0; // YaRN original context length

  // offload params
@@ -453,7 +454,7 @@ struct common_params {

  std::string slot_save_path;

- float slot_prompt_similarity = 0.5f;
+ float slot_prompt_similarity = 0.1f;

  // batched-bench params
  bool is_pp_shared = false;
@@ -734,6 +735,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

  }

+ //
+ // MoE utils
+ //
+
+ const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+
+ static std::string llm_ffn_exps_block_regex(int idx) {
+ return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
+ }
+
+ static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
+ return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
+ }
+
  //
  // training utils
  //
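Note on the new MoE utils above: LLM_FFN_EXPS_REGEX and its helpers replace the regex literals previously duplicated in arg.cpp for --cpu-moe and --n-cpu-moe. A small standalone sketch of what llm_ffn_exps_block_regex expands to for a given block index (snprintf stands in for the internal string_format helper):

    // Sketch only: reproduce the per-block expert-tensor pattern.
    #include <cstdio>
    #include <string>

    static const char * LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";

    static std::string llm_ffn_exps_block_regex(int idx) {
        char buf[128];
        std::snprintf(buf, sizeof(buf), "blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
        return buf;
    }

    int main() {
        // Prints: blk\.3\.ffn_(up|down|gate)_exps
        // i.e. the expert FFN tensors of block 3, which the override keeps on the CPU buffer type.
        std::printf("%s\n", llm_ffn_exps_block_regex(3).c_str());
    }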
package/src/llama.cpp/common/json-schema-to-grammar.cpp CHANGED
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
  };

  static bool is_reserved_name(const std::string & name) {
- static std::unordered_set<std::string> RESERVED_NAMES;
- if (RESERVED_NAMES.empty()) {
- RESERVED_NAMES.insert("root");
- for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
- for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
- }
+ static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+ std::unordered_set<std::string> s;
+ s.insert("root");
+ for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+ for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+ return s;
+ }();
  return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
  }

@@ -843,9 +844,10 @@ public:
  _build_object_rule(
  properties, required, name,
  schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
- } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+ } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
  std::unordered_set<std::string> required;
  std::vector<std::pair<std::string, json>> properties;
+ std::map<std::string, size_t> enum_values;
  std::string hybrid_name = name;
  std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
  if (comp_schema.contains("$ref")) {
@@ -857,6 +859,14 @@ public:
  required.insert(prop.key());
  }
  }
+ } else if (comp_schema.contains("enum")) {
+ for (const auto & v : comp_schema["enum"]) {
+ const auto rule = _generate_constant_rule(v);
+ if (enum_values.find(rule) == enum_values.end()) {
+ enum_values[rule] = 0;
+ }
+ enum_values[rule] += 1;
+ }
  } else {
  // todo warning
  }
@@ -870,6 +880,17 @@ public:
  add_component(t, true);
  }
  }
+ if (!enum_values.empty()) {
+ std::vector<std::string> enum_intersection;
+ for (const auto & p : enum_values) {
+ if (p.second == schema["allOf"].size()) {
+ enum_intersection.push_back(p.first);
+ }
+ }
+ if (!enum_intersection.empty()) {
+ return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+ }
+ }
  return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
  } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
  json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
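Note on the allOf change above: enum values are now counted per allOf component, and only values that appear in every component (the intersection) are emitted as an alternation rule. A minimal standalone sketch of that counting step, with the hypothetical input enums hard-coded and the _generate_constant_rule / _add_rule plumbing omitted:

    // Sketch only: keep enum values present in all allOf components.
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
        // Two hypothetical allOf components, already converted to constant rules.
        const std::vector<std::vector<std::string>> all_of_enums = {
            { "\"red\"", "\"green\"", "\"blue\"" },
            { "\"green\"", "\"blue\"" },
        };

        std::map<std::string, size_t> enum_values;
        for (const auto & component : all_of_enums) {
            for (const auto & rule : component) {
                enum_values[rule] += 1; // operator[] value-initializes missing keys to 0
            }
        }

        std::vector<std::string> enum_intersection;
        for (const auto & p : enum_values) {
            if (p.second == all_of_enums.size()) { // present in every component
                enum_intersection.push_back(p.first);
            }
        }

        for (const auto & rule : enum_intersection) {
            std::cout << rule << "\n"; // prints "blue" and "green"
        }
    }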
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -190,7 +190,6 @@ option(GGML_WEBGPU "ggml: use WebGPU"
  option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
  option(GGML_ZDNN "ggml: use zDNN" OFF)
  option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
- option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
  option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
  option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
  option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})