@fugood/llama.node 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.0",
+  "version": "1.2.1",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.0",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.0",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.0",
-    "@fugood/node-llama-linux-arm64": "1.2.0",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.0",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.0",
-    "@fugood/node-llama-win32-x64": "1.2.0",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.0",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.0",
-    "@fugood/node-llama-win32-arm64": "1.2.0",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.0",
-    "@fugood/node-llama-darwin-x64": "1.2.0",
-    "@fugood/node-llama-darwin-arm64": "1.2.0"
+    "@fugood/node-llama-linux-x64": "1.2.1",
+    "@fugood/node-llama-linux-x64-vulkan": "1.2.1",
+    "@fugood/node-llama-linux-x64-cuda": "1.2.1",
+    "@fugood/node-llama-linux-arm64": "1.2.1",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.2.1",
+    "@fugood/node-llama-linux-arm64-cuda": "1.2.1",
+    "@fugood/node-llama-win32-x64": "1.2.1",
+    "@fugood/node-llama-win32-x64-vulkan": "1.2.1",
+    "@fugood/node-llama-win32-x64-cuda": "1.2.1",
+    "@fugood/node-llama-win32-arm64": "1.2.1",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.2.1",
+    "@fugood/node-llama-darwin-x64": "1.2.1",
+    "@fugood/node-llama-darwin-arm64": "1.2.1"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -118,7 +118,8 @@
       "**/*.test.ts"
     ],
     "testPathIgnorePatterns": [
-      "<rootDir>/src/llama.rn/"
+      "<rootDir>/src/llama.rn/",
+      "<rootDir>/src/llama.cpp/"
     ]
   },
   "prettier": {
@@ -58,6 +58,12 @@ if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()
 
+if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
+    set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
+else()
+    set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
+endif()
+
 #
 # option list
 #
@@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
+option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
 
 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
@@ -745,6 +745,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 
 #endif // LLAMA_USE_CURL
 
+//
+// Docker registry functions
+//
+
+static std::string common_docker_get_token(const std::string & repo) {
+    std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
+
+    common_remote_params params;
+    auto res = common_remote_get_content(url, params);
+
+    if (res.first != 200) {
+        throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
+    }
+
+    std::string response_str(res.second.begin(), res.second.end());
+    nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
+
+    if (!response.contains("token")) {
+        throw std::runtime_error("Docker registry token response missing 'token' field");
+    }
+
+    return response["token"].get<std::string>();
+}
+
+static std::string common_docker_resolve_model(const std::string & docker) {
+    // Parse ai/smollm2:135M-Q4_K_M
+    size_t colon_pos = docker.find(':');
+    std::string repo, tag;
+    if (colon_pos != std::string::npos) {
+        repo = docker.substr(0, colon_pos);
+        tag = docker.substr(colon_pos + 1);
+    } else {
+        repo = docker;
+        tag = "latest";
+    }
+
+    // ai/ is the default
+    size_t slash_pos = docker.find('/');
+    if (slash_pos == std::string::npos) {
+        repo.insert(0, "ai/");
+    }
+
+    LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
+    try {
+        // --- helper: digest validation ---
+        auto validate_oci_digest = [](const std::string & digest) -> std::string {
+            // Expected: algo:hex ; start with sha256 (64 hex chars)
+            // You can extend this map if supporting other algorithms in future.
+            static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
+            std::smatch m;
+            if (!std::regex_match(digest, m, re)) {
+                throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
+            }
+            // normalize hex to lowercase
+            std::string normalized = digest;
+            std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
+                return std::tolower(c);
+            });
+            return normalized;
+        };
+
+        std::string token = common_docker_get_token(repo); // Get authentication token
+
+        // Get manifest
+        const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
+        std::string manifest_url = url_prefix + "/manifests/" + tag;
+        common_remote_params manifest_params;
+        manifest_params.headers.push_back("Authorization: Bearer " + token);
+        manifest_params.headers.push_back(
+            "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+        auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
+        if (manifest_res.first != 200) {
+            throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
+        }
+
+        std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
+        nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
+        std::string gguf_digest; // Find the GGUF layer
+        if (manifest.contains("layers")) {
+            for (const auto & layer : manifest["layers"]) {
+                if (layer.contains("mediaType")) {
+                    std::string media_type = layer["mediaType"].get<std::string>();
+                    if (media_type == "application/vnd.docker.ai.gguf.v3" ||
+                        media_type.find("gguf") != std::string::npos) {
+                        gguf_digest = layer["digest"].get<std::string>();
+                        break;
+                    }
+                }
+            }
+        }
+
+        if (gguf_digest.empty()) {
+            throw std::runtime_error("No GGUF layer found in Docker manifest");
+        }
+
+        // Validate & normalize digest
+        gguf_digest = validate_oci_digest(gguf_digest);
+        LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
+
+        // Prepare local filename
+        std::string model_filename = repo;
+        std::replace(model_filename.begin(), model_filename.end(), '/', '_');
+        model_filename += "_" + tag + ".gguf";
+        std::string local_path = fs_get_cache_file(model_filename);
+
+        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
+        if (!common_download_file_single(blob_url, local_path, token, false)) {
+            throw std::runtime_error("Failed to download Docker Model");
+        }
+
+        LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
+        return local_path;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
+        throw;
+    }
+}
+
 //
 // utils
 //
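
The resolver accepts short references such as gemma3 (expanded to ai/gemma3:latest) as well as fully qualified ones like ai/smollm2:135M-Q4_K_M. Below is a minimal standalone sketch of just the reference-splitting rules visible in the hunk above; parse_docker_ref and the test values are illustrative and not part of the package.

    #include <cassert>
    #include <string>
    #include <utility>

    // Split "[<repo>/]<model>[:tag]" the same way common_docker_resolve_model does:
    // a missing tag defaults to "latest", a missing namespace defaults to "ai/".
    static std::pair<std::string, std::string> parse_docker_ref(const std::string & ref) {
        size_t colon_pos = ref.find(':');
        std::string repo = (colon_pos == std::string::npos) ? ref : ref.substr(0, colon_pos);
        std::string tag  = (colon_pos == std::string::npos) ? "latest" : ref.substr(colon_pos + 1);
        if (ref.find('/') == std::string::npos) {
            repo.insert(0, "ai/");
        }
        return { repo, tag };
    }

    int main() {
        assert(parse_docker_ref("gemma3") == std::make_pair(std::string("ai/gemma3"), std::string("latest")));
        assert(parse_docker_ref("ai/smollm2:135M-Q4_K_M") == std::make_pair(std::string("ai/smollm2"), std::string("135M-Q4_K_M")));
        return 0;
    }

Following the filename construction in the hunk, such a reference would be cached locally as, for example, ai_gemma3_latest.gguf.
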
@@ -795,7 +913,9 @@ static handle_model_result common_params_handle_model(
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
-        if (!model.hf_repo.empty()) {
+        if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
+            model.path = common_docker_resolve_model(model.docker_repo);
+        } else if (!model.hf_repo.empty()) {
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
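
The ordering above means a Docker reference, when set, is resolved before any Hugging Face repo is considered. A hedged sketch of that selection logic follows; pick_source, model_source, and the sample values are illustrative, and only the docker_repo-before-hf_repo ordering is taken from the hunk.

    #include <string>

    // Field names follow the common_params_model struct later in this diff; the rest is illustrative.
    struct params_model_view {
        std::string path, url, hf_repo, hf_file, docker_repo;
    };

    enum class model_source { docker, huggingface, other };

    static model_source pick_source(const params_model_view & model) {
        if (!model.docker_repo.empty()) return model_source::docker;      // resolved via common_docker_resolve_model
        if (!model.hf_repo.empty())     return model_source::huggingface; // existing -hf handling
        return model_source::other;                                       // url / local path handling (not shown in the hunk)
    }

    int main() {
        params_model_view m;
        m.docker_repo = "gemma3";
        m.hf_repo     = "example/model-GGUF"; // placeholder value
        return pick_source(m) == model_source::docker ? 0 : 1; // docker_repo takes precedence
    }
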
@@ -1184,7 +1304,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
     } else {
         for (const auto & device : dev_names) {
             auto * dev = ggml_backend_dev_by_name(device.c_str());
-            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+            if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                 throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
             }
             devices.push_back(dev);
@@ -1194,7 +1314,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
     return devices;
 }
 
-static void add_rpc_devices(std::string servers) {
+static void add_rpc_devices(const std::string & servers) {
     auto rpc_servers = string_split<std::string>(servers, ',');
     if (rpc_servers.empty()) {
         throw std::invalid_argument("no RPC servers specified");
@@ -1584,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -2396,24 +2516,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--list-devices"},
         "print list of available devices and exit",
        [](common_params &) {
-            std::vector<ggml_backend_dev_t> rpc_devices;
-            std::vector<ggml_backend_dev_t> all_devices;
+            std::vector<ggml_backend_dev_t> devices;
             for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                 auto * dev = ggml_backend_dev_get(i);
-                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
-                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
-                        rpc_devices.push_back(dev);
-                    } else {
-                        all_devices.push_back(dev);
-                    }
+                if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                    devices.push_back(dev);
                 }
             }
-            // insert RPC devices in front
-            all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
             printf("Available devices:\n");
-            for (size_t i = 0; i < all_devices.size(); ++i) {
-                auto * dev = all_devices[i];
+            for (auto * dev : devices) {
                 size_t free, total;
                 ggml_backend_dev_memory(dev, &free, &total);
                 printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
@@ -2437,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
@@ -2450,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             for (int i = 0; i < value; ++i) {
                 // keep strings alive and avoid leaking memory by storing them in a static vector
                 static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                buft_overrides.push_back(llm_ffn_exps_block_regex(i));
                 params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
@@ -2459,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--cpu-moe-draft", "-cmoed"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
-            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+            params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
@@ -2471,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             for (int i = 0; i < value; ++i) {
                 static std::list<std::string> buft_overrides_draft;
-                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
                 params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
@@ -2636,6 +2747,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.url = value;
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
+    add_opt(common_arg(
+        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
+        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
+        "example: gemma3\n"
+        "(default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.model.docker_repo = value;
+        }
+    ).set_env("LLAMA_ARG_DOCKER_REPO"));
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@@ -193,10 +193,11 @@ struct common_params_sampling {
 };
 
 struct common_params_model {
-    std::string path    = ""; // model local path      // NOLINT
-    std::string url     = ""; // model url to download // NOLINT
-    std::string hf_repo = ""; // HF repo               // NOLINT
-    std::string hf_file = ""; // HF file               // NOLINT
+    std::string path        = ""; // model local path      // NOLINT
+    std::string url         = ""; // model url to download // NOLINT
+    std::string hf_repo     = ""; // HF repo               // NOLINT
+    std::string hf_file     = ""; // HF file               // NOLINT
+    std::string docker_repo = ""; // Docker repo           // NOLINT
 };
 
 struct common_params_speculative {
@@ -288,9 +289,9 @@ struct common_params {
     float rope_freq_base  = 0.0f;  // RoPE base frequency
     float rope_freq_scale = 0.0f;  // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
-    float yarn_attn_factor = 1.0f;  // YaRN magnitude scaling factor
-    float yarn_beta_fast = 32.0f;   // YaRN low correction dim
-    float yarn_beta_slow = 1.0f;    // YaRN high correction dim
+    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast = -1.0f;   // YaRN low correction dim
+    float yarn_beta_slow = -1.0f;   // YaRN high correction dim
     int32_t yarn_orig_ctx = 0;      // YaRN original context length
 
     // offload params
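
The YaRN defaults switch from concrete values (1.0f, 32.0f, 1.0f) to -1.0f, which reads like an "unset" sentinel so the effective values can come from elsewhere, for instance the new rope.scaling.yarn_* GGUF keys added later in this diff. That interpretation is an assumption; the sketch below only illustrates the sentinel-and-fallback pattern itself, with illustrative names.

    #include <cstdio>

    // Hedged sketch: a negative value means "not set by the user", so fall back
    // to a default supplied elsewhere (e.g. model metadata). Illustrative only.
    static float resolve_yarn_param(float cli_value, float fallback) {
        return cli_value < 0.0f ? fallback : cli_value;
    }

    int main() {
        const float yarn_beta_fast_cli  = -1.0f; // new default in common_params
        const float yarn_beta_fast_gguf = 32.0f; // value a model file might provide
        std::printf("effective yarn_beta_fast = %.1f\n",
                    resolve_yarn_param(yarn_beta_fast_cli, yarn_beta_fast_gguf));
        return 0;
    }
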
@@ -453,7 +454,7 @@ struct common_params {
 
     std::string slot_save_path;
 
-    float slot_prompt_similarity = 0.5f;
+    float slot_prompt_similarity = 0.1f;
 
     // batched-bench params
     bool is_pp_shared = false;
@@ -734,6 +735,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
734
735
 
735
736
  }
736
737
 
738
+ //
739
+ // MoE utils
740
+ //
741
+
742
+ const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
743
+
744
+ static std::string llm_ffn_exps_block_regex(int idx) {
745
+ return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
746
+ }
747
+
748
+ static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
749
+ return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
750
+ }
751
+
737
752
  //
738
753
  // training utils
739
754
  //
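
These helpers just build regex strings over tensor names, replacing the literal patterns previously repeated in the --cpu-moe options. A self-contained sketch of what such a per-block pattern expands to and matches; block_regex here is a stand-in that skips string_format and is not part of the patch.

    #include <cstdio>
    #include <regex>
    #include <string>

    // Stand-in for llm_ffn_exps_block_regex: build "blk\.<idx>\.ffn_(up|down|gate)_exps".
    static const char * const FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";

    static std::string block_regex(int idx) {
        return "blk\\." + std::to_string(idx) + FFN_EXPS_REGEX;
    }

    int main() {
        const std::regex re(block_regex(3)); // pattern: blk\.3\.ffn_(up|down|gate)_exps
        const std::string hit  = "blk.3.ffn_up_exps";
        const std::string miss = "blk.4.ffn_gate_exps";
        std::printf("%s -> %s\n", hit.c_str(),  std::regex_search(hit,  re) ? "match" : "no match");
        std::printf("%s -> %s\n", miss.c_str(), std::regex_search(miss, re) ? "match" : "no match");
        return 0;
    }
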
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };
 
 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES;
-    if (RESERVED_NAMES.empty()) {
-        RESERVED_NAMES.insert("root");
-        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
-    }
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
 
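
The rewrite swaps a mutate-on-first-call static set for a static const set built by an immediately invoked lambda, which C++11 magic statics make thread-safe and strictly one-time. A small sketch of the same pattern with illustrative names (is_keyword, KEYWORDS):

    #include <cstdio>
    #include <initializer_list>
    #include <string>
    #include <unordered_set>

    // Same initialize-once shape as above: the lambda runs exactly once, and the
    // resulting set is const thereafter.
    static bool is_keyword(const std::string & name) {
        static const std::unordered_set<std::string> KEYWORDS = [] {
            std::unordered_set<std::string> s;
            for (const char * k : { "root", "string", "number", "boolean" }) {
                s.insert(k);
            }
            return s;
        }();
        return KEYWORDS.count(name) > 0;
    }

    int main() {
        std::printf("root: %d, foo: %d\n", is_keyword("root"), is_keyword("foo"));
        return 0;
    }
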
@@ -190,7 +190,6 @@ option(GGML_WEBGPU "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
 option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
-option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
 option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
 option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@@ -39,6 +39,7 @@ extern "C" {
 // user-code should use only these functions
 //
 
+// TODO: remove in the future
 GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
@@ -7,8 +7,6 @@
 extern "C" {
 #endif
 
-GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
-
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
 
 #ifdef __cplusplus
@@ -284,19 +284,19 @@ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexc
 //     GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
 //
 #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer)->array[0]; \
+    const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
     GGML_UNUSED(prefix##0);
 #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer)->array[1]; \
+    const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
     GGML_UNUSED(prefix##1);
 #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer)->array[2]; \
+    const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
     GGML_UNUSED(prefix##2);
 #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer)->array[3]; \
+    const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
     GGML_UNUSED(prefix##3);
 
 #define GGML_TENSOR_UNARY_OP_LOCALS \
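
The macro change adds a null check so that declaring tensor locals from a missing (null) source yields 0 instead of dereferencing a null pointer. A standalone sketch of the guarded expansion; fake_tensor and TENSOR_LOCAL_0 are stand-ins, not ggml API.

    #include <cstdint>
    #include <cstdio>

    // Mirrors the guarded expansion of GGML_TENSOR_LOCALS_1 after this change.
    struct fake_tensor { int64_t ne[4]; };

    #define TENSOR_LOCAL_0(type, prefix, pointer, array) \
        const type prefix##0 = (pointer) ? (pointer)->array[0] : 0;

    int main() {
        fake_tensor t = { { 7, 1, 1, 1 } };
        fake_tensor * present = &t;
        fake_tensor * missing = nullptr;

        TENSOR_LOCAL_0(int64_t, ne_present, present, ne) // ne_present0 == 7
        TENSOR_LOCAL_0(int64_t, ne_missing, missing, ne) // ne_missing0 == 0, no null dereference

        std::printf("present: %lld, missing: %lld\n", (long long) ne_present0, (long long) ne_missing0);
        return 0;
    }
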
@@ -8599,7 +8599,6 @@ static void ggml_compute_forward_timestep_embedding_f32(
         }
         if (dim % 2 != 0 && ith == 0) {
            embed_data[2 * half] = 0.f;
-            embed_data[dim] = 0.f;
         }
     }
 }
@@ -96,6 +96,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DREAM,        "dream"        },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
     { LLM_ARCH_LLADA,        "llada"        },
+    { LLM_ARCH_LLADA_MOE,    "llada-moe"    },
     { LLM_ARCH_SEED_OSS,     "seed_oss"     },
     { LLM_ARCH_UNKNOWN,      "(unknown)"    },
 };
@@ -139,6 +140,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID,   "%s.decoder_start_token_id"   },
     { LLM_KV_DECODER_BLOCK_COUNT,      "%s.decoder_block_count"      },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING,   "%s.attn_logit_softcapping"   },
+    { LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING,  "%s.final_logit_softcapping"  },
     { LLM_KV_SWIN_NORM,                "%s.swin_norm"                },
     { LLM_KV_RESCALE_EVERY_N_LAYERS,   "%s.rescale_every_n_layers"   },
@@ -169,19 +171,25 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
+    { LLM_KV_ATTENTION_OUTPUT_SCALE,           "%s.attention.output_scale"           },
+    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,     "%s.attention.temperature_length"     },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,         "%s.attention.key_length_mla"         },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,       "%s.attention.value_length_mla"       },
 
-    { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
-    { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections"              },
-    { LLM_KV_ROPE_FREQ_BASE,            "%s.rope.freq_base"                       },
-    { LLM_KV_ROPE_SCALE_LINEAR,         "%s.rope.scale_linear"                    },
-    { LLM_KV_ROPE_SCALING_TYPE,         "%s.rope.scaling.type"                    },
-    { LLM_KV_ROPE_SCALING_FACTOR,       "%s.rope.scaling.factor"                  },
-    { LLM_KV_ROPE_SCALING_ATTN_FACTOR,  "%s.rope.scaling.attn_factor"             },
-    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
-    { LLM_KV_ROPE_SCALING_FINETUNED,    "%s.rope.scaling.finetuned"               },
-    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier"     },
+    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS,       "%s.rope.dimension_sections"              },
+    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
+    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"                    },
+    { LLM_KV_ROPE_SCALING_TYPE,             "%s.rope.scaling.type"                    },
+    { LLM_KV_ROPE_SCALING_FACTOR,           "%s.rope.scaling.factor"                  },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR,      "%s.rope.scaling.attn_factor"             },
+    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,     "%s.rope.scaling.original_context_length" },
+    { LLM_KV_ROPE_SCALING_FINETUNED,        "%s.rope.scaling.finetuned"               },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL,     "%s.rope.scaling.yarn_log_multiplier"     },
+    { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,  "%s.rope.scaling.yarn_ext_factor"         },
+    { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor"        },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   "%s.rope.scaling.yarn_beta_fast"          },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   "%s.rope.scaling.yarn_beta_slow"          },
 
     { LLM_KV_SPLIT_NO,    "split.no"    },
     { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -398,12 +406,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_ATTN_ROT_EMBD,  "blk.%d.attn_rot_embd"    },
         { LLM_TENSOR_FFN_GATE_INP,   "blk.%d.ffn_gate_inp"     },
         { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm"         },
+        { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate"         },
+        { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down"         },
+        { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up"           },
         { LLM_TENSOR_FFN_GATE_EXP,   "blk.%d.ffn_gate.%d"      },
         { LLM_TENSOR_FFN_DOWN_EXP,   "blk.%d.ffn_down.%d"      },
         { LLM_TENSOR_FFN_UP_EXP,     "blk.%d.ffn_up.%d"        },
         { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps"    },
         { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps"    },
         { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps"      },
+        { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm"    },
         { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
         { LLM_TENSOR_ATTN_OUT_NORM,  "blk.%d.attn_output_norm"  },
     },
@@ -2136,6 +2148,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,   "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,   "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_SEED_OSS,
         {
@@ -2416,6 +2448,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
             return true;
         default:
             return false;
@@ -100,6 +100,7 @@ enum llm_arch {
    LLM_ARCH_DREAM,
    LLM_ARCH_SMALLTHINKER,
    LLM_ARCH_LLADA,
+   LLM_ARCH_LLADA_MOE,
    LLM_ARCH_SEED_OSS,
    LLM_ARCH_UNKNOWN,
};
@@ -143,6 +144,7 @@ enum llm_kv {
    LLM_KV_DECODER_START_TOKEN_ID,
    LLM_KV_DECODER_BLOCK_COUNT,
    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+   LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
    LLM_KV_FINAL_LOGIT_SOFTCAPPING,
    LLM_KV_SWIN_NORM,
    LLM_KV_RESCALE_EVERY_N_LAYERS,
@@ -173,6 +175,8 @@ enum llm_kv {
    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_SCALE,
+   LLM_KV_ATTENTION_OUTPUT_SCALE,
+   LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
@@ -186,6 +190,10 @@ enum llm_kv {
    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
    LLM_KV_ROPE_SCALING_FINETUNED,
    LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+   LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
+   LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
+   LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
+   LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
 
    LLM_KV_SPLIT_NO,
    LLM_KV_SPLIT_COUNT,
@@ -70,6 +70,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2",       LLM_CHAT_TEMPLATE_KIMI_K2       },
     { "seed_oss",      LLM_CHAT_TEMPLATE_SEED_OSS      },
+    { "grok-2",        LLM_CHAT_TEMPLATE_GROK_2        },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
  llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -204,6 +205,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
204
205
  return LLM_CHAT_TEMPLATE_KIMI_K2;
205
206
  } else if (tmpl_contains("<seed:bos>")) {
206
207
  return LLM_CHAT_TEMPLATE_SEED_OSS;
208
+ } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
209
+ return LLM_CHAT_TEMPLATE_GROK_2;
207
210
  }
208
211
  return LLM_CHAT_TEMPLATE_UNKNOWN;
209
212
  }
@@ -763,6 +766,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<seed:bos>assistant\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "System: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "user") {
+                ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << "<|separator|>\n\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
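
For a quick feel of the output, here is a standalone sketch that re-renders the Grok-2 branch above for a short exchange. The msg struct is a stand-in for llama_chat_message, and trim() is omitted because the sample strings carry no surrounding whitespace.

    #include <cstdio>
    #include <string>
    #include <vector>

    struct msg { std::string role; std::string content; };

    int main() {
        const std::vector<msg> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };

        std::string ss;
        for (const auto & m : chat) {
            if (m.role == "system") {
                ss += "System: " + m.content + "<|separator|>\n\n";
            } else if (m.role == "user") {
                ss += "Human: " + m.content + "<|separator|>\n\n";
            } else if (m.role == "assistant") {
                ss += "Assistant: " + m.content + "<|separator|>\n\n";
            }
        }
        ss += "Assistant:"; // add_ass: leave the prompt open for the model's reply

        std::printf("%s\n", ss.c_str());
        return 0;
    }
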
@@ -50,6 +50,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
    LLM_CHAT_TEMPLATE_KIMI_K2,
    LLM_CHAT_TEMPLATE_SEED_OSS,
+   LLM_CHAT_TEMPLATE_GROK_2,
    LLM_CHAT_TEMPLATE_UNKNOWN,
};