@fugood/llama.node 1.3.0-rc.6 → 1.3.0

This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
package/CMakeLists.txt CHANGED
@@ -64,13 +64,23 @@ endif()
64
64
  # Improve speed
65
65
  if(CMAKE_BUILD_TYPE STREQUAL "Release")
66
66
  if (MSVC)
67
- if (NOT GGML_VULKAN)
67
+ # Enable parallel compilation for all MSVC builds
68
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
69
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP")
70
+
71
+ if (NOT GGML_VULKAN AND NOT GGML_CUDA)
72
+ # Full optimization with LTCG for default builds
68
73
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Ob2 /Oi /Ot /Oy /GL")
69
74
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /Ob2 /Oi /Ot /Oy /GL")
70
75
  set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} /LTCG")
71
- else()
76
+ elseif(GGML_VULKAN)
77
+ # Reduced optimization for Vulkan builds
72
78
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O1 /Ob1 /bigobj")
73
79
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O1 /Ob1 /bigobj")
80
+ else()
81
+ # Faster linking for CUDA builds (no LTCG)
82
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Ob2 /Oi")
83
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /Ob2 /Oi")
74
84
  endif()
75
85
  else()
76
86
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -funroll-loops -flto=auto")
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@fugood/llama.node",
3
3
  "access": "public",
4
- "version": "1.3.0-rc.6",
4
+ "version": "1.3.0",
5
5
  "description": "An another Node binding of llama.cpp",
6
6
  "main": "lib/index.js",
7
7
  "scripts": {
@@ -72,19 +72,19 @@
72
72
  "CMakeLists.txt"
73
73
  ],
74
74
  "optionalDependencies": {
75
- "@fugood/node-llama-linux-x64": "1.3.0-rc.6",
76
- "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.6",
77
- "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.6",
78
- "@fugood/node-llama-linux-arm64": "1.3.0-rc.6",
79
- "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.6",
80
- "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.6",
81
- "@fugood/node-llama-win32-x64": "1.3.0-rc.6",
82
- "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.6",
83
- "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.6",
84
- "@fugood/node-llama-win32-arm64": "1.3.0-rc.6",
85
- "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.6",
86
- "@fugood/node-llama-darwin-x64": "1.3.0-rc.6",
87
- "@fugood/node-llama-darwin-arm64": "1.3.0-rc.6"
75
+ "@fugood/node-llama-linux-x64": "1.3.0",
76
+ "@fugood/node-llama-linux-x64-vulkan": "1.3.0",
77
+ "@fugood/node-llama-linux-x64-cuda": "1.3.0",
78
+ "@fugood/node-llama-linux-arm64": "1.3.0",
79
+ "@fugood/node-llama-linux-arm64-vulkan": "1.3.0",
80
+ "@fugood/node-llama-linux-arm64-cuda": "1.3.0",
81
+ "@fugood/node-llama-win32-x64": "1.3.0",
82
+ "@fugood/node-llama-win32-x64-vulkan": "1.3.0",
83
+ "@fugood/node-llama-win32-x64-cuda": "1.3.0",
84
+ "@fugood/node-llama-win32-arm64": "1.3.0",
85
+ "@fugood/node-llama-win32-arm64-vulkan": "1.3.0",
86
+ "@fugood/node-llama-darwin-x64": "1.3.0",
87
+ "@fugood/node-llama-darwin-arm64": "1.3.0"
88
88
  },
89
89
  "devDependencies": {
90
90
  "@babel/preset-env": "^7.24.4",
@@ -21,7 +21,7 @@ index fe290bf8f..d377e29b9 100644
21
21
 
22
22
  #
23
23
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
24
- index 8587140e1..7931a31a1 100644
24
+ index 63583fb22..f8be20148 100644
25
25
  --- a/src/llama.cpp/common/chat.cpp
26
26
  +++ b/src/llama.cpp/common/chat.cpp
27
27
  @@ -6,9 +6,6 @@
@@ -31,10 +31,10 @@ index 8587140e1..7931a31a1 100644
31
31
  -#include <minja/chat-template.hpp>
32
32
  -#include <minja/minja.hpp>
33
33
  -
34
+ #include <algorithm>
34
35
  #include <cstdio>
35
- #include <exception>
36
- #include <iostream>
37
- @@ -123,16 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
36
+ #include <cctype>
37
+ @@ -126,16 +123,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
38
38
  return diffs;
39
39
  }
40
40
 
@@ -51,18 +51,17 @@ index 8587140e1..7931a31a1 100644
51
51
  struct templates_params {
52
52
  json messages;
53
53
  json tools;
54
- @@ -807,8 +794,7 @@ static std::string apply(
55
- if (additional_context) {
54
+ @@ -812,7 +799,7 @@ static std::string apply(
56
55
  tmpl_inputs.extra_context.merge_patch(*additional_context);
57
56
  }
58
- - // TODO: add flag to control date/time, if only for testing purposes.
57
+ // TODO: add flag to control date/time, if only for testing purposes.
59
58
  - // tmpl_inputs.now = std::chrono::system_clock::now();
60
59
  + tmpl_inputs.now = inputs.now;
61
60
 
62
61
  minja::chat_template_options tmpl_opts;
63
62
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
64
63
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
65
- index f7b36ec71..c07429f08 100644
64
+ index 50efb0d4e..f471a84c7 100644
66
65
  --- a/src/llama.cpp/common/chat.h
67
66
  +++ b/src/llama.cpp/common/chat.h
68
67
  @@ -9,7 +9,18 @@
@@ -3248,7 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3248
3248
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
3249
3249
  add_opt(common_arg(
3250
3250
  {"--embd-output-format"}, "FORMAT",
3251
- "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
3251
+ "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
3252
3252
  [](common_params & params, const std::string & value) {
3253
3253
  params.embd_out = value;
3254
3254
  }
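
For context, the new "raw" option documented in this hunk prints embeddings as plain whitespace-delimited numbers, one embedding per line. A minimal standalone sketch of that output shape (with made-up values, not llama.cpp code):

```cpp
// Hedged illustration of the "raw" embedding output format described above:
// whitespace-delimited values, one embedding per line.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<std::vector<float>> embeddings = {
        {0.12f, -0.34f, 0.56f},   // embedding for prompt 1 (values are illustrative)
        {0.78f,  0.90f, -0.11f},  // embedding for prompt 2
    };
    for (const auto & emb : embeddings) {
        for (std::size_t i = 0; i < emb.size(); ++i) {
            printf("%s%g", i ? " " : "", emb[i]);
        }
        printf("\n");
    }
    return 0;
}
```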
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3435
3435
  [](common_params & params) {
3436
3436
  params.use_jinja = true;
3437
3437
  }
3438
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
3438
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
3439
3439
  add_opt(common_arg(
3440
3440
  {"--reasoning-format"}, "FORMAT",
3441
3441
  "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -6,8 +6,11 @@
6
6
  #include "log.h"
7
7
  #include "regex-partial.h"
8
8
 
9
+ #include <algorithm>
9
10
  #include <cstdio>
11
+ #include <cctype>
10
12
  #include <exception>
13
+ #include <functional>
11
14
  #include <iostream>
12
15
  #include <optional>
13
16
  #include <stdexcept>
@@ -627,6 +630,7 @@ const char * common_chat_format_name(common_chat_format format) {
627
630
  case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
628
631
  case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
629
632
  case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
633
+ case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS: return "LFM2 with JSON tools";
630
634
  default:
631
635
  throw std::runtime_error("Unknown chat format");
632
636
  }
@@ -794,6 +798,7 @@ static std::string apply(
794
798
  if (additional_context) {
795
799
  tmpl_inputs.extra_context.merge_patch(*additional_context);
796
800
  }
801
+ // TODO: add flag to control date/time, if only for testing purposes.
797
802
  tmpl_inputs.now = inputs.now;
798
803
 
799
804
  minja::chat_template_options tmpl_opts;
@@ -972,6 +977,126 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
972
977
  return data;
973
978
  }
974
979
 
980
+
981
+ // Case-insensitive find
982
+ static size_t ifind_string(const std::string & haystack, const std::string & needle, size_t pos = 0) {
983
+ auto it = std::search(
984
+ haystack.begin() + pos, haystack.end(),
985
+ needle.begin(), needle.end(),
986
+ [](char a, char b) { return std::tolower(a) == std::tolower(b); }
987
+ );
988
+ return (it == haystack.end()) ? std::string::npos : std::distance(haystack.begin(), it);
989
+ }
990
+
991
+ static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, const struct templates_params & inputs) {
992
+ common_chat_params data;
993
+ const auto is_json_schema_provided = !inputs.json_schema.is_null();
994
+ const auto is_grammar_provided = !inputs.grammar.empty();
995
+ const auto are_tools_provided = inputs.tools.is_array() && !inputs.tools.empty();
996
+
997
+ // the logic requires potentially modifying the messages
998
+ auto tweaked_messages = inputs.messages;
999
+
1000
+ auto replace_json_schema_marker = [](json & messages) -> bool {
1001
+ static std::string marker1 = "force json schema.\n";
1002
+ static std::string marker2 = "force json schema.";
1003
+
1004
+ if (messages.empty() || messages.at(0).at("role") != "system") {
1005
+ return false;
1006
+ }
1007
+
1008
+ std::string content = messages.at(0).at("content");
1009
+
1010
+ for (const auto & marker : {marker1, marker2}) {
1011
+ const auto pos = ifind_string(content, marker);
1012
+ if (pos != std::string::npos) {
1013
+ content.replace(pos, marker.length(), "");
1014
+ // inject modified content back into the messages
1015
+ messages.at(0).at("content") = content;
1016
+ return true;
1017
+ }
1018
+ }
1019
+
1020
+ return false;
1021
+ };
1022
+
1023
+ // Lfm2 model does not natively work with json, but can generally understand the tools structure
1024
+ //
1025
+ // Example of the pytorch dialog structure:
1026
+ // <|startoftext|><|im_start|>system
1027
+ // List of tools: <|tool_list_start|>[{"name": "get_candidate_status", "description": "Retrieves the current status of a candidate in the recruitment process", "parameters": {"type": "object", "properties": {"candidate_id": {"type": "string", "description": "Unique identifier for the candidate"}}, "required": ["candidate_id"]}}]<|tool_list_end|><|im_end|>
1028
+ // <|im_start|>user
1029
+ // What is the current status of candidate ID 12345?<|im_end|>
1030
+ // <|im_start|>assistant
1031
+ // <|tool_call_start|>[get_candidate_status(candidate_id="12345")]<|tool_call_end|>Checking the current status of candidate ID 12345.<|im_end|>
1032
+ // <|im_start|>tool
1033
+ // <|tool_response_start|>{"candidate_id": "12345", "status": "Interview Scheduled", "position": "Clinical Research Associate", "date": "2023-11-20"}<|tool_response_end|><|im_end|>
1034
+ // <|im_start|>assistant
1035
+ // The candidate with ID 12345 is currently in the "Interview Scheduled" stage for the position of Clinical Research Associate, with an interview date set for 2023-11-20.<|im_end|>
1036
+ //
1037
+ // For the llama server compatibility with json tools semantic,
1038
+ // the client can add "Follow json schema." line into the system message prompt to force the json output.
1039
+ //
1040
+ if (are_tools_provided && (is_json_schema_provided || is_grammar_provided)) {
1041
+ // server/utils.hpp prohibits that branch for the custom grammar anyways
1042
+ throw std::runtime_error("Tools call must not use \"json_schema\" or \"grammar\", use non-tool invocation if you want to use custom grammar");
1043
+ } else if (are_tools_provided && replace_json_schema_marker(tweaked_messages)) {
1044
+ LOG_INF("%s: Using tools to build a grammar\n", __func__);
1045
+
1046
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1047
+ auto schemas = json::array();
1048
+ foreach_function(inputs.tools, [&](const json & tool) {
1049
+ const auto & function = tool.at("function");
1050
+ schemas.push_back({
1051
+ {"type", "object"},
1052
+ {"properties", {
1053
+ {"name", {
1054
+ {"type", "string"},
1055
+ {"const", function.at("name")},
1056
+ }},
1057
+ {"arguments", function.at("parameters")},
1058
+ }},
1059
+ {"required", json::array({"name", "arguments", "id"})},
1060
+ });
1061
+ });
1062
+ auto schema = json {
1063
+ {"type", "array"},
1064
+ {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
1065
+ {"minItems", 1},
1066
+ };
1067
+ if (!inputs.parallel_tool_calls) {
1068
+ schema["maxItems"] = 1;
1069
+ }
1070
+
1071
+ builder.add_rule("root", "\"<|tool_call_start|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tool_call_end|>\"");
1072
+ });
1073
+ // model has no concept of tool selection mode choice,
1074
+ // if the system prompt rendered correctly it will produce a tool call
1075
+ // the grammar goes inside the tool call body
1076
+ data.grammar_lazy = true;
1077
+ data.grammar_triggers = {{COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, "\\s*<\\|tool_call_start\\|>\\s*\\["}};
1078
+ data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
1079
+ data.format = COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS;
1080
+ } else if (are_tools_provided && (!is_json_schema_provided && !is_grammar_provided)) {
1081
+ LOG_INF("%s: Using tools without json schema or grammar\n", __func__);
1082
+ // output those tokens
1083
+ data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
1084
+ } else if (is_json_schema_provided) {
1085
+ LOG_INF("%s: Using provided json schema to build a grammar\n", __func__);
1086
+ data.grammar = json_schema_to_grammar(inputs.json_schema);
1087
+ } else if (is_grammar_provided) {
1088
+ LOG_INF("%s: Using provided grammar\n", __func__);
1089
+ data.grammar = inputs.grammar;
1090
+ } else {
1091
+ LOG_INF("%s: Using content relying on the template\n", __func__);
1092
+ }
1093
+
1094
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
1095
+ LOG_DBG("%s: Prompt: %s\n", __func__, data.prompt.c_str());
1096
+
1097
+ return data;
1098
+ }
1099
+
975
1100
  static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
976
1101
  common_chat_params data;
977
1102
  data.prompt = apply(tmpl, inputs);
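
The ifind_string() helper added in this hunk is a case-insensitive substring search built on std::search. A self-contained sketch of the same idiom, outside llama.cpp:

```cpp
// Standalone version of the case-insensitive search used by ifind_string() above;
// the function and sample strings here are illustrative, not llama.cpp API.
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <string>

static std::size_t ifind(const std::string & haystack, const std::string & needle, std::size_t pos = 0) {
    auto it = std::search(
        haystack.begin() + pos, haystack.end(),
        needle.begin(), needle.end(),
        [](unsigned char a, unsigned char b) { return std::tolower(a) == std::tolower(b); });
    return it == haystack.end() ? std::string::npos : (std::size_t) std::distance(haystack.begin(), it);
}

int main() {
    const std::string sys = "You are helpful. Force JSON Schema.\nRespond briefly.";
    std::cout << ifind(sys, "force json schema.") << "\n"; // prints 17, despite the case difference
    return 0;
}
```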
@@ -2485,6 +2610,71 @@ static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
2485
2610
  builder.add_content(builder.consume_rest());
2486
2611
  }
2487
2612
 
2613
+
2614
+ static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
2615
+ if (!builder.syntax().parse_tool_calls) {
2616
+ builder.add_content(builder.consume_rest());
2617
+ return;
2618
+ }
2619
+
2620
+ // LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
2621
+ static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
2622
+ static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));
2623
+
2624
+ // Loop through all tool calls
2625
+ while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
2626
+ builder.move_to(res->groups[0].end);
2627
+
2628
+ // Parse JSON array format: [{"name": "...", "arguments": {...}}]
2629
+ auto tool_calls_data = builder.consume_json();
2630
+
2631
+ // Consume end marker
2632
+ builder.consume_spaces();
2633
+ if (!builder.try_consume_regex(tool_call_end_regex)) {
2634
+ throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
2635
+ }
2636
+
2637
+ // Process each tool call in the array
2638
+ if (tool_calls_data.json.is_array()) {
2639
+ for (const auto & tool_call : tool_calls_data.json) {
2640
+ if (!tool_call.is_object()) {
2641
+ throw common_chat_msg_partial_exception("Tool call must be an object");
2642
+ }
2643
+
2644
+ if (!tool_call.contains("name")) {
2645
+ throw common_chat_msg_partial_exception("Tool call missing 'name' field");
2646
+ }
2647
+
2648
+ std::string function_name = tool_call.at("name");
2649
+ std::string arguments = "{}";
2650
+
2651
+ if (tool_call.contains("arguments")) {
2652
+ if (tool_call.at("arguments").is_object()) {
2653
+ arguments = tool_call.at("arguments").dump();
2654
+ } else if (tool_call.at("arguments").is_string()) {
2655
+ arguments = tool_call.at("arguments");
2656
+ }
2657
+ }
2658
+
2659
+ if (!builder.add_tool_call(function_name, "", arguments)) {
2660
+ throw common_chat_msg_partial_exception("Incomplete tool call");
2661
+ }
2662
+ }
2663
+ } else {
2664
+ throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
2665
+ }
2666
+
2667
+ // Consume any trailing whitespace after this tool call
2668
+ builder.consume_spaces();
2669
+ }
2670
+
2671
+ // Consume any remaining content after all tool calls
2672
+ auto remaining = builder.consume_rest();
2673
+ if (!string_strip(remaining).empty()) {
2674
+ builder.add_content(remaining);
2675
+ }
2676
+ }
2677
+
2488
2678
  static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
2489
2679
  // Parse thinking tags first - this handles the main reasoning content
2490
2680
  builder.try_parse_reasoning("<seed:think>", "</seed:think>");
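
common_chat_parse_lfm2() above walks <|tool_call_start|>[JSON array]<|tool_call_end|> spans using the builder's regex and JSON helpers. As a rough standalone illustration of the same wire format, here is a hedged sketch using plain string search plus nlohmann::json (the single-header JSON library the code above builds on, assumed available as <nlohmann/json.hpp>); it is not the project's parser:

```cpp
// Hedged sketch: extract one LFM2-style tool call from a completed response.
#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

using json = nlohmann::json;

int main() {
    const std::string out =
        R"(<|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>Checking.)";

    const std::string start = "<|tool_call_start|>", end = "<|tool_call_end|>";
    const std::size_t s = out.find(start), e = out.find(end);
    if (s == std::string::npos || e == std::string::npos) {
        std::cout << "no tool call\n";
        return 0;
    }

    // the payload between the markers is a JSON array of {"name", "arguments"} objects
    const json calls = json::parse(out.substr(s + start.size(), e - s - start.size()));
    for (const auto & call : calls) {
        // arguments are re-serialized to a JSON string, as the parser above does
        std::cout << call.at("name").get<std::string>() << " "
                  << call.at("arguments").dump() << "\n";
    }
    std::cout << "content: " << out.substr(e + end.size()) << "\n";
    return 0;
}
```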
@@ -2734,6 +2924,12 @@ static common_chat_params common_chat_templates_apply_jinja(
2734
2924
  return common_chat_params_init_apertus(tmpl, params);
2735
2925
  }
2736
2926
 
2927
+ // LFM2 (w/ tools)
2928
+ if (src.find("List of tools: <|tool_list_start|>[") != std::string::npos &&
2929
+ src.find("]<|tool_list_end|>") != std::string::npos) {
2930
+ return common_chat_params_init_lfm2(tmpl, params);
2931
+ }
2932
+
2737
2933
  // Use generic handler when mixing tools + JSON schema.
2738
2934
  // TODO: support that mix in handlers below.
2739
2935
  if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2912,6 +3108,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
2912
3108
  case COMMON_CHAT_FORMAT_APERTUS:
2913
3109
  common_chat_parse_apertus(builder);
2914
3110
  break;
3111
+ case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
3112
+ common_chat_parse_lfm2(builder);
3113
+ break;
2915
3114
  default:
2916
3115
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
2917
3116
  }
@@ -127,6 +127,7 @@ enum common_chat_format {
127
127
  COMMON_CHAT_FORMAT_SEED_OSS,
128
128
  COMMON_CHAT_FORMAT_NEMOTRON_V2,
129
129
  COMMON_CHAT_FORMAT_APERTUS,
130
+ COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
130
131
 
131
132
  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
132
133
  };
@@ -601,7 +601,10 @@ private:
601
601
  }
602
602
 
603
603
  std::string _resolve_ref(const std::string & ref) {
604
- std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
604
+ auto it = ref.find('#');
605
+ std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
606
+ static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
607
+ std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
605
608
  if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
606
609
  _refs_being_resolved.insert(ref);
607
610
  json resolved = _refs[ref];
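
The _resolve_ref() change keeps the whole '#' fragment of a $ref and sanitizes it into a rule name by collapsing any run of characters outside [a-zA-Z0-9-] into '-'. A small standalone example of that transformation:

```cpp
// Illustrative reproduction of the ref-name sanitization added above.
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::string ref = "#/definitions/foo bar/0";
    const auto it = ref.find('#');
    const std::string fragment = it != std::string::npos ? ref.substr(it + 1) : ref;

    static const std::regex nonalnum(R"([^a-zA-Z0-9-]+)");
    std::cout << "ref" + std::regex_replace(fragment, nonalnum, "-") << "\n";
    // prints: ref-definitions-foo-bar-0
    return 0;
}
```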
@@ -774,11 +777,24 @@ public:
774
777
  std::vector<std::string> tokens = string_split(pointer, "/");
775
778
  for (size_t i = 1; i < tokens.size(); ++i) {
776
779
  std::string sel = tokens[i];
777
- if (target.is_null() || !target.contains(sel)) {
780
+ if (target.is_object() && target.contains(sel)) {
781
+ target = target[sel];
782
+ } else if (target.is_array()) {
783
+ size_t sel_index;
784
+ try {
785
+ sel_index = std::stoul(sel);
786
+ } catch (const std::invalid_argument & e) {
787
+ sel_index = target.size();
788
+ }
789
+ if (sel_index >= target.size()) {
790
+ _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
791
+ return;
792
+ }
793
+ target = target[sel_index];
794
+ } else {
778
795
  _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
779
796
  return;
780
797
  }
781
- target = target[sel];
782
798
  }
783
799
  _refs[ref] = target;
784
800
  }
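
This hunk lets JSON-pointer resolution step into arrays by numeric index instead of failing. For comparison, a hedged sketch of the same lookup done with nlohmann::json's built-in json_pointer (the resolver above keeps its own walk so it can collect per-selector errors):

```cpp
// Hedged sketch: numeric selectors into arrays, resolved with nlohmann::json.
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::json;

int main() {
    const json schema = json::parse(R"({
        "definitions": { "shapes": [ { "type": "string" }, { "type": "integer" } ] }
    })");

    // "#/definitions/shapes/1": strip the '#', walk objects by key and arrays by index
    const json target = schema.at(json::json_pointer("/definitions/shapes/1"));
    std::cout << target.dump() << "\n"; // {"type":"integer"}
    return 0;
}
```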
@@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr
251
251
  set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
252
252
  "gmml: OpenCL API version to target")
253
253
 
254
+ option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
255
+
254
256
  # toolchain for vulkan-shaders-gen
255
257
  set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
256
258
 
@@ -0,0 +1,19 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-backend.h"
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ // backend API
11
+ GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
12
+
13
+ GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
14
+
15
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
16
+
17
+ #ifdef __cplusplus
18
+ }
19
+ #endif
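
The new public header only declares three entry points. A minimal usage sketch, assuming the header is installed as "ggml-hexagon.h" (the diff does not show the new file's path) and that the Hexagon backend was enabled at build time; whether init returns a backend depends on the build and the hardware:

```cpp
// Hedged sketch using only the functions declared in the new header.
#include "ggml-hexagon.h" // header name assumed; the diff does not show the file's path
#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_hexagon_init();
    if (backend == nullptr) {
        printf("Hexagon backend not available\n");
        return 0;
    }
    printf("is_hexagon: %d\n", ggml_backend_is_hexagon(backend));
    ggml_backend_free(backend); // from ggml-backend.h, pulled in by the header above
    return 0;
}
```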
@@ -402,6 +402,7 @@ ggml_add_backend(Vulkan)
402
402
  ggml_add_backend(WebGPU)
403
403
  ggml_add_backend(zDNN)
404
404
  ggml_add_backend(OpenCL)
405
+ ggml_add_backend(Hexagon)
405
406
 
406
407
  foreach (target ggml-base ggml)
407
408
  target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
@@ -7519,8 +7519,8 @@ static void ggml_compute_forward_upscale_f32(
7519
7519
  float pixel_offset = 0.5f;
7520
7520
  if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
7521
7521
  pixel_offset = 0.0f;
7522
- sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1);
7523
- sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1);
7522
+ sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
7523
+ sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
7524
7524
  }
7525
7525
 
7526
7526
  for (int64_t i3 = 0; i3 < ne3; i3++) {
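
With GGML_SCALE_FLAG_ALIGN_CORNERS, the scale factor maps the first and last samples onto each other, and the fix above keeps the default factor whenever either dimension has only one element, avoiding a division by zero. A worked example of the guarded computation:

```cpp
// Worked example of the guarded align-corners scale factor from the hunk above.
#include <cstdio>

int main() {
    const long ne00 = 4, ne0 = 8;          // input / output widths (illustrative)
    float sf0 = (float) ne0 / ne00;        // default scale: 2.0
    const bool align_corners = true;
    if (align_corners) {
        // only remap when both sides have more than one sample, otherwise keep sf0
        sf0 = (ne0 > 1 && ne00 > 1) ? (float) (ne0 - 1) / (ne00 - 1) : sf0;
    }
    printf("sf0 = %f\n", sf0); // 7/3 ~= 2.333333
    return 0;
}
```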
@@ -268,9 +268,7 @@ llama_context::llama_context(
268
268
  if (pipeline_parallel) {
269
269
  LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
270
270
  }
271
- }
272
271
 
273
- if (!hparams.vocab_only) {
274
272
  llama_memory_context_ptr mctx;
275
273
  if (memory) {
276
274
  LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -343,7 +341,14 @@ llama_context::llama_context(
343
341
  {
344
342
  auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
345
343
  if (!gf) {
346
- throw std::runtime_error("failed to allocate compute pp buffers");
344
+ if (pipeline_parallel) {
345
+ LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
346
+ sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
347
+ gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
348
+ }
349
+ if (!gf) {
350
+ throw std::runtime_error("failed to allocate compute pp buffers");
351
+ }
347
352
  }
348
353
 
349
354
  n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
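
The change above retries graph reservation with pipeline parallelism disabled before giving up. A generic sketch of that try-then-fallback shape, with placeholder types rather than llama.cpp's scheduler API:

```cpp
// Hedged sketch of the fallback pattern: prefer the pipeline-parallel configuration,
// rebuild without it if reservation fails, and only then report an error.
#include <cstdio>
#include <memory>

struct sched_t { bool pipeline; };           // placeholder for the real scheduler

static std::unique_ptr<sched_t> make_sched(bool pipeline) {
    return std::make_unique<sched_t>(sched_t{pipeline});
}

static bool reserve_graph(const sched_t & s) {
    // pretend the larger pipeline-parallel buffers cannot be allocated
    return !s.pipeline;
}

int main() {
    auto sched = make_sched(/*pipeline=*/true);
    if (!reserve_graph(*sched)) {
        printf("retrying without pipeline parallelism\n");
        sched = make_sched(/*pipeline=*/false);
        if (!reserve_graph(*sched)) {
            fprintf(stderr, "failed to allocate compute buffers\n");
            return 1;
        }
    }
    printf("reserved (pipeline=%d)\n", sched->pipeline);
    return 0;
}
```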
@@ -810,6 +810,9 @@ ggml_tensor * llm_graph_context::build_ffn(
810
810
  GGML_ABORT("fatal error");
811
811
  }
812
812
 
813
+ //expand here so that we can fuse ffn gate
814
+ ggml_build_forward_expand(gf, cur);
815
+
813
816
  if (gate && type_gate == LLM_FFN_PAR) {
814
817
  cur = ggml_mul(ctx0, cur, tmp);
815
818
  cb(cur, "ffn_gate_par", il);
@@ -1006,10 +1009,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
1006
1009
  ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
1007
1010
  cb(weights_sum, "ffn_moe_weights_sum", il);
1008
1011
 
1009
- if (arch == LLM_ARCH_BAILINGMOE2) {
1010
- weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
1011
- cb(weights_sum, "ffn_moe_weights_sum_biased", il);
1012
- }
1012
+ // Avoid division by zero, clamp to smallest number representable by F16
1013
+ weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
1014
+ cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
1013
1015
 
1014
1016
  weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
1015
1017
  cb(weights, "ffn_moe_weights_norm", il);
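
Instead of the BailingMoE2-specific bias, the expert-weight sum is now clamped to 6.103515625e-5, the smallest positive normal half-precision value (2^-14), before the division. A tiny standalone example of the clamp-before-divide pattern:

```cpp
// Clamp-before-divide, as in the hunk above; 6.103515625e-5 = 2^-14 is the
// smallest positive normal value representable in IEEE half precision.
#include <algorithm>
#include <cstdio>

int main() {
    const float eps_f16 = 6.103515625e-5f;            // 2^-14
    float weights[3]    = {0.0f, 0.0f, 0.0f};          // degenerate case: all-zero weights
    float weights_sum   = weights[0] + weights[1] + weights[2];

    weights_sum = std::max(weights_sum, eps_f16);      // counterpart of ggml_clamp(..., eps, INFINITY)
    for (float & w : weights) {
        w /= weights_sum;                              // no division by zero, result stays finite
    }
    printf("%f %f %f\n", weights[0], weights[1], weights[2]);
    return 0;
}
```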
@@ -1091,6 +1093,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
1091
1093
  GGML_ABORT("fatal error");
1092
1094
  }
1093
1095
 
1096
+ //expand here so that we can fuse ffn gate
1097
+ ggml_build_forward_expand(gf, cur);
1098
+
1094
1099
  experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
1095
1100
  cb(experts, "ffn_moe_down", il);
1096
1101
 
@@ -8,6 +8,7 @@
8
8
  #include <algorithm>
9
9
  #include <cassert>
10
10
  #include <cmath>
11
+ #include <cstring>
11
12
  #include <limits>
12
13
  #include <map>
13
14
  #include <stdexcept>
@@ -37,8 +38,15 @@ llama_kv_cache::llama_kv_cache(
37
38
 
38
39
  const uint32_t n_layer_kv = hparams.n_layer_kv();
39
40
 
41
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
42
+ struct ggml_backend_buft_comparator {
43
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
44
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
45
+ }
46
+ };
47
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
48
+
40
49
  // create a context for each buffer type
41
- std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
42
50
  auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
43
51
  auto it = ctx_map.find(buft);
44
52
  if (it == ctx_map.end()) {
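
The KV cache now keys its buft-to-context map with a comparator that orders buffer types by name via strcmp, so iteration (and hence allocation and logging order) is deterministic rather than dependent on pointer values. A generic sketch of that std::map idiom, with illustrative key names:

```cpp
// Ordering a map of C-string-named handles by name instead of by pointer value,
// so iteration order is stable across runs.
#include <cstdio>
#include <cstring>
#include <map>

struct name_cmp {
    bool operator()(const char * lhs, const char * rhs) const {
        return std::strcmp(lhs, rhs) < 0;
    }
};

int main() {
    std::map<const char *, int, name_cmp> sizes;
    sizes["Vulkan0"] = 512;   // illustrative entries only
    sizes["CPU"]     = 128;
    for (const auto & [name, mib] : sizes) {
        printf("%s: %d MiB\n", name, mib); // always CPU first, then Vulkan0
    }
    return 0;
}
```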
@@ -53,13 +61,12 @@ llama_kv_cache::llama_kv_cache(
53
61
  return nullptr;
54
62
  }
55
63
 
56
- ctx_map[buft] = ctx;
57
- ctxs.emplace_back(ctx);
64
+ ctx_map.emplace(buft, ctx);
58
65
 
59
66
  return ctx;
60
67
  }
61
68
 
62
- return it->second;
69
+ return it->second.get();
63
70
  };
64
71
 
65
72
  GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max);
@@ -167,11 +174,8 @@ llama_kv_cache::llama_kv_cache(
167
174
  }
168
175
 
169
176
  // allocate tensors and initialize the buffers to avoid NaNs in the padding
170
- for (auto it : ctx_map) {
171
- auto * buft = it.first;
172
- auto * ctx = it.second;
173
-
174
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
177
+ for (auto & [buft, ctx] : ctx_map) {
178
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
175
179
  if (!buf) {
176
180
  throw std::runtime_error("failed to allocate buffer for kv cache");
177
181
  }
@@ -179,7 +183,7 @@ llama_kv_cache::llama_kv_cache(
179
183
  LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
180
184
 
181
185
  ggml_backend_buffer_clear(buf, 0);
182
- bufs.emplace_back(buf);
186
+ ctxs_bufs.emplace_back(std::move(ctx), buf);
183
187
  }
184
188
 
185
189
  {
@@ -203,7 +207,7 @@ void llama_kv_cache::clear(bool data) {
203
207
  }
204
208
 
205
209
  if (data) {
206
- for (auto & buf : bufs) {
210
+ for (auto & [_, buf] : ctxs_bufs) {
207
211
  ggml_backend_buffer_clear(buf.get(), 0);
208
212
  }
209
213
  }
@@ -472,8 +476,8 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
472
476
 
473
477
  std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
474
478
  std::map<ggml_backend_buffer_type_t, size_t> ret;
475
- for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
476
- ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
479
+ for (const auto & [_, buf] : ctxs_bufs) {
480
+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
477
481
  }
478
482
  return ret;
479
483
  }
@@ -957,10 +961,14 @@ bool llama_kv_cache::get_has_shift() const {
957
961
  uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
958
962
  uint32_t result = 0;
959
963
 
964
+ // pad the n_kv value so that the graph remains constant across batches and can be reused
965
+ // note: this also helps some backends with performance (f.ex https://github.com/ggml-org/llama.cpp/pull/16812#issuecomment-3455112220)
966
+ const uint32_t n_pad_cur = std::max(n_pad, 256u);
967
+
960
968
  for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
961
969
  const auto & cells = v_cells[sinfo.strm[s]];
962
970
 
963
- result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
971
+ result = std::max(std::min(cells.size(), std::max(n_pad_cur, GGML_PAD(cells.used_max_p1(), n_pad_cur))), result);
964
972
  }
965
973
 
966
974
  return result;
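
get_n_kv() now rounds the used KV size up to a multiple of max(n_pad, 256) so the compute graph keeps a constant shape across batches. A worked example of the arithmetic, using a plain round-up helper in place of the GGML_PAD macro:

```cpp
// Worked example of the padding applied above.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t pad_to(uint32_t x, uint32_t n) {
    // round x up to a multiple of n (GGML_PAD does the same for its power-of-two alignments)
    return ((x + n - 1) / n) * n;
}

int main() {
    const uint32_t n_pad       = 32;                     // cache's configured padding
    const uint32_t used_max_p1 = 300;                    // highest used cell + 1
    const uint32_t n_pad_cur   = std::max(n_pad, 256u);  // floor of 256 introduced by the change
    const uint32_t n_kv        = std::max(n_pad_cur, pad_to(used_max_p1, n_pad_cur));
    printf("n_kv = %u\n", n_kv); // 512
    return 0;
}
```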
@@ -1298,7 +1306,7 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
1298
1306
  size_t llama_kv_cache::total_size() const {
1299
1307
  size_t size = 0;
1300
1308
 
1301
- for (const auto & buf : bufs) {
1309
+ for (const auto & [_, buf] : ctxs_bufs) {
1302
1310
  size += ggml_backend_buffer_get_size(buf.get());
1303
1311
  }
1304
1312
 
@@ -2010,8 +2018,3 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
2010
2018
  void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
2011
2019
  kv->set_input_pos_bucket(dst, ubatch);
2012
2020
  }
2013
-
2014
- uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) {
2015
- // the FA kernels require padding to avoid extra runtime boundary checks
2016
- return cparams.flash_attn ? 256u : 32u;
2017
- }
@@ -19,8 +19,6 @@ struct llama_context;
19
19
 
20
20
  class llama_kv_cache : public llama_memory_i {
21
21
  public:
22
- static uint32_t get_padding(const llama_cparams & cparams);
23
-
24
22
  struct stream_copy_info {
25
23
  bool empty() const {
26
24
  assert(ssrc.size() == sdst.size());
@@ -217,8 +215,8 @@ private:
217
215
  // this is the SWA type of the cache - not to be confused with the model SWA type
218
216
  const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
219
217
 
220
- std::vector<ggml_context_ptr> ctxs;
221
- std::vector<ggml_backend_buffer_ptr> bufs;
218
+ // ggml contexts for the KV cache along with the allocated backend buffers:
219
+ std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
222
220
 
223
221
  // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
224
222
  // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
@@ -7,6 +7,7 @@
7
7
 
8
8
  #include <algorithm>
9
9
  #include <cassert>
10
+ #include <cstring>
10
11
  #include <limits>
11
12
  #include <map>
12
13
  #include <stdexcept>
@@ -32,8 +33,15 @@ llama_memory_recurrent::llama_memory_recurrent(
32
33
  cells.clear();
33
34
  cells.resize(mem_size);
34
35
 
36
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
37
+ struct ggml_backend_buft_comparator {
38
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
39
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
40
+ }
41
+ };
42
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
43
+
35
44
  // create a context for each buffer type
36
- std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
37
45
  auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
38
46
  auto it = ctx_map.find(buft);
39
47
  if (it == ctx_map.end()) {
@@ -48,13 +56,12 @@ llama_memory_recurrent::llama_memory_recurrent(
48
56
  return nullptr;
49
57
  }
50
58
 
51
- ctx_map[buft] = ctx;
52
- ctxs.emplace_back(ctx);
59
+ ctx_map.emplace(buft, ctx);
53
60
 
54
61
  return ctx;
55
62
  }
56
63
 
57
- return it->second;
64
+ return it->second.get();
58
65
  };
59
66
 
60
67
  r_l.resize(n_layer);
@@ -93,17 +100,14 @@ llama_memory_recurrent::llama_memory_recurrent(
93
100
  }
94
101
 
95
102
  // allocate tensors and initialize the buffers to avoid NaNs in the padding
96
- for (auto it : ctx_map) {
97
- auto * buft = it.first;
98
- auto * ctx = it.second;
99
-
100
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
103
+ for (auto & [buft, ctx] : ctx_map) {
104
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
101
105
  if (!buf) {
102
106
  throw std::runtime_error("failed to allocate buffer for rs cache");
103
107
  }
104
108
  ggml_backend_buffer_clear(buf, 0);
105
109
  LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
106
- bufs.emplace_back(buf);
110
+ ctxs_bufs.emplace_back(std::move(ctx), buf);
107
111
  }
108
112
 
109
113
  {
@@ -129,7 +133,7 @@ void llama_memory_recurrent::clear(bool data) {
129
133
  used = 0;
130
134
 
131
135
  if (data) {
132
- for (auto & buf : bufs) {
136
+ for (auto & [_, buf] : ctxs_bufs) {
133
137
  ggml_backend_buffer_clear(buf.get(), 0);
134
138
  }
135
139
  }
@@ -364,8 +368,8 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
364
368
 
365
369
  std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
366
370
  std::map<ggml_backend_buffer_type_t, size_t> ret;
367
- for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
368
- ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
371
+ for (const auto & [_, buf] : ctxs_bufs) {
372
+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
369
373
  }
370
374
  return ret;
371
375
  }
@@ -662,7 +666,7 @@ bool llama_memory_recurrent::get_can_shift() const {
662
666
 
663
667
  size_t llama_memory_recurrent::total_size() const {
664
668
  size_t size = 0;
665
- for (const auto & buf : bufs) {
669
+ for (const auto & [_, buf] : ctxs_bufs) {
666
670
  size += ggml_backend_buffer_get_size(buf.get());
667
671
  }
668
672
 
@@ -109,8 +109,8 @@ private:
109
109
 
110
110
  const uint32_t n_seq_max = 1;
111
111
 
112
- std::vector<ggml_context_ptr> ctxs;
113
- std::vector<ggml_backend_buffer_ptr> bufs;
112
+ // ggml contexts for the KV cache along with the allocated backend buffers:
113
+ std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
114
114
 
115
115
  size_t total_size() const;
116
116
 
@@ -15,7 +15,6 @@
15
15
 
16
16
  #include <algorithm>
17
17
  #include <cassert>
18
- #include <cmath>
19
18
  #include <cfloat>
20
19
  #include <cstring>
21
20
  #include <cmath>
@@ -404,6 +403,19 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
404
403
  // add the device default buffer type
405
404
  buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
406
405
 
406
+ // add the device extra buffer type (if any)
407
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
408
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
409
+ ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
410
+
411
+ if (ggml_backend_dev_get_extra_bufts_fn) {
412
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
413
+ while (extra_bufts && *extra_bufts) {
414
+ buft_list.emplace_back(dev, *extra_bufts);
415
+ ++extra_bufts;
416
+ }
417
+ }
418
+
407
419
  return buft_list;
408
420
  }
409
421
 
@@ -425,7 +437,7 @@ struct llama_model::impl {
425
437
  llama_mlocks mlock_mmaps;
426
438
 
427
439
  // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
428
- std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
440
+ std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
429
441
 
430
442
  buft_list_t cpu_buft_list;
431
443
  std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -2219,7 +2231,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2219
2231
  // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
2220
2232
  struct ggml_backend_buft_comparator {
2221
2233
  bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
2222
- return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
2234
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
2223
2235
  }
2224
2236
  };
2225
2237
  std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
@@ -6173,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6173
6185
  bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
6174
6186
  bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
6175
6187
 
6176
- ggml_backend_buffer_t buf = nullptr;
6188
+ std::vector<ggml_backend_buffer_ptr> bufs;
6177
6189
  if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
6178
6190
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
6179
6191
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6186,15 +6198,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6186
6198
  continue;
6187
6199
  }
6188
6200
  const size_t max_size = ggml_get_max_tensor_size(ctx);
6189
- buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
6201
+ ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
6190
6202
  if (buf == nullptr) {
6191
6203
  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
6192
6204
  }
6205
+ bufs.emplace_back(buf);
6193
6206
  buf_map.emplace(idx, buf);
6194
6207
  }
6195
6208
  }
6196
6209
  else {
6197
- buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
6210
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
6198
6211
  if (buf == nullptr) {
6199
6212
  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
6200
6213
  }
@@ -6204,11 +6217,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6204
6217
  mlock_buf->init (ggml_backend_buffer_get_base(buf));
6205
6218
  mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
6206
6219
  }
6220
+ bufs.emplace_back(buf);
6207
6221
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
6208
6222
  buf_map.emplace(idx, buf);
6209
6223
  }
6210
6224
  }
6211
- pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
6225
+ pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
6212
6226
 
6213
6227
  for (auto & buf : buf_map) {
6214
6228
  // indicate that this buffer contains weights
@@ -6234,8 +6248,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6234
6248
  }
6235
6249
 
6236
6250
  // print memory requirements per buffer type
6237
- for (auto & [_, buf] : pimpl->ctxs_bufs) {
6238
- LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
6251
+ for (auto & [_, bufs] : pimpl->ctxs_bufs) {
6252
+ for (auto & buf: bufs) {
6253
+ LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
6254
+ __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
6255
+ }
6239
6256
  }
6240
6257
 
6241
6258
  // populate tensors_by_name
@@ -6287,8 +6304,10 @@ size_t llama_model::n_devices() const {
6287
6304
 
6288
6305
  std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
6289
6306
  std::map<ggml_backend_buffer_type_t, size_t> ret;
6290
- for (const auto & [_, buf] : pimpl->ctxs_bufs) {
6291
- ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
6307
+ for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
6308
+ for (const auto & buf : bufs) {
6309
+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
6310
+ }
6292
6311
  }
6293
6312
  return ret;
6294
6313
  }
@@ -6356,6 +6375,8 @@ void llama_model::print_info() const {
6356
6375
  LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
6357
6376
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
6358
6377
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
6378
+ LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
6379
+ LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
6359
6380
  LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
6360
6381
  LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
6361
6382
  LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
@@ -6456,8 +6477,6 @@ void llama_model::print_info() const {
6456
6477
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6457
6478
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6458
6479
  LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
6459
- LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
6460
- LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
6461
6480
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
6462
6481
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
6463
6482
  LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
@@ -17952,6 +17971,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
17952
17971
  cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
17953
17972
  cb(cur, "result_norm", -1);
17954
17973
 
17974
+ res->t_embd = cur;
17975
+
17955
17976
  // lm_head
17956
17977
  cur = build_lora_mm(model.output, cur);
17957
17978
  cb(cur, "result_output", -1);
@@ -19324,6 +19345,7 @@ struct llm_build_smallthinker : public llm_graph_context{
19324
19345
 
19325
19346
  cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
19326
19347
  cb(cur, "result_norm", -1);
19348
+ res->t_embd = cur;
19327
19349
 
19328
19350
  // lm_head
19329
19351
  cur = build_lora_mm(model.output, cur);
@@ -19619,7 +19641,7 @@ struct llm_build_apertus : public llm_graph_context {
19619
19641
  }
19620
19642
  };
19621
19643
 
19622
- llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
19644
+ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
19623
19645
  llama_memory_i * res;
19624
19646
 
19625
19647
  switch (arch) {
@@ -19670,17 +19692,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
19670
19692
  };
19671
19693
  }
19672
19694
 
19673
- const auto padding = llama_kv_cache::get_padding(cparams);
19674
-
19675
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
19676
-
19677
19695
  res = new llama_memory_hybrid(
19678
19696
  /* model */ *this,
19679
19697
  /* attn_type_k */ params.type_k,
19680
19698
  /* attn_type_v */ params.type_v,
19681
19699
  /* attn_v_trans */ !cparams.flash_attn,
19682
19700
  /* attn_kv_size */ cparams.n_ctx,
19683
- /* attn_n_pad */ padding,
19701
+ /* attn_n_pad */ 1,
19684
19702
  /* attn_n_swa */ hparams.n_swa,
19685
19703
  /* attn_swa_type */ hparams.swa_type,
19686
19704
  /* recurrent_type_k */ GGML_TYPE_F32,
@@ -19692,23 +19710,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
19692
19710
  /* filter_attn */ std::move(filter_attn),
19693
19711
  /* filter_recr */ std::move(filter_recr));
19694
19712
  } else {
19695
- const auto padding = llama_kv_cache::get_padding(cparams);
19696
-
19697
19713
  uint32_t n_ctx_per_stream = cparams.n_ctx;
19698
19714
 
19699
19715
  if (!cparams.kv_unified) {
19700
19716
  n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
19701
- n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
19702
-
19703
- cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
19704
- } else {
19705
- n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
19706
-
19707
- cparams.n_ctx = n_ctx_per_stream;
19708
19717
  }
19709
19718
 
19710
- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
19711
-
19712
19719
  llama_memory_i::layer_reuse_cb reuse = nullptr;
19713
19720
 
19714
19721
  if (arch == LLM_ARCH_GEMMA3N) {
@@ -19735,7 +19742,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
19735
19742
  n_ctx_per_stream,
19736
19743
  cparams.n_seq_max,
19737
19744
  cparams.n_ubatch,
19738
- padding,
19745
+ 1,
19739
19746
  nullptr,
19740
19747
  reuse);
19741
19748
  } else {
@@ -19750,7 +19757,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
19750
19757
  cparams.kv_unified,
19751
19758
  n_ctx_per_stream,
19752
19759
  cparams.n_seq_max,
19753
- padding,
19760
+ 1,
19754
19761
  hparams.n_swa,
19755
19762
  hparams.swa_type,
19756
19763
  nullptr,
@@ -500,9 +500,8 @@ struct llama_model {
500
500
 
501
501
  ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
502
502
 
503
- // note: can mutate `cparams`
504
503
  // TODO: move this to new llm_arch_model_i interface
505
- llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
504
+ llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
506
505
 
507
506
  // TODO: move this to new llm_arch_model_i interface
508
507
  ggml_cgraph * build_graph(const llm_graph_params & params) const;