@fugood/llama.node 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +12 -12
  3. package/src/llama.cpp/CMakeLists.txt +0 -1
  4. package/src/llama.cpp/common/arg.cpp +17 -0
  5. package/src/llama.cpp/common/chat.cpp +37 -20
  6. package/src/llama.cpp/common/chat.h +2 -0
  7. package/src/llama.cpp/common/common.h +4 -0
  8. package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
  9. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  10. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  11. package/src/llama.cpp/ggml/include/ggml.h +181 -10
  12. package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  20. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
  21. package/src/llama.cpp/include/llama.h +1 -0
  22. package/src/llama.cpp/src/llama-arch.cpp +108 -2
  23. package/src/llama.cpp/src/llama-arch.h +7 -0
  24. package/src/llama.cpp/src/llama-batch.cpp +27 -1
  25. package/src/llama.cpp/src/llama-batch.h +8 -1
  26. package/src/llama.cpp/src/llama-chat.cpp +15 -0
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-graph.cpp +95 -81
  29. package/src/llama.cpp/src/llama-graph.h +43 -16
  30. package/src/llama.cpp/src/llama-hparams.cpp +2 -1
  31. package/src/llama.cpp/src/llama-hparams.h +1 -0
  32. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  34. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  35. package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  36. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  37. package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  38. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  39. package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
  40. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  41. package/src/llama.cpp/src/llama-memory.h +3 -0
  42. package/src/llama.cpp/src/llama-model.cpp +1374 -210
  43. package/src/llama.cpp/src/llama-model.h +3 -0
  44. package/src/llama.cpp/src/llama-vocab.cpp +8 -1
  45. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.0.1",
+ "version": "1.0.3",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -70,19 +70,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.0.1",
- "@fugood/node-llama-linux-x64-vulkan": "1.0.1",
- "@fugood/node-llama-linux-x64-cuda": "1.0.1",
- "@fugood/node-llama-linux-arm64": "1.0.1",
- "@fugood/node-llama-linux-arm64-vulkan": "1.0.1",
- "@fugood/node-llama-linux-arm64-cuda": "1.0.1",
- "@fugood/node-llama-win32-x64": "1.0.1",
- "@fugood/node-llama-win32-x64-vulkan": "1.0.1",
- "@fugood/node-llama-win32-x64-cuda": "1.0.1",
- "@fugood/node-llama-win32-arm64": "1.0.1",
- "@fugood/node-llama-win32-arm64-vulkan": "1.0.1",
- "@fugood/node-llama-darwin-x64": "1.0.1",
- "@fugood/node-llama-darwin-arm64": "1.0.1"
+ "@fugood/node-llama-linux-x64": "1.0.3",
+ "@fugood/node-llama-linux-x64-vulkan": "1.0.3",
+ "@fugood/node-llama-linux-x64-cuda": "1.0.3",
+ "@fugood/node-llama-linux-arm64": "1.0.3",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.0.3",
+ "@fugood/node-llama-linux-arm64-cuda": "1.0.3",
+ "@fugood/node-llama-win32-x64": "1.0.3",
+ "@fugood/node-llama-win32-x64-vulkan": "1.0.3",
+ "@fugood/node-llama-win32-x64-cuda": "1.0.3",
+ "@fugood/node-llama-win32-arm64": "1.0.3",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.0.3",
+ "@fugood/node-llama-darwin-x64": "1.0.3",
+ "@fugood/node-llama-darwin-arm64": "1.0.3"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,5 +1,5 @@
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 7d9aaeb1..a7b68d4a 100644
+ index 114dbfcc..6771bd43 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -12,7 +12,7 @@ index 7d9aaeb1..a7b68d4a 100644
  #include <cstdio>
  #include <exception>
  #include <iostream>
- @@ -121,14 +118,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+ @@ -123,14 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
  return diffs;
  }

@@ -27,13 +27,13 @@ index 7d9aaeb1..a7b68d4a 100644
  struct templates_params {
  json messages;
  json tools;
- diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
- index 9f59e6b0..9b7fe724 100644
+ diff --git a/common/chat.h b/common/chat.h
+ index ca807c14..56649863 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
- @@ -8,7 +8,16 @@
- #include <string>
+ @@ -9,7 +9,16 @@
  #include <vector>
+ #include <map>

  -struct common_chat_templates;
  +#include <minja/chat-template.hpp>
@@ -62,10 +62,10 @@ index e4e71ad1..091ddda4 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index e08a59ea..d120b67d 100644
+ index 8922090e..3c2d1a6a 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -223,6 +223,7 @@ enum common_reasoning_format {
+ @@ -224,6 +224,7 @@ enum common_reasoning_format {
  };

  struct common_params {
@@ -74,7 +74,7 @@ index e08a59ea..d120b67d 100644
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- index 71b1d67b..093cd6f9 100644
+ index 671fad4d..93fc3cd7 100644
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  @@ -104,7 +104,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -87,10 +87,10 @@ index 71b1d67b..093cd6f9 100644
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
  diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
- index 39f022f3..7ae9047e 100644
+ index b97e7bf9..c3eb9519 100644
  --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
- @@ -110,7 +110,7 @@ if (Vulkan_FOUND)
+ @@ -111,7 +111,7 @@ if (Vulkan_FOUND)
  endif()

  # Set up toolchain for host compilation whether cross-compiling or not
@@ -99,7 +99,7 @@ index 39f022f3..7ae9047e 100644
  if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
  set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
  else()
- @@ -130,7 +130,7 @@ if (Vulkan_FOUND)
+ @@ -131,7 +131,7 @@ if (Vulkan_FOUND)

  include(ExternalProject)

package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -120,7 +120,6 @@ endfunction()

  llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
  llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
- llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
  llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
  llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
  llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.public_path = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+ add_opt(common_arg(
+ {"--api-prefix"}, "PREFIX",
+ string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.api_prefix = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
  add_opt(common_arg(
  {"--no-webui"},
  string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -2794,6 +2801,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.ssl_file_cert = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+ add_opt(common_arg(
+ {"--chat-template-kwargs"}, "STRING",
+ string_format("sets additional params for the json template parser"),
+ [](common_params & params, const std::string & value) {
+ auto parsed = json::parse(value);
+ for (const auto & item : parsed.items()) {
+ params.default_template_kwargs[item.key()] = item.value().dump();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
  add_opt(common_arg(
  {"-to", "--timeout"}, "N",
  string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -14,6 +14,8 @@
  #include <string>
  #include <vector>

+ using json = nlohmann::ordered_json;
+
  static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
  auto time = std::chrono::system_clock::to_time_t(now);
  auto local_time = *std::localtime(&time);
@@ -129,6 +131,7 @@ struct templates_params {
  bool add_generation_prompt = true;
  bool enable_thinking = true;
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+ json extra_context;
  };

  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -709,16 +712,23 @@ static void foreach_function(const json & tools, const std::function<void(const

  static std::string apply(
  const common_chat_template & tmpl,
- const nlohmann::ordered_json & messages,
- const nlohmann::ordered_json & tools,
- bool add_generation_prompt,
- const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
+ const struct templates_params & inputs,
+ const std::optional<json> & messages_override = std::nullopt,
+ const std::optional<json> & tools_override = std::nullopt,
+ const std::optional<json> & additional_context = std::nullopt)
  {
  minja::chat_template_inputs tmpl_inputs;
- tmpl_inputs.messages = messages;
- tmpl_inputs.tools = tools;
- tmpl_inputs.add_generation_prompt = add_generation_prompt;
- tmpl_inputs.extra_context = extra_context;
+ tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+ if (tools_override) {
+ tmpl_inputs.tools = *tools_override;
+ } else {
+ tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+ }
+ tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+ tmpl_inputs.extra_context = inputs.extra_context;
+ if (additional_context) {
+ tmpl_inputs.extra_context.merge_patch(*additional_context);
+ }
  // TODO: add flag to control date/time, if only for testing purposes.
  // tmpl_inputs.now = std::chrono::system_clock::now();

@@ -817,7 +827,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
  inputs.messages,
  "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");

- data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
  data.format = COMMON_CHAT_FORMAT_GENERIC;
  return data;
  }
@@ -893,7 +903,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
  data.preserved_tokens = {
  "[TOOL_CALLS]",
  };
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs);
  data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
  return data;
  }
@@ -923,7 +933,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
  adjusted_messages.push_back(msg);
  }
  }
- data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
  data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
  if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
  if (!inputs.enable_thinking) {
@@ -1111,7 +1121,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
  } else {
  data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
  }
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+ data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
  {"date_string", format_time(inputs.now, "%d %b %Y")},
  {"tools_in_user_message", false},
  {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1176,7 +1186,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w

  static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
- auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ auto prompt = apply(tmpl, inputs);

  // Hacks to fix the official (broken) prompt.
  // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1271,7 +1281,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
  static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
  LOG_DBG("%s\n", __func__);
  common_chat_params data;
- data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
+ data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
  {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
  {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
  });
@@ -1327,7 +1337,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
  // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
  // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
  common_chat_params data;
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs);
  data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
  if (inputs.tools.is_array() && !inputs.tools.empty()) {
  data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1454,7 +1464,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
  data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
  }

- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs);
  // TODO: if (has_raw_python)
  return data;
  }
@@ -1487,14 +1497,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
  static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;

- json additional_context = {
+ json extra_context = json {
  {"enable_thinking", inputs.enable_thinking},
  };
+ extra_context.update(inputs.extra_context);

- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
+ data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
  data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
  if (string_ends_with(data.prompt, "<think>\n")) {
- if (!inputs.enable_thinking) {
+ if (!extra_context["enable_thinking"]) {
  data.prompt += "</think>";
  } else {
  data.thinking_forced_open = true;
@@ -1680,7 +1691,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {

  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ data.prompt = apply(tmpl, inputs);
  data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
  data.grammar_lazy = false;
  if (!inputs.json_schema.is_null()) {
@@ -1711,6 +1722,12 @@ static common_chat_params common_chat_templates_apply_jinja(
  params.enable_thinking = inputs.enable_thinking;
  params.grammar = inputs.grammar;
  params.now = inputs.now;
+
+ params.extra_context = json::object();
+ for (auto el : inputs.chat_template_kwargs) {
+ params.extra_context[el.first] = json::parse(el.second);
+ }
+
  if (!inputs.json_schema.empty()) {
  params.json_schema = json::parse(inputs.json_schema);
  }
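For context, the refactored apply() above copies inputs.extra_context (populated from --chat-template-kwargs) into the minja inputs and then overlays any format-specific values with nlohmann::json's merge_patch. An illustrative, standalone sketch of that overlay (made-up keys and values; not code from the package):

    // Sketch: merge_patch keeps unrelated keys and overwrites keys present in the patch.
    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        json extra_context = {{"enable_thinking", false}, {"custom_key", "kept"}};
        json additional_context = {{"enable_thinking", true}, {"date_string", "06 Jul 2025"}};

        extra_context.merge_patch(additional_context);

        std::cout << extra_context.dump(2) << "\n";
        // -> {"enable_thinking": true, "custom_key": "kept", "date_string": "06 Jul 2025"}
        return 0;
    }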
package/src/llama.cpp/common/chat.h CHANGED
@@ -7,6 +7,7 @@
  #include <chrono>
  #include <string>
  #include <vector>
+ #include <map>

  #include <minja/chat-template.hpp>
  #include <minja/minja.hpp>
@@ -134,6 +135,7 @@ struct common_chat_templates_inputs {
  common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
  bool enable_thinking = true;
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+ std::map<std::string, std::string> chat_template_kwargs;
  };

  struct common_chat_params {
package/src/llama.cpp/common/common.h CHANGED
@@ -8,6 +8,7 @@
  #include <string>
  #include <string_view>
  #include <vector>
+ #include <map>
  #include <sstream>

  #ifdef _WIN32
@@ -370,6 +371,7 @@ struct common_params {

  std::string hostname = "127.0.0.1";
  std::string public_path = ""; // NOLINT
+ std::string api_prefix = ""; // NOLINT
  std::string chat_template = ""; // NOLINT
  bool use_jinja = false; // NOLINT
  bool enable_chat_template = true;
@@ -382,6 +384,8 @@ struct common_params {
  std::string ssl_file_key = ""; // NOLINT
  std::string ssl_file_cert = ""; // NOLINT

+ std::map<std::string, std::string> default_template_kwargs;
+
  // "advanced" endpoints are disabled by default for better security
  bool webui = true;
  bool endpoint_slots = false;
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
  option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
  option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
  option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
- option(GGML_KOMPUTE "ggml: use Kompute" OFF)
  option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
  option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
  option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
  include/ggml-cann.h
  include/ggml-cpp.h
  include/ggml-cuda.h
- include/ggml-kompute.h
  include/ggml-opt.h
  include/ggml-metal.h
  include/ggml-rpc.h
@@ -360,6 +358,13 @@ write_basic_package_version_file(
  VERSION ${GGML_INSTALL_VERSION}
  COMPATIBILITY SameMajorVersion)

+ target_compile_definitions(ggml-base PRIVATE
+ GGML_VERSION="${GGML_INSTALL_VERSION}"
+ GGML_COMMIT="${GGML_BUILD_COMMIT}"
+ )
+ message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
+ message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
+
  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
  ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
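Aside: the new target_compile_definitions call bakes the ggml version and commit into ggml-base as string-valued macros. A generic, standalone illustration of consuming such definitions (not code from the package; the fallbacks exist only so the snippet also builds without -DGGML_VERSION/-DGGML_COMMIT):

    // Sketch: print string macros supplied on the compiler command line,
    // e.g. -DGGML_VERSION='"0.0.1"' -DGGML_COMMIT='"abc123"'.
    #include <cstdio>

    #ifndef GGML_VERSION
    #define GGML_VERSION "unknown" // fallback when the definition is absent
    #endif
    #ifndef GGML_COMMIT
    #define GGML_COMMIT "unknown"
    #endif

    int main() {
        std::printf("ggml version: %s\n", GGML_VERSION);
        std::printf("ggml commit:  %s\n", GGML_COMMIT);
        return 0;
    }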
package/src/llama.cpp/ggml/include/ggml-backend.h CHANGED
@@ -339,7 +339,7 @@ extern "C" {
  typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

  // Compare the output of two backends
- GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+ GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);

  // Tensor initialization
  GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
package/src/llama.cpp/ggml/include/ggml-cpu.h CHANGED
@@ -134,6 +134,7 @@ extern "C" {

  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

+ GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
  GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
  GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
  GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
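Note: the new ggml_cpu_fp32_to_fp32 declaration parallels the existing converters around it; by analogy it presumably copies count float values from src to dst, though that is an assumption based on the name rather than on code shown in this diff. A self-contained sketch using a local stand-in with the same shape:

    // Sketch only: stand-in for ggml_cpu_fp32_to_fp32 (assumed element-wise copy);
    // the real symbol is provided by the ggml CPU backend.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static void ggml_cpu_fp32_to_fp32_stand_in(const float * src, float * dst, int64_t n) {
        for (int64_t i = 0; i < n; ++i) {
            dst[i] = src[i];
        }
    }

    int main() {
        std::vector<float> src = {1.0f, 2.5f, -3.0f};
        std::vector<float> dst(src.size());
        ggml_cpu_fp32_to_fp32_stand_in(src.data(), dst.data(), (int64_t) src.size());
        for (float v : dst) {
            std::printf("%g\n", v); // 1 2.5 -3
        }
        return 0;
    }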