@fugood/llama.node 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +17 -13
  3. package/src/LlamaCompletionWorker.cpp +2 -0
  4. package/src/llama.cpp/common/arg.cpp +28 -11
  5. package/src/llama.cpp/common/chat.cpp +46 -2
  6. package/src/llama.cpp/common/chat.h +7 -2
  7. package/src/llama.cpp/common/common.h +3 -2
  8. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  9. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  13. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +6 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  17. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  20. package/src/llama.cpp/include/llama.h +1 -0
  21. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  22. package/src/llama.cpp/src/llama-arch.h +10 -0
  23. package/src/llama.cpp/src/llama-chat.cpp +13 -0
  24. package/src/llama.cpp/src/llama-chat.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +8 -8
  26. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  27. package/src/llama.cpp/src/llama-graph.h +38 -0
  28. package/src/llama.cpp/src/llama-hparams.h +5 -3
  29. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +4 -0
  30. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  31. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  32. package/src/llama.cpp/src/llama-model.cpp +499 -4
  33. package/src/llama.cpp/src/llama-model.h +24 -4
  34. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  35. package/src/llama.cpp/src/llama-vocab.cpp +42 -0
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.5",
+ "version": "1.1.6",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.5",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.5",
- "@fugood/node-llama-linux-x64-cuda": "1.1.5",
- "@fugood/node-llama-linux-arm64": "1.1.5",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.5",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.5",
- "@fugood/node-llama-win32-x64": "1.1.5",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.5",
- "@fugood/node-llama-win32-x64-cuda": "1.1.5",
- "@fugood/node-llama-win32-arm64": "1.1.5",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.5",
- "@fugood/node-llama-darwin-x64": "1.1.5",
- "@fugood/node-llama-darwin-arm64": "1.1.5"
+ "@fugood/node-llama-linux-x64": "1.1.6",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.6",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.6",
+ "@fugood/node-llama-linux-arm64": "1.1.6",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.6",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.6",
+ "@fugood/node-llama-win32-x64": "1.1.6",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.6",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.6",
+ "@fugood/node-llama-win32-arm64": "1.1.6",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.6",
+ "@fugood/node-llama-darwin-x64": "1.1.6",
+ "@fugood/node-llama-darwin-arm64": "1.1.6"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,5 +1,5 @@
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 114dbfcc..6771bd43 100644
+ index 60805ab3..71b4236a 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -12,13 +12,15 @@ index 114dbfcc..6771bd43 100644
  #include <cstdio>
  #include <exception>
  #include <iostream>
- @@ -123,14 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+ @@ -123,16 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
  return diffs;
  }

  -typedef minja::chat_template common_chat_template;
  -
  -struct common_chat_templates {
+ - bool add_bos;
+ - bool add_eos;
  - bool has_explicit_template; // Model had builtin template or template overridde was specified.
  - std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
  - std::unique_ptr<common_chat_template> template_tool_use;
@@ -27,21 +29,23 @@ index 114dbfcc..6771bd43 100644
  struct templates_params {
  json messages;
  json tools;
- diff --git a/common/chat.h b/common/chat.h
- index ca807c14..56649863 100644
+ diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
+ index b014f9f0..3a868797 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
- @@ -9,7 +9,16 @@
+ @@ -9,7 +9,18 @@
  #include <vector>
  #include <map>

  -struct common_chat_templates;
- +#include <minja/chat-template.hpp>
- +#include <minja/minja.hpp>
+ +#include "minja/chat-template.hpp"
+ +#include "minja/minja.hpp"
  +
  +typedef minja::chat_template common_chat_template;
  +
  +struct common_chat_templates {
+ + bool add_bos;
+ + bool add_eos;
  + bool has_explicit_template; // Model had builtin template or template overridde was specified.
  + std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
  + std::unique_ptr<common_chat_template> template_tool_use;
@@ -50,10 +54,10 @@ index ca807c14..56649863 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index e4e71ad1..091ddda4 100644
+ index c6962d1d..ba5a4786 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1101,6 +1101,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1116,6 +1116,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -62,10 +66,10 @@ index e4e71ad1..091ddda4 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 8922090e..3c2d1a6a 100644
+ index 6c1c7ee2..c3eb0552 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -224,6 +224,7 @@ enum common_reasoning_format {
+ @@ -242,6 +242,7 @@ enum common_reasoning_format {
  };

  struct common_params {
@@ -74,10 +78,10 @@ index 8922090e..3c2d1a6a 100644
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- index 671fad4d..93fc3cd7 100644
+ index f188d163..0c33acad 100644
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- @@ -104,7 +104,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
+ @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  )

  if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -320,6 +320,8 @@ void LlamaCompletionWorker::OnOK() {
  chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
  } else if (_reasoning_format == "deepseek-legacy") {
  chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+ } else if (_reasoning_format == "auto") {
+ chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
  } else {
  chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_NONE;
  }
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -24,6 +24,7 @@
  #include <cstdarg>
  #include <filesystem>
  #include <fstream>
+ #include <list>
  #include <regex>
  #include <set>
  #include <string>
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  throw std::invalid_argument("unknown buffer type");
  }
- // FIXME: this leaks memory
- params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+ // keep strings alive and avoid leaking memory by storing them in a static vector
+ static std::list<std::string> buft_overrides;
+ buft_overrides.push_back(tensor_name);
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
  }
  }
  ));
  add_opt(common_arg(
- {"--cpu-moe"},
- "use CPU for Mixture of Experts (MoE) weights",
+ {"--cpu-moe", "-cmoe"},
+ "keep all Mixture of Experts (MoE) weights in the CPU",
  [](common_params & params) {
- params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
- params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
- params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+ params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
  }
  ).set_env("LLAMA_ARG_CPU_MOE"));
+ add_opt(common_arg(
+ {"--n-cpu-moe", "-ncmoe"}, "N",
+ "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+ [](common_params & params, int value) {
+ if (value < 0) {
+ throw std::invalid_argument("invalid value");
+ }
+ for (int i = 0; i < value; ++i) {
+ // keep strings alive and avoid leaking memory by storing them in a static vector
+ static std::list<std::string> buft_overrides;
+ buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+ }
+ }
+ ).set_env("LLAMA_ARG_N_CPU_MOE"));
  add_opt(common_arg(
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
  "number of layers to store in VRAM",
@@ -2649,10 +2665,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
  add_opt(common_arg(
  {"--output-format"}, "{gguf,dat}",
- string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
+ string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
  [](common_params & params, const std::string & value) {
- /**/ if (value == "gguf") { params.imat_dat = false; }
- else if (value == "dat") { params.imat_dat = true; }
+ /**/ if (value == "gguf") { params.imat_dat = -1; }
+ else if (value == "dat") { params.imat_dat = 1; }
  else { throw std::invalid_argument("invalid output format"); }
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
@@ -2931,11 +2947,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
  "- none: leaves thoughts unparsed in `message.content`\n"
  "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
- "(default: deepseek)",
+ "(default: auto)",
  [](common_params & params, const std::string & value) {
  /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
  else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
  else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+ else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
  else { throw std::invalid_argument("invalid value"); }
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
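Editorial note: the new --cpu-moe / --n-cpu-moe options above work by pushing {regex, CPU buffer type} overrides into params.tensor_buft_overrides. As a rough, standalone sketch (not the llama.cpp helper; string_format is replaced with snprintf and n_cpu_moe = 3 is a hypothetical value), this is the set of patterns --n-cpu-moe N expands to:

    // Sketch of the per-layer pattern expansion performed by --n-cpu-moe above.
    #include <cstdio>

    int main() {
        const int n_cpu_moe = 3; // hypothetical example value
        for (int i = 0; i < n_cpu_moe; ++i) {
            char pattern[64];
            // same regex shape as in the diff: matches layer i's MoE up/down/gate expert tensors
            std::snprintf(pattern, sizeof(pattern), "blk\\.%d\\.ffn_(up|down|gate)_exps", i);
            std::printf("%s -> ggml_backend_cpu_buffer_type()\n", pattern);
        }
        return 0;
    }

Each printed pattern corresponds to one tensor_buft_overrides entry, so the expert weights of the first N layers stay in host memory while the remaining layers follow the usual -ngl placement.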
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -132,6 +132,8 @@ struct templates_params {
  bool enable_thinking = true;
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  json extra_context;
+ bool add_bos;
+ bool add_eos;
  };

  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -434,6 +436,8 @@ std::string common_chat_format_single(

  common_chat_templates_inputs inputs;
  inputs.use_jinja = use_jinja;
+ inputs.add_bos = tmpls->add_bos;
+ inputs.add_eos = tmpls->add_eos;

  std::string fmt_past_msg;
  if (!past_msg.empty()) {
@@ -458,6 +462,8 @@ std::string common_chat_format_single(
  std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
  common_chat_templates_inputs inputs;
  inputs.use_jinja = use_jinja;
+ inputs.add_bos = tmpls->add_bos;
+ inputs.add_eos = tmpls->add_eos;
  auto add_simple_msg = [&](auto role, auto content) {
  common_chat_msg msg;
  msg.role = role;
@@ -535,6 +541,8 @@ common_chat_templates_ptr common_chat_templates_init(
  }
  std::string token_bos = bos_token_override;
  std::string token_eos = eos_token_override;
+ bool add_bos = false;
+ bool add_eos = false;
  if (model) {
  const auto * vocab = llama_model_get_vocab(model);
  const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -549,9 +557,13 @@ common_chat_templates_ptr common_chat_templates_init(
  };
  token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
  token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+ add_bos = llama_vocab_get_add_bos(vocab);
+ add_eos = llama_vocab_get_add_eos(vocab);
  }
  common_chat_templates_ptr tmpls(new common_chat_templates());
  tmpls->has_explicit_template = has_explicit_template;
+ tmpls->add_bos = add_bos;
+ tmpls->add_eos = add_eos;
  try {
  tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
  } catch (const std::exception & e) {
@@ -581,6 +593,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
  case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
  case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+ case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
  default:
  throw std::runtime_error("Unknown chat format");
  }
@@ -589,6 +602,7 @@ const char * common_chat_format_name(common_chat_format format) {
  const char * common_reasoning_format_name(common_reasoning_format format) {
  switch (format) {
  case COMMON_REASONING_FORMAT_NONE: return "none";
+ case COMMON_REASONING_FORMAT_AUTO: return "auto";
  case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
  case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
  default:
@@ -737,10 +751,10 @@ static std::string apply(
  // instead of using `chat_template_options.use_bos_token = false`, since these tokens
  // may be needed inside the template / between messages too.
  auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
- if (string_starts_with(result, tmpl.bos_token())) {
+ if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
  result = result.substr(tmpl.bos_token().size());
  }
- if (string_ends_with(result, tmpl.eos_token())) {
+ if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
  result = result.substr(0, result.size() - tmpl.eos_token().size());
  }
  return result;
@@ -1278,6 +1292,26 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
  tool_calls_end);
  }

+ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+ auto prompt = apply(tmpl, inputs);
+
+ data.prompt = prompt;
+ data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+ // TODO: support tool calls in GPT-OSS?
+
+ return data;
+ }
+ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+ // TODO @ngxson : this won't work with --special enabled, we should fix that
+ builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+ }
+
  static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
  LOG_DBG("%s\n", __func__);
  common_chat_params data;
@@ -1720,6 +1754,8 @@ static common_chat_params common_chat_templates_apply_jinja(
  params.enable_thinking = inputs.enable_thinking;
  params.grammar = inputs.grammar;
  params.now = inputs.now;
+ params.add_bos = inputs.add_bos;
+ params.add_eos = inputs.add_eos;

  params.extra_context = json::object();
  for (auto el : inputs.chat_template_kwargs) {
@@ -1761,6 +1797,11 @@ static common_chat_params common_chat_templates_apply_jinja(
  return common_chat_params_init_hermes_2_pro(tmpl, params);
  }

+ // GPT-OSS
+ if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+ return common_chat_params_init_gpt_oss(tmpl, params);
+ }
+
  // Use generic handler when mixing tools + JSON schema.
  // TODO: support that mix in handlers below.
  if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1912,6 +1953,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  case COMMON_CHAT_FORMAT_COMMAND_R7B:
  common_chat_parse_command_r7b(builder);
  break;
+ case COMMON_CHAT_FORMAT_GPT_OSS:
+ common_chat_parse_gpt_oss(builder);
+ break;
  default:
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
  }
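Editorial note: the GPT-OSS handling added above is driven by two literal channel markers. The following self-contained sketch (plain std::string handling, not the common_chat_msg_parser API, with a hypothetical raw completion) illustrates what the try_parse_reasoning call effectively separates, assuming the analysis channel precedes the final channel:

    #include <iostream>
    #include <string>

    int main() {
        // The two markers passed to try_parse_reasoning in common_chat_parse_gpt_oss above.
        const std::string r_open  = "<|channel|>analysis<|message|>";
        const std::string r_close = "<|start|>assistant<|channel|>final<|message|>";

        // Hypothetical raw completion, used only for illustration.
        const std::string raw = r_open + "weighing the options..." + r_close + "Here is the answer.";

        std::string reasoning;
        std::string content = raw;
        const size_t b = raw.find(r_open);
        const size_t e = raw.find(r_close);
        if (b != std::string::npos && e != std::string::npos && e > b) {
            reasoning = raw.substr(b + r_open.size(), e - (b + r_open.size()));
            content   = raw.substr(e + r_close.size());
        }
        std::cout << "reasoning_content: " << reasoning << "\n";
        std::cout << "content:           " << content   << "\n";
        return 0;
    }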
package/src/llama.cpp/common/chat.h CHANGED
@@ -9,12 +9,14 @@
  #include <vector>
  #include <map>

- #include <minja/chat-template.hpp>
- #include <minja/minja.hpp>
+ #include "minja/chat-template.hpp"
+ #include "minja/minja.hpp"

  typedef minja::chat_template common_chat_template;

  struct common_chat_templates {
+ bool add_bos;
+ bool add_eos;
  bool has_explicit_template; // Model had builtin template or template overridde was specified.
  std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
  std::unique_ptr<common_chat_template> template_tool_use;
@@ -118,6 +120,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
  COMMON_CHAT_FORMAT_HERMES_2_PRO,
  COMMON_CHAT_FORMAT_COMMAND_R7B,
+ COMMON_CHAT_FORMAT_GPT_OSS,

  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
  };
@@ -136,6 +139,8 @@ struct common_chat_templates_inputs {
  bool enable_thinking = true;
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  std::map<std::string, std::string> chat_template_kwargs;
+ bool add_bos = false;
+ bool add_eos = false;
  };

  struct common_chat_params {
package/src/llama.cpp/common/common.h CHANGED
@@ -236,6 +236,7 @@ struct common_params_diffusion {

  enum common_reasoning_format {
  COMMON_REASONING_FORMAT_NONE,
+ COMMON_REASONING_FORMAT_AUTO,
  COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
  COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
  };
@@ -395,7 +396,7 @@ struct common_params {
  std::string chat_template = ""; // NOLINT
  bool use_jinja = false; // NOLINT
  bool enable_chat_template = true;
- common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
  int reasoning_budget = -1;
  bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

@@ -440,7 +441,7 @@ struct common_params {
  int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
  int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
  int32_t i_chunk = 0; // start processing from this chunk
- bool imat_dat = false; // whether the legacy imatrix.dat format should be output
+ int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

  bool process_output = false; // collect data for the output tensor
  bool compute_ppl = true; // whether to compute perplexity
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -39,8 +39,9 @@ if (WIN32)
  set(CMAKE_SHARED_MODULE_PREFIX "")
  endif()

- option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
- option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+ option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+ option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+ set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL")

  #
  # option list
package/src/llama.cpp/ggml/include/ggml.h CHANGED
@@ -304,6 +304,16 @@
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
  GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

+ #define GGML_TENSOR_TERNARY_OP_LOCALS \
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
  #define GGML_TENSOR_BINARY_OP_LOCALS01 \
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
  GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
@@ -395,7 +405,8 @@ extern "C" {
  // GGML_TYPE_IQ4_NL_4_4 = 36,
  // GGML_TYPE_IQ4_NL_4_8 = 37,
  // GGML_TYPE_IQ4_NL_8_8 = 38,
- GGML_TYPE_COUNT = 39,
+ GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
+ GGML_TYPE_COUNT = 40,
  };

  // precision
@@ -430,6 +441,7 @@ extern "C" {
  GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
  GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
+ GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
  };

  // available tensor operations:
@@ -438,6 +450,7 @@ extern "C" {

  GGML_OP_DUP,
  GGML_OP_ADD,
+ GGML_OP_ADD_ID,
  GGML_OP_ADD1,
  GGML_OP_ACC,
  GGML_OP_SUB,
@@ -557,6 +570,7 @@ extern "C" {
  GGML_GLU_OP_REGLU,
  GGML_GLU_OP_GEGLU,
  GGML_GLU_OP_SWIGLU,
+ GGML_GLU_OP_SWIGLU_OAI,
  GGML_GLU_OP_GEGLU_ERF,
  GGML_GLU_OP_GEGLU_QUICK,

@@ -831,6 +845,13 @@ extern "C" {
  struct ggml_tensor * b,
  enum ggml_type type);

+ // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
+ GGML_API struct ggml_tensor * ggml_add_id(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * ids);
+
  GGML_API struct ggml_tensor * ggml_add1(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1198,6 +1219,13 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ GGML_API struct ggml_tensor * ggml_swiglu_oai(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ float alpha,
+ float limit);
+
  // normalize along rows
  GGML_API struct ggml_tensor * ggml_norm(
  struct ggml_context * ctx,
@@ -1570,6 +1598,10 @@ extern "C" {
  float scale,
  float max_bias);

+ GGML_API void ggml_soft_max_add_sinks(
+ struct ggml_tensor * a,
+ struct ggml_tensor * sinks);
+
  GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -2052,6 +2084,10 @@ extern "C" {
  GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
  const struct ggml_tensor * a);

+ GGML_API void ggml_flash_attn_ext_add_sinks(
+ struct ggml_tensor * a,
+ struct ggml_tensor * sinks);
+
  // TODO: needs to be adapted to ggml_flash_attn_ext
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
  struct ggml_context * ctx,
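Editorial note: the comment on ggml_add_id above, dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]], corresponds to the following dense reference loop. This is a plain-array sketch under an assumed contiguous layout with i0 fastest, not the ggml implementation:

    #include <cstdint>

    // Reference semantics of ADD_ID on contiguous float data:
    // every row (i1, i2) of a gets the row of b selected by ids[i1, i2].
    static void add_id_ref(float * dst, const float * a, const float * b, const int32_t * ids,
                           int64_t ne0, int64_t ne1, int64_t ne2) {
        for (int64_t i2 = 0; i2 < ne2; ++i2) {
            for (int64_t i1 = 0; i1 < ne1; ++i1) {
                const int32_t row = ids[i2*ne1 + i1];        // which row of b to add
                const float * src = a   + (i2*ne1 + i1)*ne0;
                float       * out = dst + (i2*ne1 + i1)*ne0;
                for (int64_t i0 = 0; i0 < ne0; ++i0) {
                    out[i0] = src[i0] + b[row*ne0 + i0];
                }
            }
        }
    }

This per-row selection is the shape of a per-expert bias add, which fits the MoE-oriented changes elsewhere in this release.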
package/src/llama.cpp/ggml/src/CMakeLists.txt CHANGED
@@ -214,6 +214,13 @@ add_library(ggml
  ggml-backend-reg.cpp)
  add_library(ggml::ggml ALIAS ggml)

+ if (GGML_BACKEND_DIR)
+ if (NOT GGML_BACKEND_DL)
+ message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
+ endif()
+ target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
+ endif()
+
  target_link_libraries(ggml PUBLIC ggml-base)

  if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -227,7 +234,11 @@ function(ggml_add_backend_library backend)
  set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
  target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
  add_dependencies(ggml ${backend})
- install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+ if (GGML_BACKEND_DIR)
+ install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
+ else()
+ install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+ endif()
  else()
  add_library(${backend} ${ARGN})
  target_link_libraries(ggml PUBLIC ${backend})
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c CHANGED
@@ -589,6 +589,67 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
  *s = sumf;
  }

+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ assert(n % QK_MXFP4 == 0);
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+ const block_mxfp4 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_MXFP4;
+
+ int ib = 0;
+ float sumf = 0;
+
+ #if defined __ARM_NEON
+ const int8x16_t values = vld1q_s8(kvalues_mxfp4);
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
+ uint8x16x2_t q4bits;
+ int8x16x4_t q4b;
+ int8x16x4_t q8b;
+ int32x4_t prod_1;
+ int32x4_t prod_2;
+
+ for (; ib + 1 < nb; ib += 2) {
+ q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
+ q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
+ q8b.val[0] = vld1q_s8(y[ib + 0].qs);
+ q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
+ q8b.val[2] = vld1q_s8(y[ib + 1].qs);
+ q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);
+
+ q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
+ q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+ q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
+ q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+ prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+ prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+ sumf +=
+ GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+ GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+ }
+
+ #endif
+ for (; ib < nb; ++ib) {
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+ int sumi1 = 0;
+ int sumi2 = 0;
+ for (int j = 0; j < QK_MXFP4/2; ++j) {
+ sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
+ sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
+ }
+ sumf += d * (sumi1 + sumi2);
+ }
+ *s = sumf;
+ }
+
  void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;
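Editorial note: the scalar tail loop above also documents the MXFP4 block layout used by the new GGML_TYPE_MXFP4 type: one shared E8M0 scale byte e plus QK_MXFP4/2 bytes of packed 4-bit codes indexing kvalues_mxfp4 (low nibbles fill the first half of the block, high nibbles the second). A hedged dequantization sketch that mirrors that indexing, with the codebook left as an extern since its definition lives in ggml's quant tables:

    #include <cstdint>

    // 16-entry MXFP4 codebook, defined in ggml's quantization tables (used as kvalues_mxfp4 above).
    extern const int8_t kvalues_mxfp4[16];

    // Dequantize one 32-element MXFP4 block, mirroring the scalar fallback above.
    // d is the block scale decoded from the shared E8M0 exponent byte
    // (GGML_E8M0_TO_FP32_HALF(e) in the code above); qs holds 16 bytes of packed 4-bit codes.
    static void dequant_mxfp4_block_ref(const uint8_t * qs, float d, float * out) {
        for (int j = 0; j < 16; ++j) {
            out[j]      = d * kvalues_mxfp4[qs[j] & 0xf];
            out[j + 16] = d * kvalues_mxfp4[qs[j] >> 4];
        }
    }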