@fugood/llama.node 0.3.11 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -0
  18. package/lib/index.js +26 -20
  19. package/lib/index.ts +32 -28
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +14 -0
  22. package/src/LlamaContext.cpp +13 -4
  23. package/src/llama.cpp/.github/workflows/build.yml +35 -3
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/common/CMakeLists.txt +20 -3
  27. package/src/llama.cpp/common/arg.cpp +180 -3
  28. package/src/llama.cpp/common/chat-template.hpp +21 -7
  29. package/src/llama.cpp/common/chat.cpp +220 -101
  30. package/src/llama.cpp/common/chat.hpp +3 -0
  31. package/src/llama.cpp/common/common.h +15 -7
  32. package/src/llama.cpp/common/llguidance.cpp +3 -3
  33. package/src/llama.cpp/common/log.cpp +1 -0
  34. package/src/llama.cpp/common/log.h +2 -1
  35. package/src/llama.cpp/common/minja.hpp +24 -9
  36. package/src/llama.cpp/common/sampling.cpp +52 -46
  37. package/src/llama.cpp/common/speculative.h +1 -1
  38. package/src/llama.cpp/docs/build.md +2 -2
  39. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
  40. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  41. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  43. package/src/llama.cpp/examples/run/run.cpp +5 -12
  44. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/server/httplib.h +381 -292
  46. package/src/llama.cpp/examples/server/server.cpp +58 -47
  47. package/src/llama.cpp/examples/server/utils.hpp +7 -5
  48. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  49. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  50. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  51. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  52. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
  55. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
  57. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  58. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
  59. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
  60. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
  61. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
  62. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
  63. package/src/llama.cpp/ggml/src/ggml.c +1 -1
  64. package/src/llama.cpp/include/llama.h +14 -10
  65. package/src/llama.cpp/src/llama-grammar.cpp +1 -1
  66. package/src/llama.cpp/src/llama-grammar.h +1 -1
  67. package/src/llama.cpp/src/llama-impl.h +6 -6
  68. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  69. package/src/llama.cpp/src/llama-mmap.h +1 -0
  70. package/src/llama.cpp/src/llama-model.cpp +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +131 -57
  72. package/src/llama.cpp/src/llama.cpp +7 -5
  73. package/src/llama.cpp/src/unicode.cpp +9 -2
  74. package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
  75. package/src/llama.cpp/tests/test-chat.cpp +237 -69
  76. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  77. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
@@ -55,6 +55,8 @@ const std::vector<std::string> type_names = {
     "q4_k",
     "q5_k",
     "q6_k",
+    "iq1_s",
+    "iq1_m",
     "iq2_xxs",
     "iq2_xs",
     "iq2_s",
@@ -182,6 +184,13 @@ std::string to_uppercase(const std::string& input) {
     return result;
 }
 
+bool string_starts_with(const std::string& str, const std::string& prefix) {
+    if (prefix.size() > str.size()) {
+        return false;
+    }
+    return std::equal(prefix.begin(), prefix.end(), str.begin());
+}
+
 bool string_ends_with(const std::string& str, const std::string& suffix) {
     if (suffix.size() > str.size()) {
         return false;
@@ -387,7 +396,7 @@ void process_shaders() {
     for (const auto& tname : type_names) {
         // mul mat vec
        std::string data_a_key = "DATA_A_" + to_uppercase(tname);
-        std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
+        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
@@ -434,6 +443,8 @@ void process_shaders() {
     string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
 
+    string_to_spv("sub_f32", "sub.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
     string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
@@ -443,6 +454,7 @@ void process_shaders() {
     string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("repeat_back_f32", "repeat_back.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
@@ -482,9 +494,19 @@ void process_shaders() {
     string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
 
+    string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+
+    string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_vision_f16_rte", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+
     string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
 
+    string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}}));
     string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
 
     string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
@@ -496,6 +518,8 @@ void process_shaders() {
 
     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
 
+    string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
     for (auto &c : compiles) {
         c.wait();
     }
@@ -1379,7 +1379,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso
         (t0->nb[3] == t1->nb[3]);
 }
 
-// check if t1 can be represented as a repeatition of t0
+// check if t1 can be represented as a repetition of t0
 bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -213,7 +213,7 @@ extern "C" {
        LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
    };
 
-    // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
+    // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
    typedef struct llama_token_data {
        llama_token id;    // token id
        float logit;       // log-odds of the token
@@ -307,7 +307,7 @@ extern "C" {
    };
 
    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
-    // https://github.com/ggerganov/llama.cpp/pull/7544
+    // https://github.com/ggml-org/llama.cpp/pull/7544
    struct llama_context_params {
        uint32_t n_ctx;   // text context, 0 = from model
        uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
@@ -320,7 +320,7 @@ extern "C" {
        enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
        enum llama_attention_type attention_type; // attention type to use for embeddings
 
-        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        // ref: https://github.com/ggml-org/llama.cpp/pull/2054
        float rope_freq_base;  // RoPE base frequency, 0 = from model
        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
        float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
@@ -385,7 +385,7 @@ extern "C" {
    struct llama_adapter_lora;
 
    // Helpers for getting default parameters
-    // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
+    // TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
    LLAMA_API struct llama_model_params llama_model_default_params(void);
    LLAMA_API struct llama_context_params llama_context_default_params(void);
    LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
@@ -1040,7 +1040,7 @@ extern "C" {
 
    /// Apply chat template. Inspired by hf apply_chat_template() on python.
    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
    /// @param chat Pointer to a list of multiple llama_chat_message
    /// @param n_msg Number of llama_chat_message in this chat
@@ -1114,11 +1114,12 @@ extern "C" {
    };
 
    struct llama_sampler {
-        struct llama_sampler_i * iface;
-        llama_sampler_context_t ctx;
+        const struct llama_sampler_i * iface;
+        llama_sampler_context_t ctx;
    };
 
    // mirror of llama_sampler_i:
+    LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
    LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
    LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
    LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1148,7 +1149,7 @@ extern "C" {
    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-        "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
+        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 
    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1156,7 +1157,7 @@ extern "C" {
    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);
 
-    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
    LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
 
    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
@@ -1171,6 +1172,9 @@ extern "C" {
    /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
    LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
 
+    /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+
    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
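For context, a minimal usage sketch (not part of this package) of how the newly declared llama_sampler_init_top_n_sigma could be slotted into a sampler chain. It uses only functions that appear elsewhere in this diff (llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_init_dist); the value n = 1.0f and the helper name build_top_n_sigma_chain are illustrative assumptions, not package code:

    #include "llama.h"

    // Assumed helper: keep only tokens whose logit lies within n standard
    // deviations of the best logit, then sample from the survivors.
    static llama_sampler * build_top_n_sigma_chain() {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(1.0f)); // n = 1.0f is an arbitrary example
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
        return chain; // caller releases it later with llama_sampler_free()
    }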
@@ -1199,7 +1203,7 @@ extern "C" {
            const char * grammar_str,
            const char * grammar_root);
 
-    /// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639
+    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
    /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
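The llama.h changes above also export llama_sampler_init() and make the iface pointer const, which lets downstream code register custom samplers without touching llama.cpp internals. A minimal sketch under the assumption that llama_sampler_i carries exactly the six callbacks shown later in this diff (name/accept/apply/reset/clone/free); the no-op sampler and its names are hypothetical, not from the package:

    #include "llama.h"

    // Hypothetical pass-through sampler built on the newly public llama_sampler_init().
    static const char * noop_name(const struct llama_sampler * /*smpl*/) { return "noop"; }

    static void noop_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * /*cur_p*/) {
        // intentionally leaves the candidate array untouched
    }

    static const struct llama_sampler_i noop_i = {
        /* .name   = */ noop_name,
        /* .accept = */ nullptr,
        /* .apply  = */ noop_apply,
        /* .reset  = */ nullptr,
        /* .clone  = */ nullptr, // cloning with a null ctx is handled by llama_sampler_clone (see llama-sampling.cpp below)
        /* .free   = */ nullptr, // no ctx to release
    };

    static struct llama_sampler * noop_init(void) {
        return llama_sampler_init(&noop_i, /* ctx = */ nullptr);
    }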
@@ -1186,7 +1186,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
                return;
            }
        }
-        LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`) (buffer: `%s`)\n", token, piece.c_str(), grammar.trigger_buffer.c_str());
+        LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`)\n", token, piece.c_str());
        return;
    }
 }
@@ -116,7 +116,7 @@ struct llama_grammar {
    llama_partial_utf8 partial_utf8;
 
    // lazy grammars wait for trigger words or tokens before constraining the sampling.
-    // we still ahve trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
+    // we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
    // (useful e.g. for tool_choice=required)
    bool lazy = false;
    bool awaiting_trigger = false; // Initialized to true for lazy grammars only
@@ -6,13 +6,13 @@
 #include <vector>
 
 #ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#    if defined(__MINGW32__) && !defined(__clang__)
+#        define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#    else
+#        define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#    endif
 #else
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define LLAMA_ATTRIBUTE_FORMAT(...)
+#    define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
 //
@@ -37,7 +37,7 @@ struct llama_kv_cache {
    bool can_shift = false;
 
    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_internal also uses it, so it
+    // for a free KV slot. llama_decode_impl also uses it, so it
    // cannot be freely changed after a slot has been allocated.
    uint32_t head = 0;
    uint32_t size = 0;
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cstdint>
 #include <memory>
 #include <vector>
 
@@ -1275,7 +1275,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
    const bool use_mmap_buffer = true;
 
-    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, use_mmap_buffer ? "true" : "false");
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
    // build a list of buffer types for the CPU and GPU devices
    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
@@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {
 
 // llama_sampler API
 
+struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
+    return new llama_sampler {
+        /* .iface = */ iface,
+        /* .ctx   = */ ctx,
+    };
+}
+
 const char * llama_sampler_name(const struct llama_sampler * smpl) {
    if (!smpl->iface) {
        return "(null)";
@@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
    }
 
    if (smpl->ctx == nullptr) {
-        return new llama_sampler {
+        return llama_sampler_init(
            /* .iface = */ smpl->iface,
-            /* .ctx   = */ nullptr,
-        };
+            /* .ctx   = */ nullptr
+        );
    }
 
    GGML_ABORT("the sampler does not support cloning");
@@ -472,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
 };
 
 struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_chain_i,
        /* .ctx   = */ new llama_sampler_chain {
            /* .params      = */ params,
            /* .samplers    = */ {},
            /* .t_sample_us = */ 0,
            /* .n_sample    = */ 0,
-        },
-    };
+        }
+    );
 }
 
 void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
@@ -546,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
 };
 
 struct llama_sampler * llama_sampler_init_greedy() {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_greedy_i,
-        /* .ctx   = */ nullptr,
-    };
+        /* .ctx   = */ nullptr
+    );
 }
 
 // dist
@@ -608,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {
 
 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
    auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_dist_i,
        /* .ctx   = */ new llama_sampler_dist {
            /* .seed     = */ seed,
            /* .seed_cur = */ seed_cur,
            /* .rng      = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }
 
 // softmax
@@ -638,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
 };
 
 struct llama_sampler * llama_sampler_init_softmax() {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_softmax_i,
-        /* .ctx   = */ nullptr,
-    };
+        /* .ctx   = */ nullptr
+    );
 }
 
 // top-k
@@ -678,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
 };
 
 struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_top_k_i,
        /* .ctx   = */ new llama_sampler_top_k {
            /* .k = */ k,
-        },
-    };
+        }
+    );
 }
 
 // top-p
@@ -744,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
 };
 
 struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_top_p_i,
        /* .ctx   = */ new llama_sampler_top_p {
            /* .p        = */ p,
            /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }
 
 // min-p
@@ -840,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
 };
 
 struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_min_p_i,
        /* .ctx   = */ new llama_sampler_min_p {
            /* .p        = */ p,
            /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }
 
 // typical
@@ -939,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
 };
 
 struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_typical_i,
        /* .ctx   = */ new llama_sampler_typical {
            /* .p        = */ p,
            /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }
 
 // temp
@@ -983,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
 };
 
 struct llama_sampler * llama_sampler_init_temp(float temp) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_temp_i,
        /* .ctx   = */ new llama_sampler_temp {
            /*.temp = */ temp,
-        },
-    };
+        }
+    );
 }
 
 // temp-ext
@@ -1093,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
 };
 
 struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_temp_ext_i,
        /* .ctx   = */ new llama_sampler_temp_ext {
            /* .temp     = */ temp,
            /* .delta    = */ delta,
            /* .exponent = */ exponent,
-        },
-    };
+        }
+    );
 }
 
 // xtc
@@ -1185,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {
 
 struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
    auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_xtc_i,
        /* .ctx   = */ new llama_sampler_xtc {
            /* .probability = */ p,
@@ -1194,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
            /* .seed        = */ seed,
            /* .seed_cur    = */ seed_cur,
            /* .rng         = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }
 
 // mirostat
@@ -1292,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
 
 struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
    auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_mirostat_i,
        /* .ctx   = */ new llama_sampler_mirostat {
            /* .n_vocab = */ n_vocab,
@@ -1303,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
            /* .m   = */ m,
            /* .mu  = */ 2.0f*tau,
            /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }
 
 // mirostat v2
@@ -1391,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
 
 struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
    auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_mirostat_v2_i,
        /* .ctx   = */ new llama_sampler_mirostat_v2 {
            /* .seed = */ seed,
@@ -1400,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
            /* .eta = */ eta,
            /* .mu  = */ 2.0f*tau,
            /* .rng = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }
 
 // grammar
@@ -1528,10 +1535,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
        };
    }
 
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_grammar_i,
-        /* .ctx   = */ ctx,
-    };
+        /* .ctx   = */ ctx
+    );
 }
 
 struct llama_sampler * llama_sampler_init_grammar(
@@ -1678,7 +1685,7 @@ struct llama_sampler * llama_sampler_init_penalties(
        float penalty_present) {
    penalty_last_n = std::max(penalty_last_n, 0);
 
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_penalties_i,
        /* .ctx   = */ new llama_sampler_penalties {
            /* .penalty_last_n = */ penalty_last_n,
@@ -1687,8 +1694,75 @@ struct llama_sampler * llama_sampler_init_penalties(
            /* .penalty_present = */ penalty_present,
            /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
            /* .token_count     = */ {},
-        },
-    };
+        }
+    );
+}
+
+// top-n-sigma
+
+struct llama_sampler_top_n_sigma {
+    const float n;
+};
+
+static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) {
+    return "top-n-sigma";
+}
+
+static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
+
+    // find max logit and calculate mean
+    float max = cur_p->data[0].logit;
+    float logits_sum = 0;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].logit > max) {
+            max = cur_p->data[i].logit;
+        }
+        logits_sum += cur_p->data[i].logit;
+    }
+    float mean = logits_sum/cur_p->size;
+
+    // calculate standard deviation
+    float acc = 0;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        acc += pow(cur_p->data[i].logit - mean, 2);
+    }
+    float std = sqrt(acc/cur_p->size);
+
+    //apply mask
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].logit < max - (ctx->n * std)) {
+            cur_p->data[i].logit = -INFINITY;
+        }
+    }
+    llama_sampler_softmax_impl(cur_p);
+}
+
+static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_top_n_sigma *) smpl->ctx;
+    return llama_sampler_init_top_n_sigma(ctx->n);
+}
+
+static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_top_n_sigma *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_top_n_sigma_i = {
+    /* .name   = */ llama_sampler_top_n_sigma_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_top_n_sigma_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_top_n_sigma_clone,
+    /* .free   = */ llama_sampler_top_n_sigma_free,
+};
+
+struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_top_n_sigma_i,
+        /* .ctx   = */ new llama_sampler_top_n_sigma {
+            /* .n = */ n,
+        }
+    );
 }
 
 // DRY
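In plain terms, the new top-n-sigma sampler above computes the mean and standard deviation σ of the candidate logits, masks every token whose logit falls below max_logit − n·σ by setting it to -INFINITY, and then renormalizes the survivors with the existing softmax step. For an illustrative (non-package) example, with logits {10, 9, 6} the mean is about 8.33 and σ is about 1.70, so with n = 1 the cutoff is 10 − 1.70 ≈ 8.30 and only the first two tokens remain eligible.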
@@ -2041,7 +2115,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
        }
    }
 
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_dry_i,
        /* .ctx   = */ new llama_sampler_dry {
            /* .total_context_size = */ context_size,
@@ -2053,8 +2127,8 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
            /* .dry_repeat_count     = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
            /* .dry_max_token_repeat = */ {},
            /* .last_tokens          = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
-        },
-    };
+        }
+    );
 }
 
 // wrapper for test-sampling.cpp
@@ -2155,14 +2229,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
        int32_t n_vocab,
        int32_t n_logit_bias,
        const llama_logit_bias * logit_bias) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_logit_bias_i,
        /* .ctx   = */ new llama_sampler_logit_bias {
            /* .n_vocab    = */ n_vocab,
            /* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
            /* .to_search  = */ {},
-        },
-    };
+        }
+    );
 }
 
 // infill
@@ -2377,14 +2451,14 @@ static struct llama_sampler_i llama_sampler_infill_i = {
 };
 
 struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
-    return new llama_sampler {
+    return llama_sampler_init(
        /* .iface = */ &llama_sampler_infill_i,
        /* .ctx   = */ new llama_sampler_infill {
            /* .vocab = */ vocab,
            /* .buf0  = */ std::vector<char>(512),
            /* .buf1  = */ std::vector<char>(512),
-        },
-    };
+        }
+    );
 }
 
 // utils