@fugood/llama.node 0.3.12 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -0
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +13 -4
  21. package/src/llama.cpp/.github/workflows/build.yml +35 -3
  22. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  23. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  24. package/src/llama.cpp/common/CMakeLists.txt +20 -3
  25. package/src/llama.cpp/common/arg.cpp +180 -3
  26. package/src/llama.cpp/common/chat-template.hpp +21 -7
  27. package/src/llama.cpp/common/chat.cpp +220 -101
  28. package/src/llama.cpp/common/chat.hpp +3 -0
  29. package/src/llama.cpp/common/common.h +15 -7
  30. package/src/llama.cpp/common/llguidance.cpp +3 -3
  31. package/src/llama.cpp/common/log.cpp +1 -0
  32. package/src/llama.cpp/common/log.h +2 -1
  33. package/src/llama.cpp/common/minja.hpp +24 -9
  34. package/src/llama.cpp/common/sampling.cpp +52 -46
  35. package/src/llama.cpp/common/speculative.h +1 -1
  36. package/src/llama.cpp/docs/build.md +2 -2
  37. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  39. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  40. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  41. package/src/llama.cpp/examples/run/run.cpp +5 -12
  42. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  43. package/src/llama.cpp/examples/server/httplib.h +381 -292
  44. package/src/llama.cpp/examples/server/server.cpp +58 -47
  45. package/src/llama.cpp/examples/server/utils.hpp +7 -5
  46. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  47. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  48. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  49. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  50. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  51. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
  52. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  56. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
  57. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
  58. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
  59. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
  60. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
  61. package/src/llama.cpp/ggml/src/ggml.c +1 -1
  62. package/src/llama.cpp/include/llama.h +14 -10
  63. package/src/llama.cpp/src/llama-grammar.cpp +1 -1
  64. package/src/llama.cpp/src/llama-grammar.h +1 -1
  65. package/src/llama.cpp/src/llama-impl.h +6 -6
  66. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  67. package/src/llama.cpp/src/llama-mmap.h +1 -0
  68. package/src/llama.cpp/src/llama-model.cpp +1 -1
  69. package/src/llama.cpp/src/llama-sampling.cpp +131 -57
  70. package/src/llama.cpp/src/llama.cpp +7 -5
  71. package/src/llama.cpp/src/unicode.cpp +9 -2
  72. package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
  73. package/src/llama.cpp/tests/test-chat.cpp +237 -69
  74. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  75. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/src/llama.cpp/common/common.h

@@ -140,6 +140,7 @@ struct common_params_sampling {
     int32_t dry_allowed_length = 2;    // tokens extending repetitions beyond this receive penalty
     int32_t dry_penalty_last_n = -1;   // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
     int32_t mirostat           = 0;    // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   top_n_sigma        = -1.00f; // -1.0 = disabled
     float   mirostat_tau       = 5.00f;  // target entropy
     float   mirostat_eta       = 0.10f;  // learning rate
     bool    ignore_eos         = false;
@@ -202,6 +203,11 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+enum common_reasoning_format {
+    COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+};
+
 struct common_params {
     int32_t n_predict = -1;   // new tokens to predict
     int32_t n_ctx     = 4096; // context size
@@ -292,6 +298,7 @@ struct common_params {
     bool kl_divergence = false; // compute KL divergence
 
     bool usage       = false; // print usage
+    bool completion  = false; // print source-able completion script
     bool use_color   = false; // use color to distinguish generations and inputs
     bool special     = false; // enable special token output
     bool interactive = false; // interactive mode
@@ -346,6 +353,7 @@ struct common_params {
     std::string chat_template = "";      // NOLINT
     bool use_jinja = false;              // NOLINT
     bool enable_chat_template = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
 
     std::vector<std::string> api_keys;
 
@@ -424,13 +432,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
 //
 
 #ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
+#    if defined(__MINGW32__) && !defined(__clang__)
+#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#    else
+#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#    endif
 #else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
 #endif
 
 LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
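The net effect of the `LLAMA_COMMON_ATTRIBUTE_FORMAT` change is that clang on MinGW now takes the plain `printf` branch instead of the GCC-specific `gnu_printf` archetype. A minimal self-contained sketch of the same pattern, using a local demo macro rather than the real header, showing what the attribute buys (compile-time checking of arguments against the format string):

```cpp
#include <cstdarg>
#include <cstdio>

// simplified re-creation of the macro logic above, for illustration only
#ifdef __GNUC__
#    if defined(__MINGW32__) && !defined(__clang__)
#        define DEMO_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#    else
#        define DEMO_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#    endif
#else
#    define DEMO_ATTRIBUTE_FORMAT(...)
#endif

DEMO_ATTRIBUTE_FORMAT(1, 2)
static void demo_log(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vprintf(fmt, args);
    va_end(args);
}

int main() {
    demo_log("tokens: %d\n", 42);
    // demo_log("tokens: %d\n", "42"); // with the attribute, compilers flag this mismatch (-Wformat)
    return 0;
}
```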
@@ -623,7 +631,7 @@ struct common_chat_msg {
     std::string role;
     std::string content;
     std::vector<common_tool_call> tool_calls;
-    std::string tool_plan = "";
+    std::string reasoning_content = "";
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
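Together with the `common_reasoning_format` enum added above, renaming `tool_plan` to `reasoning_content` means a chat message now carries the extracted chain-of-thought text alongside its regular content. The real extraction lives in `common/chat.cpp` (not shown in this section); the sketch below only illustrates the intended result, using a hypothetical `split_reasoning()` helper and the `<think>` tags emitted by DeepSeek-R1-style templates:

```cpp
#include <string>

struct demo_chat_msg {
    std::string content;
    std::string reasoning_content;
};

// hypothetical helper: pull a <think>...</think> block out of the raw model output
static demo_chat_msg split_reasoning(const std::string & raw) {
    demo_chat_msg msg;
    const std::string open = "<think>", close = "</think>";
    const auto b = raw.find(open);
    const auto e = raw.find(close);
    if (b != std::string::npos && e != std::string::npos && e > b) {
        msg.reasoning_content = raw.substr(b + open.size(), e - b - open.size());
        msg.content           = raw.substr(e + close.size());
    } else {
        msg.content = raw; // no thinking block: everything stays in content
    }
    return msg;
}
```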
package/src/llama.cpp/common/llguidance.cpp

@@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
         };
     }
 
-    return new llama_sampler{
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx = */ ctx,
-    };
+        /* .ctx = */ ctx
+    );
 }
 
 #else
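llguidance now builds its sampler through `llama_sampler_init()` (added to `include/llama.h` in this same update) instead of aggregate-initializing `llama_sampler` directly, so callers no longer depend on the struct's layout. A hedged sketch of that pattern for a do-nothing custom sampler; the `llama_sampler_i` field order follows `llama.h` as of this diff:

```cpp
#include "llama.h"

// no-op sampler: leaves the candidate list untouched
static const char * noop_name (const struct llama_sampler * /*smpl*/) { return "noop"; }
static void         noop_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * /*cur_p*/) {}

static const struct llama_sampler_i noop_iface = {
    /* .name   = */ noop_name,
    /* .accept = */ nullptr,
    /* .apply  = */ noop_apply,
    /* .reset  = */ nullptr,
    /* .clone  = */ nullptr,
    /* .free   = */ nullptr,
};

static struct llama_sampler * noop_sampler_init() {
    // llama_sampler_init allocates the sampler object and stores the iface pointer and ctx
    return llama_sampler_init(&noop_iface, /* ctx = */ nullptr);
}
```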
package/src/llama.cpp/common/log.cpp

@@ -1,5 +1,6 @@
 #include "log.h"
 
+#include <chrono>
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
package/src/llama.cpp/common/log.h

@@ -2,6 +2,7 @@
 
 #include "ggml.h" // for ggml_log_level
 
+#define LOG_CLR_TO_EOL  "\033[K\r"
 #define LOG_COL_DEFAULT "\033[0m"
 #define LOG_COL_BOLD    "\033[1m"
 #define LOG_COL_RED     "\033[31m"
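`\033[K` is the ANSI "erase to end of line" sequence and the trailing `\r` returns the cursor to column 0; the `run.cpp` hunks further down use the new macro to redraw progress and prompt lines without padding them out to the terminal width. A tiny standalone illustration (plain `printf` instead of the project's `printe`):

```cpp
#include <cstdio>

#define LOG_CLR_TO_EOL "\033[K\r"

int main() {
    std::printf("downloading model... 42%%");
    std::fflush(stdout);
    // return to column 0, wipe the old line, then draw the new one in its place
    std::printf("\r" LOG_CLR_TO_EOL "downloading model... 43%%\n");
    return 0;
}
```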
@@ -14,7 +15,7 @@
 
 #ifndef __GNUC__
 #    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
+#elif defined(__MINGW32__) && !defined(__clang__)
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
package/src/llama.cpp/common/minja.hpp

@@ -1385,6 +1385,13 @@ static std::string strip(const std::string & s) {
     return s.substr(start, end - start + 1);
 }
 
+static std::string capitalize(const std::string & s) {
+    if (s.empty()) return s;
+    auto result = s;
+    result[0] = std::toupper(result[0]);
+    return result;
+}
+
 static std::string html_escape(const std::string & s) {
     std::string result;
     result.reserve(s.size());
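The new `capitalize()` helper backs the Jinja `.capitalize()` string method wired up in the next hunk. A quick self-contained check of its behavior; note that it only upper-cases the first character and leaves the rest of the string untouched:

```cpp
#include <cassert>
#include <cctype>
#include <string>

static std::string capitalize(const std::string & s) { // copied from the hunk above
    if (s.empty()) return s;
    auto result = s;
    result[0] = std::toupper(result[0]);
    return result;
}

int main() {
    assert(capitalize("hello world") == "Hello world");
    assert(capitalize("") == "");
    return 0;
}
```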
@@ -1462,6 +1469,9 @@ public:
             if (method->get_name() == "strip") {
                 vargs.expectArgs("strip method", {0, 0}, {0, 0});
                 return Value(strip(str));
+            } else if (method->get_name() == "capitalize") {
+                vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
+                return Value(capitalize(str));
             } else if (method->get_name() == "endswith") {
                 vargs.expectArgs("endswith method", {1, 1}, {0, 0});
                 auto suffix = vargs.args[0].get<std::string>();
@@ -1792,7 +1802,7 @@ private:
         auto left = parseStringConcat();
         if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");
 
-        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not[\r\n\s]+in\b)");
+        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
         static std::regex not_tok(R"(not\b)");
         std::string op_str;
         while (!(op_str = consumeToken(compare_tok)).empty()) {
@@ -2171,7 +2181,7 @@ private:
     using TemplateTokenIterator = TemplateTokenVector::const_iterator;
 
     std::vector<std::string> parseVarNames() {
-        static std::regex varnames_regex(R"(((?:\w+)(?:[\r\n\s]*,[\r\n\s]*(?:\w+))*)[\r\n\s]*)");
+        static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
 
         std::vector<std::string> group;
         if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@@ -2194,13 +2204,13 @@ private:
     }
 
     TemplateTokenVector tokenize() {
-        static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
+        static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
         static std::regex expr_open_regex(R"(\{\{([-~])?)");
-        static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
+        static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
         static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
         static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
-        static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
-        static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");
+        static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
+        static std::regex block_close_regex(R"(\s*([-~])?%\})");
 
         TemplateTokenVector tokens;
         std::vector<std::string> group;
@@ -2284,7 +2294,7 @@ private:
                 auto post_space = parseBlockClose();
                 tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
             } else if (keyword == "set") {
-                static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");
+                static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
 
                 std::string ns;
                 std::vector<std::string> var_names;
@@ -2336,6 +2346,11 @@ private:
                 throw std::runtime_error("Unexpected block: " + keyword);
             }
         } else if (std::regex_search(it, end, match, non_text_open_regex)) {
+            if (!match.position()) {
+                if (match[0] != "{#")
+                    throw std::runtime_error("Internal error: Expected a comment");
+                throw std::runtime_error("Missing end of comment tag");
+            }
             auto text_end = it + match.position();
             text = std::string(it, text_end);
             it = text_end;
@@ -2400,7 +2415,7 @@ private:
 
         auto text = text_token->text;
         if (post_space == SpaceHandling::Strip) {
-            static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
+            static std::regex trailing_space_regex(R"(\s+$)");
             text = std::regex_replace(text, trailing_space_regex, "");
         } else if (options.lstrip_blocks && it != end) {
             auto i = text.size();
@@ -2410,7 +2425,7 @@ private:
             }
         }
         if (pre_space == SpaceHandling::Strip) {
-            static std::regex leading_space_regex(R"(^(\s|\r|\n)+)");
+            static std::regex leading_space_regex(R"(^\s+)");
             text = std::regex_replace(text, leading_space_regex, "");
         } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
             if (text.length() > 0 && text[0] == '\n') {
package/src/llama.cpp/common/sampling.cpp

@@ -134,11 +134,11 @@ std::string common_params_sampling::print() const {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);
@@ -151,12 +151,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-    std::vector<const char *> trigger_words;
-    trigger_words.reserve(params.grammar_trigger_words.size());
-    for (const auto & str : params.grammar_trigger_words) {
-        trigger_words.push_back(str.word.c_str());
-    }
-
     struct llama_sampler * grmr;
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
@@ -165,6 +159,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
+        std::vector<const char *> trigger_words;
+        trigger_words.reserve(params.grammar_trigger_words.size());
+        for (const auto & str : params.grammar_trigger_words) {
+            trigger_words.push_back(str.word.c_str());
+        }
+
         grmr = params.grammar_lazy
             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
                 trigger_words.data(), trigger_words.size(),
@@ -188,45 +188,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             params.logit_bias.data()));
 
     if (params.mirostat == 0) {
-        for (const auto & cnstr : params.samplers) {
-            switch (cnstr) {
-                case COMMON_SAMPLER_TYPE_DRY:
-                    {
-                        std::vector<const char *> c_breakers;
-                        c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto & str : params.dry_sequence_breakers) {
-                            c_breakers.push_back(str.c_str());
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp        (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+        } else {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                        {
+                            std::vector<const char *> c_breakers;
+                            c_breakers.reserve(params.dry_sequence_breakers.size());
+                            for (const auto & str : params.dry_sequence_breakers) {
+                                c_breakers.push_back(str.c_str());
+                            }
+
+                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                         }
-
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                    }
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                    break;
-                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                    break;
-                case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
-                    break;
-                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                default:
-                    GGML_ASSERT(false && "unknown sampler type");
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
+                        break;
+                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                        break;
+                    default:
+                        GGML_ASSERT(false && "unknown sampler type");
+                }
             }
         }
     }
     llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
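So when `top_n_sigma` is enabled (any value >= 0), the common sampler skips the configurable sampler list and builds a fixed top-k → temperature → top-n-sigma chain, with the final `dist` sampler still appended to pick the token. A standalone sketch of the equivalent chain using only the public `llama.h` API; the `llama_sampler_init_top_n_sigma` signature is taken from the header updated in this same release:

```cpp
#include <cstdint>

#include "llama.h"

static struct llama_sampler * make_top_n_sigma_chain(int32_t top_k, float temp, float top_n_sigma, uint32_t seed) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    llama_sampler_chain_add(chain, llama_sampler_init_top_k(top_k));             // keep the k most likely tokens
    llama_sampler_chain_add(chain, llama_sampler_init_temp(temp));               // rescale logits
    llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(top_n_sigma)); // drop tokens more than n sigma below the max logit
    llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));               // sample from what remains

    return chain;
}
```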
package/src/llama.cpp/common/speculative.h

@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16;  // max drafted tokens
     int n_reuse = 256;
 
-    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+    float p_min = 0.9f; // min probability required to accept a token in the draft
 };
 
 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
package/src/llama.cpp/docs/build.md

@@ -3,7 +3,7 @@
 **To get the Code:**
 
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
 
@@ -46,7 +46,7 @@ cmake --build build --config Release
 ```
 
 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
-  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
     - Tab Workload: Desktop-development with C++
     - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
   - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -3,6 +3,7 @@
 #include "log.h"
 #include "llama.h"
 
+#include <chrono>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -99,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
 
     // this has been adapted to the new format of storing merged experts in a single 3d tensor
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
         // ids  -> [n_experts_used, n_tokens]
         // src1 -> [cols, n_expert_used, n_tokens]
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -876,8 +876,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
     static const std::string build_commit;
     static const int build_number;
-    static const std::string cpu_info;
-    static const std::string gpu_info;
+    const std::string cpu_info;
+    const std::string gpu_info;
     std::string model_filename;
     std::string model_type;
     uint64_t model_size;
@@ -903,7 +903,10 @@ struct test {
     std::string test_time;
     std::vector<uint64_t> samples_ns;
 
-    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
+    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
+        cpu_info(get_cpu_info()),
+        gpu_info(get_gpu_info()) {
+
         model_filename = inst.model;
         char buf[128];
         llama_model_desc(lmodel, buf, sizeof(buf));
@@ -1058,8 +1061,6 @@ struct test {
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const std::string test::cpu_info = get_cpu_info();
-const std::string test::gpu_info = get_gpu_info();
 
 struct printer {
     virtual ~printer() {}
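Making `cpu_info`/`gpu_info` plain const members instead of statics changes when `get_cpu_info()`/`get_gpu_info()` run: class statics are initialized once at program start-up, before `main()` has parsed arguments or loaded any backend, while instance members are initialized each time a `test` is constructed. A simplified illustration of the difference, with a placeholder `get_gpu_info()`:

```cpp
#include <string>

static std::string get_gpu_info() { return "queried when this function runs"; }

struct test_before { static const std::string gpu_info; };      // evaluated during static init, before main()
const std::string test_before::gpu_info = get_gpu_info();

struct test_after {                                              // evaluated per instance, inside main()
    const std::string gpu_info;
    test_after() : gpu_info(get_gpu_info()) {}
};

int main() {
    test_after t; // gpu_info is queried here, after any setup main() has already done
    return t.gpu_info.empty();
}
```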
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt

@@ -14,7 +14,7 @@ project("llama-android")
 #include(FetchContent)
 #FetchContent_Declare(
 #        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_REPOSITORY https://github.com/ggml-org/llama.cpp
 #        GIT_TAG        master
 #)
 
package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -3,6 +3,7 @@
 #include "log.h"
 #include "llama.h"
 
+#include <chrono>
 #include <algorithm>
 #include <array>
 #include <atomic>
package/src/llama.cpp/examples/run/run.cpp

@@ -346,7 +346,7 @@ class HttpClient {
         if (!output_file.empty()) {
             output_file_partial = output_file + ".partial";
             if (!out.open(output_file_partial, "ab")) {
-                printe("Failed to open file\n");
+                printe("Failed to open file for writing\n");
 
                 return 1;
             }
@@ -535,8 +535,7 @@ class HttpClient {
 
     static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
                                const std::string & progress_suffix) {
-        printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(),
-               progress_suffix.c_str());
+        printe("\r" LOG_CLR_TO_EOL "%s%s| %s", progress_prefix.c_str(), progress_bar.c_str(), progress_suffix.c_str());
     }
     // Function to write data to a file
     static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
@@ -797,16 +796,13 @@ class LlamaData {
     llama_model_ptr initialize_model(Opt & opt) {
         ggml_backend_load_all();
         resolve_model(opt.model_);
-        printe(
-            "\r%*s"
-            "\rLoading model",
-            get_terminal_width(), " ");
+        printe("\r" LOG_CLR_TO_EOL "Loading model");
         llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
         if (!model) {
             printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
         }
 
-        printe("\r%*s\r", static_cast<int>(sizeof("Loading model")), " ");
+        printe("\r" LOG_CLR_TO_EOL);
         return model;
     }
 
@@ -969,10 +965,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
 static int read_user_input(std::string & user_input) {
     static const char * prompt_prefix = "> ";
 #ifdef WIN32
-    printf(
-        "\r%*s"
-        "\r" LOG_COL_DEFAULT "%s",
-        get_terminal_width(), " ", prompt_prefix);
+    printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix);
 
     std::getline(std::cin, user_input);
     if (std::cin.eof()) {
package/src/llama.cpp/examples/server/CMakeLists.txt

@@ -5,7 +5,7 @@ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
 
 if (MINGW)
-    # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
+    # fix: https://github.com/ggml-org/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()