@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp}
@@ -249,16 +249,30 @@ class chat_template {
                 inputs.add_generation_prompt = false;
                 full = apply(inputs);
             }
-
-            if (full.find(prefix) != 0) {
-                if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
-                    prefix = prefix.substr(0, prefix.size() - eos_token_.size());
+            auto eos_pos_last = full.rfind(eos_token_);
+            if (eos_pos_last == prefix.size() - eos_token_.size() ||
+                (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+                full = full.substr(0, eos_pos_last);
+            }
+            size_t common_prefix_length = 0;
+            for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+                if (prefix[i] != full[i]) {
+                    break;
                 }
+                if (prefix[i] == '<') {
+                    // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                    // but it removes thinking tags for past messages.
+                    // The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
+                    continue;
+                }
+                common_prefix_length = i + 1;
             }
-            if (full.find(prefix) != 0) {
+            auto example = full.substr(common_prefix_length);
+            if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
                 fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+            } else {
+                tool_call_example_ = example;
             }
-            tool_call_example_ = full.substr(prefix.size());
         }
     } catch (const std::exception & e) {
         fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@@ -363,7 +377,7 @@ class chat_template {
         if (polyfill_tools) {
             adjusted_messages = add_system(inputs.messages,
                 "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
+                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
         } else {
             adjusted_messages = inputs.messages;
         }
package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp}
@@ -1378,13 +1378,34 @@ struct ArgumentsExpression {
     }
 };
 
-static std::string strip(const std::string & s) {
-    auto start = s.find_first_not_of(" \t\n\r");
+static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
+    auto charset = chars.empty() ? " \t\n\r" : chars;
+    auto start = left ? s.find_first_not_of(charset) : 0;
     if (start == std::string::npos) return "";
-    auto end = s.find_last_not_of(" \t\n\r");
+    auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
     return s.substr(start, end - start + 1);
 }
 
+static std::vector<std::string> split(const std::string & s, const std::string & sep) {
+    std::vector<std::string> result;
+    size_t start = 0;
+    size_t end = s.find(sep);
+    while (end != std::string::npos) {
+        result.push_back(s.substr(start, end - start));
+        start = end + sep.length();
+        end = s.find(sep, start);
+    }
+    result.push_back(s.substr(start));
+    return result;
+}
+
+static std::string capitalize(const std::string & s) {
+    if (s.empty()) return s;
+    auto result = s;
+    result[0] = std::toupper(result[0]);
+    return result;
+}
+
 static std::string html_escape(const std::string & s) {
     std::string result;
     result.reserve(s.size());
@@ -1460,8 +1481,29 @@ public:
         } else if (obj.is_string()) {
             auto str = obj.get<std::string>();
             if (method->get_name() == "strip") {
-                vargs.expectArgs("strip method", {0, 0}, {0, 0});
-                return Value(strip(str));
+                vargs.expectArgs("strip method", {0, 1}, {0, 0});
+                auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+                return Value(strip(str, chars));
+            } else if (method->get_name() == "lstrip") {
+                vargs.expectArgs("lstrip method", {0, 1}, {0, 0});
+                auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+                return Value(strip(str, chars, /* left= */ true, /* right= */ false));
+            } else if (method->get_name() == "rstrip") {
+                vargs.expectArgs("rstrip method", {0, 1}, {0, 0});
+                auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+                return Value(strip(str, chars, /* left= */ false, /* right= */ true));
+            } else if (method->get_name() == "split") {
+                vargs.expectArgs("split method", {1, 1}, {0, 0});
+                auto sep = vargs.args[0].get<std::string>();
+                auto parts = split(str, sep);
+                Value result = Value::array();
+                for (const auto& part : parts) {
+                    result.push_back(Value(part));
+                }
+                return result;
+            } else if (method->get_name() == "capitalize") {
+                vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
+                return Value(capitalize(str));
             } else if (method->get_name() == "endswith") {
                 vargs.expectArgs("endswith method", {1, 1}, {0, 0});
                 auto suffix = vargs.args[0].get<std::string>();
@@ -1792,7 +1834,7 @@ private:
         auto left = parseStringConcat();
         if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");
 
-        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not[\r\n\s]+in\b)");
+        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
         static std::regex not_tok(R"(not\b)");
         std::string op_str;
         while (!(op_str = consumeToken(compare_tok)).empty()) {
@@ -2171,7 +2213,7 @@ private:
     using TemplateTokenIterator = TemplateTokenVector::const_iterator;
 
     std::vector<std::string> parseVarNames() {
-        static std::regex varnames_regex(R"(((?:\w+)(?:[\r\n\s]*,[\r\n\s]*(?:\w+))*)[\r\n\s]*)");
+        static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
 
         std::vector<std::string> group;
         if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@@ -2194,13 +2236,13 @@ private:
     }
 
     TemplateTokenVector tokenize() {
-        static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
+        static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
         static std::regex expr_open_regex(R"(\{\{([-~])?)");
-        static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
+        static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
         static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
         static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
-        static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
-        static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");
+        static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
+        static std::regex block_close_regex(R"(\s*([-~])?%\})");
 
         TemplateTokenVector tokens;
         std::vector<std::string> group;
@@ -2284,7 +2326,7 @@ private:
                 auto post_space = parseBlockClose();
                 tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
             } else if (keyword == "set") {
-                static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");
+                static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
 
                 std::string ns;
                 std::vector<std::string> var_names;
@@ -2336,6 +2378,11 @@ private:
                 throw std::runtime_error("Unexpected block: " + keyword);
             }
         } else if (std::regex_search(it, end, match, non_text_open_regex)) {
+            if (!match.position()) {
+                if (match[0] != "{#")
+                    throw std::runtime_error("Internal error: Expected a comment");
+                throw std::runtime_error("Missing end of comment tag");
+            }
             auto text_end = it + match.position();
             text = std::string(it, text_end);
             it = text_end;
@@ -2400,7 +2447,7 @@ private:
 
             auto text = text_token->text;
             if (post_space == SpaceHandling::Strip) {
-                static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
+                static std::regex trailing_space_regex(R"(\s+$)");
                 text = std::regex_replace(text, trailing_space_regex, "");
             } else if (options.lstrip_blocks && it != end) {
                 auto i = text.size();
@@ -2410,7 +2457,7 @@ private:
                 }
             }
             if (pre_space == SpaceHandling::Strip) {
-                static std::regex leading_space_regex(R"(^(\s|\r|\n)+)");
+                static std::regex leading_space_regex(R"(^\s+)");
                 text = std::regex_replace(text, leading_space_regex, "");
             } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
                 if (text.length() > 0 && text[0] == '\n') {
package/src/llama.cpp/common/ngram-cache.cpp
@@ -7,6 +7,7 @@
 #include <cstdio>
 #include <fstream>
 #include <thread>
+#include <algorithm>
 
 void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
         std::vector<llama_token> & inp, int nnew, bool print_progress) {
package/src/llama.cpp/common/sampling.cpp
@@ -4,6 +4,7 @@
 
 #include <cmath>
 #include <unordered_map>
+#include <algorithm>
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -134,11 +135,11 @@ std::string common_params_sampling::print() const {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);
@@ -151,12 +152,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-    std::vector<const char *> trigger_words;
-    trigger_words.reserve(params.grammar_trigger_words.size());
-    for (const auto & str : params.grammar_trigger_words) {
-        trigger_words.push_back(str.word.c_str());
-    }
-
     struct llama_sampler * grmr;
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
@@ -165,10 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
+        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> patterns_anywhere;
+        std::vector<llama_token> trigger_tokens;
+        for (const auto & trigger : params.grammar_triggers) {
+            switch (trigger.type) {
+                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                {
+                    const auto & word = trigger.value;
+                    patterns_anywhere.push_back(regex_escape(word));
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
+                {
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                {
+                    const auto token = trigger.token;
+                    trigger_tokens.push_back(token);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false && "unknown trigger type");
+            }
+        }
+
+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
+        std::vector<const char *> trigger_patterns_c;
+        trigger_patterns_c.reserve(trigger_patterns.size());
+        for (const auto & regex : trigger_patterns) {
+            trigger_patterns_c.push_back(regex.c_str());
+        }
+
         grmr = params.grammar_lazy
-            ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
-                  trigger_words.data(), trigger_words.size(),
-                  params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+            ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                  trigger_patterns_c.data(), trigger_patterns_c.size(),
+                  trigger_tokens.data(), trigger_tokens.size())
             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
     }
 
@@ -188,45 +226,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             params.logit_bias.data()));
 
     if (params.mirostat == 0) {
-        for (const auto & cnstr : params.samplers) {
-            switch (cnstr) {
-                case COMMON_SAMPLER_TYPE_DRY:
-                    {
-                        std::vector<const char *> c_breakers;
-                        c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto & str : params.dry_sequence_breakers) {
-                            c_breakers.push_back(str.c_str());
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+        } else {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                        {
+                            std::vector<const char *> c_breakers;
+                            c_breakers.reserve(params.dry_sequence_breakers.size());
+                            for (const auto & str : params.dry_sequence_breakers) {
+                                c_breakers.push_back(str.c_str());
+                            }
+
+                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                         }
-
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                    }
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                    break;
-                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                    break;
-                case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
-                    break;
-                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                default:
-                    GGML_ASSERT(false && "unknown sampler type");
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
+                        break;
+                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                        break;
+                    default:
+                        GGML_ASSERT(false && "unknown sampler type");
+                }
             }
         }
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
package/src/llama.cpp/common/speculative.cpp
@@ -5,6 +5,7 @@
 #include "sampling.h"
 
 #include <cstring>
+#include <algorithm>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -252,11 +253,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;
 
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);
 
         result.push_back(id);
@@ -265,6 +261,11 @@
             break;
         }
 
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
package/src/llama.cpp/common/speculative.h
@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16; // max drafted tokens
     int n_reuse = 256;
 
-    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
package/src/llama.cpp/docs/build.md
@@ -3,7 +3,7 @@
 **To get the Code:**
 
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
 
@@ -46,7 +46,7 @@ cmake --build build --config Release
 ```
 
 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
-  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
     - Tab Workload: Desktop-development with C++
     - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
   - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
@@ -197,20 +197,52 @@ The following compilation options are also available to tweak performance:
 
 ## MUSA
 
-This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
 
-- Using `CMake`:
+#### Download directly from Moore Threads
 
-  ```bash
-  cmake -B build -DGGML_MUSA=ON
+You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
+
+### Compilation
+
+```bash
+cmake -B build -DGGML_MUSA=ON
+cmake --build build --config Release
+```
+
+#### Override Compute Capability Specifications
+
+By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
+
+```bash
+cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+```
+
+This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
+
+#### Compilation options
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
+  ```
+  cmake -B build -DGGML_MUSA=ON \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
   cmake --build build --config Release
   ```
 
-The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+### Runtime MUSA environmental variables
 
-The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
 
-Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+```bash
+# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
+MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
 
 ## HIP
 
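To make the runtime notes in the MUSA hunk above concrete, the two documented environment variables can be combined on one command line. This is a hedged sketch only; the binary and model path are reused from the documentation's own example and nothing beyond the combination itself is taken from this diff.

```bash
# Hedged sketch: combine the runtime variables documented in the MUSA section above.
# Hide the first compute device and allow allocations to spill to system RAM when VRAM is exhausted.
MUSA_VISIBLE_DEVICES="-0" GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 \
  ./build/bin/llama-server --model /srv/models/llama.gguf
```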
@@ -227,6 +259,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
 However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
+To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
+
+The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
+
+As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checkout the corresponding version tag (e.g. `rocm-6.2.4`) and set `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works under Windows despite not officially supported by AMD.
+
 Note that if you get the following error:
 ```
 clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
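The new rocWMMA paragraph above describes a manual-checkout workflow without showing the commands; below is a rough sketch. Only `-DGGML_HIP_ROCWMMA_FATTN=ON`, the `rocm-6.2.4` tag and the `-I<path>/library/include/` flag come from the documentation change; the `-DGGML_HIP=ON` toggle and the local clone location are assumptions, not taken from this diff.

```bash
# Hedged sketch: HIP build using a manually cloned rocWMMA.
# GGML_HIP=ON and the clone path are assumptions; adjust for your setup.
git clone https://github.com/ROCm/rocWMMA
git -C rocWMMA checkout rocm-6.2.4
cmake -B build \
  -DGGML_HIP=ON \
  -DGGML_HIP_ROCWMMA_FATTN=ON \
  -DCMAKE_CXX_FLAGS="-I$(pwd)/rocWMMA/library/include/"
cmake --build build --config Release
```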
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp
@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "control_vector.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }
@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
     }
 
     // write output vectors to gguf
-    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
+    export_gguf(ctx_train.v_final, params.out_file, model_hint);
     llama_backend_free();
 
 
package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -4,6 +4,7 @@
 #include "llama.h"
 
 #include <ctime>
+#include <algorithm>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
package/src/llama.cpp/examples/export-lora/export-lora.cpp
@@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "ggml-lora-merged-f16.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
         exit(EXIT_FAILURE);
     }
 
-    printf("done, output file is %s\n", params.lora_outfile.c_str());
+    printf("done, output file is %s\n", params.out_file.c_str());
 
     return 0;
 }
package/src/llama.cpp/examples/imatrix/imatrix.cpp
@@ -3,6 +3,7 @@
 #include "log.h"
 #include "llama.h"
 
+#include <chrono>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -99,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
 
     // this has been adapted to the new format of storing merged experts in a single 3d tensor
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
         // ids  -> [n_experts_used, n_tokens]
         // src1 -> [cols, n_expert_used, n_tokens]
@@ -205,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
 void IMatrixCollector::save_imatrix(int ncall) const {
     auto fname = m_params.out_file;
-    if (fname.empty()) {
-        fname = "imatrix.dat";
-    }
 
     if (ncall > 0) {
         fname += ".at_";
@@ -582,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "imatrix.dat" ;
+
     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp
@@ -876,8 +876,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
     static const std::string build_commit;
     static const int build_number;
-    static const std::string cpu_info;
-    static const std::string gpu_info;
+    const std::string cpu_info;
+    const std::string gpu_info;
     std::string model_filename;
     std::string model_type;
     uint64_t model_size;
@@ -903,7 +903,10 @@ struct test {
     std::string test_time;
     std::vector<uint64_t> samples_ns;
 
-    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
+    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
+        cpu_info(get_cpu_info()),
+        gpu_info(get_gpu_info()) {
+
         model_filename = inst.model;
         char buf[128];
         llama_model_desc(lmodel, buf, sizeof(buf));
@@ -1058,8 +1061,6 @@ struct test {
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const std::string test::cpu_info = get_cpu_info();
-const std::string test::gpu_info = get_gpu_info();
 
 struct printer {
     virtual ~printer() {}
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -14,7 +14,7 @@ project("llama-android")
 #include(FetchContent)
 #FetchContent_Declare(
 #        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_REPOSITORY https://github.com/ggml-org/llama.cpp
 #        GIT_TAG master
 #)