cui-llama.rn 1.2.6 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +26 -6
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +228 -40
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/amx/amx.cpp +196 -0
  9. package/cpp/amx/amx.h +20 -0
  10. package/cpp/amx/common.h +101 -0
  11. package/cpp/amx/mmq.cpp +2524 -0
  12. package/cpp/amx/mmq.h +16 -0
  13. package/cpp/common.cpp +118 -251
  14. package/cpp/common.h +53 -30
  15. package/cpp/ggml-aarch64.c +46 -3395
  16. package/cpp/ggml-aarch64.h +0 -20
  17. package/cpp/ggml-alloc.c +6 -8
  18. package/cpp/ggml-backend-impl.h +33 -11
  19. package/cpp/ggml-backend-reg.cpp +423 -0
  20. package/cpp/ggml-backend.cpp +14 -676
  21. package/cpp/ggml-backend.h +46 -9
  22. package/cpp/ggml-common.h +6 -0
  23. package/cpp/ggml-cpu-aarch64.c +3823 -0
  24. package/cpp/ggml-cpu-aarch64.h +32 -0
  25. package/cpp/ggml-cpu-impl.h +14 -242
  26. package/cpp/ggml-cpu-quants.c +10835 -0
  27. package/cpp/ggml-cpu-quants.h +63 -0
  28. package/cpp/ggml-cpu.c +13971 -13720
  29. package/cpp/ggml-cpu.cpp +715 -0
  30. package/cpp/ggml-cpu.h +65 -63
  31. package/cpp/ggml-impl.h +285 -25
  32. package/cpp/ggml-metal.h +8 -8
  33. package/cpp/ggml-metal.m +1221 -728
  34. package/cpp/ggml-quants.c +189 -10681
  35. package/cpp/ggml-quants.h +78 -125
  36. package/cpp/ggml-threading.cpp +12 -0
  37. package/cpp/ggml-threading.h +12 -0
  38. package/cpp/ggml.c +688 -1460
  39. package/cpp/ggml.h +58 -244
  40. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  41. package/cpp/json.hpp +24766 -24766
  42. package/cpp/llama-sampling.cpp +5 -2
  43. package/cpp/llama.cpp +409 -123
  44. package/cpp/llama.h +8 -4
  45. package/cpp/rn-llama.hpp +89 -25
  46. package/cpp/sampling.cpp +42 -3
  47. package/cpp/sampling.h +22 -1
  48. package/cpp/sgemm.cpp +608 -0
  49. package/cpp/speculative.cpp +270 -0
  50. package/cpp/speculative.h +28 -0
  51. package/cpp/unicode.cpp +11 -0
  52. package/ios/RNLlama.mm +43 -20
  53. package/ios/RNLlamaContext.h +9 -3
  54. package/ios/RNLlamaContext.mm +146 -33
  55. package/jest/mock.js +0 -1
  56. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  57. package/lib/commonjs/grammar.js +4 -2
  58. package/lib/commonjs/grammar.js.map +1 -1
  59. package/lib/commonjs/index.js +52 -15
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/module/NativeRNLlama.js.map +1 -1
  62. package/lib/module/grammar.js +2 -1
  63. package/lib/module/grammar.js.map +1 -1
  64. package/lib/module/index.js +51 -15
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/typescript/NativeRNLlama.d.ts +122 -8
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  68. package/lib/typescript/grammar.d.ts +5 -6
  69. package/lib/typescript/grammar.d.ts.map +1 -1
  70. package/lib/typescript/index.d.ts +15 -6
  71. package/lib/typescript/index.d.ts.map +1 -1
  72. package/package.json +2 -1
  73. package/src/NativeRNLlama.ts +135 -13
  74. package/src/grammar.ts +10 -8
  75. package/src/index.ts +104 -28
package/cpp/amx/mmq.h ADDED
@@ -0,0 +1,16 @@
+ #pragma once
+ #include "common.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ size_t lm_ggml_backend_amx_get_alloc_size(const struct lm_ggml_tensor * tensor);
+
+ void lm_ggml_backend_amx_convert_weight(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+
+ void lm_ggml_backend_amx_mul_mat(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+
+ #ifdef __cplusplus
+ }
+ #endif
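
These three declarations are the entire public surface of the new AMX path: query how many bytes the packed tile layout needs, repack plain weight data into that layout, and run the matmul on the packed weights. A minimal sketch of how a backend might chain them, assuming a `weight` tensor, its raw `raw_data` bytes, and the `params`/`dst` of the surrounding graph code (all placeholder names, not the literal call sites in amx.cpp):

    // sketch only: size the backend buffer for the packed layout
    size_t packed_size = lm_ggml_backend_amx_get_alloc_size(weight);
    // repack the row-major weight bytes into the AMX tile layout
    lm_ggml_backend_amx_convert_weight(weight, raw_data, /*offset=*/0, lm_ggml_nbytes(weight));
    // at graph-compute time the AMX kernel consumes the packed weight
    lm_ggml_backend_amx_mul_mat(params, dst);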
package/cpp/common.cpp CHANGED
@@ -542,12 +542,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
  [](const unsigned char c) { return !std::isprint(c); }),
  detokenized.end());

- buf << "\n" << std::to_string(i)
-     << ":token '" << detokenized << "'"
-     << ":pos " << std::to_string(batch.pos[i])
-     << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
-     << ":seq_id " << std::to_string(batch.seq_id[i][0])
-     << ":logits " << std::to_string(batch.logits[i]);
+ buf << "\n" << std::to_string(i)
+     << ", token '" << detokenized << "'"
+     << ", pos " << std::to_string(batch.pos[i])
+     << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+     << ", seq_id " << std::to_string(batch.seq_id[i][0])
+     << ", logits " << std::to_string(batch.logits[i]);
  }

  buf << " ]";
@@ -658,7 +658,17 @@ bool fs_validate_filename(const std::string & filename) {

  std::u32string filename_utf32;
  try {
+ #if defined(__clang__)
+     // disable C++17 deprecation warning for std::codecvt_utf8
+ #    pragma clang diagnostic push
+ #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+ #endif
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+
+ #if defined(__clang__)
+ #    pragma clang diagnostic pop
+ #endif
+
  filename_utf32 = converter.from_bytes(filename);

  // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
@@ -835,9 +845,9 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_model * model = nullptr;

  if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
  } else if (!params.model_url.empty()) {
- model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
  } else {
  model = llama_load_model_from_file(params.model.c_str(), mparams);
  }
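
With these overloads the call sites pass std::string values straight through instead of juggling .c_str(). A hedged usage sketch (repo, file, and path values are placeholders, not from this diff):

    common_params params;
    params.hf_repo = "some-org/some-model-GGUF";    // placeholder HF repo
    params.hf_file = "some-model.Q4_K_M.gguf";      // placeholder file within the repo
    params.model   = "/data/local/tmp/model.gguf";  // local path the download lands at
    struct llama_model_params mparams = common_model_params_to_llama(params);
    // precedence as above: hf_repo/hf_file first, then model_url, then the local file
    llama_model * model = common_load_model_from_hf(
        params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);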
@@ -881,6 +891,12 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }

+ if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+     LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
+     llama_free_model(model);
+     return iparams;
+ }
+
  if (!params.control_vectors.empty()) {
  if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
  if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
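
This guard makes common_init_from_params fail fast, via llama_kv_cache_can_shift, instead of erroring later when context shifting is requested for a model whose KV cache cannot be shifted. A caller that still wants to load such a model opts out first (a sketch):

    common_params params;
    params.ctx_shift = false; // the programmatic equivalent of the --no-context-shift flag named in the error message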
@@ -925,9 +941,9 @@ struct common_init_result common_init_from_params(common_params & params) {
  common_lora_adapters_apply(lctx, iparams.lora_adapters);
  }

- if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+ if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
  LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
- params.sparams.ignore_eos = false;
+ params.sampling.ignore_eos = false;
  }

  if (params.warmup) {
@@ -979,9 +995,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
  }
  }

- struct llama_model_params common_model_params_to_llama(const common_params & params) {
+ struct llama_model_params common_model_params_to_llama(common_params & params) {
  auto mparams = llama_model_default_params();

+ if (!params.devices.empty()) {
+     mparams.devices = params.devices.data();
+ }
  if (params.n_gpu_layers != -1) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }
@@ -1013,6 +1032,9 @@ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
  if (s == "f16") {
  return LM_GGML_TYPE_F16;
  }
+ if (s == "bf16") {
+     return LM_GGML_TYPE_BF16;
+ }
  if (s == "q8_0") {
  return LM_GGML_TYPE_Q8_0;
  }
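
kv_cache_type_from_str stays private to common.cpp, but the visible effect is that "bf16" joins the accepted KV-cache type strings. Assuming the usual string-valued cache_type_k/cache_type_v fields on common_params, it would be exercised like this (a sketch, not code from this diff):

    common_params params;
    params.cache_type_k = "bf16"; // now maps to LM_GGML_TYPE_BF16 instead of hitting the unsupported-type error
    params.cache_type_v = "f16";  // K and V cache types can differ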
@@ -1340,17 +1362,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
  }

  struct llama_model * common_load_model_from_url(
-     const char * model_url,
-     const char * path_model,
-     const char * hf_token,
+     const std::string & model_url,
+     const std::string & local_path,
+     const std::string & hf_token,
      const struct llama_model_params & params) {
  // Basic validation of the model_url
- if (!model_url || strlen(model_url) == 0) {
+ if (model_url.empty()) {
  LOG_ERR("%s: invalid model_url\n", __func__);
  return NULL;
  }

- if (!common_download_file(model_url, path_model, hf_token)) {
+ if (!common_download_file(model_url, local_path, hf_token)) {
  return NULL;
  }

@@ -1361,9 +1383,9 @@ struct llama_model * common_load_model_from_url(
  /*.no_alloc = */ true,
  /*.ctx      = */ NULL,
  };
- auto * ctx_gguf = lm_gguf_init_from_file(path_model, lm_gguf_params);
+ auto * ctx_gguf = lm_gguf_init_from_file(local_path.c_str(), lm_gguf_params);
  if (!ctx_gguf) {
- LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+ LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
  return NULL;
  }

@@ -1382,13 +1404,13 @@ struct llama_model * common_load_model_from_url(
  // Verify the first split file format
  // and extract split URL and PATH prefixes
  {
- if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
-     LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+     LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
  return NULL;
  }

- if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-     LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+     LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
  return NULL;
  }
  }
@@ -1415,14 +1437,14 @@ struct llama_model * common_load_model_from_url(
  }
  }

- return llama_load_model_from_file(path_model, params);
+ return llama_load_model_from_file(local_path.c_str(), params);
  }

  struct llama_model * common_load_model_from_hf(
-     const char * repo,
-     const char * model,
-     const char * path_model,
-     const char * hf_token,
+     const std::string & repo,
+     const std::string & remote_path,
+     const std::string & local_path,
+     const std::string & hf_token,
      const struct llama_model_params & params) {
  // construct hugging face model url:
  //
@@ -1436,27 +1458,27 @@ struct llama_model * common_load_model_from_hf(
  std::string model_url = "https://huggingface.co/";
  model_url += repo;
  model_url += "/resolve/main/";
- model_url += model;
+ model_url += remote_path;

- return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+ return common_load_model_from_url(model_url, local_path, hf_token, params);
  }

  #else

  struct llama_model * common_load_model_from_url(
-     const char * /*model_url*/,
-     const char * /*path_model*/,
-     const char * /*hf_token*/,
+     const std::string & /*model_url*/,
+     const std::string & /*local_path*/,
+     const std::string & /*hf_token*/,
      const struct llama_model_params & /*params*/) {
  LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
  return nullptr;
  }

  struct llama_model * common_load_model_from_hf(
-     const char * /*repo*/,
-     const char * /*model*/,
-     const char * /*path_model*/,
-     const char * /*hf_token*/,
+     const std::string & /*repo*/,
+     const std::string & /*remote_path*/,
+     const std::string & /*local_path*/,
+     const std::string & /*hf_token*/,
      const struct llama_model_params & /*params*/) {
  LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
  return nullptr;
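
For orientation, the renamed parameters trace through the URL builder above like so (placeholder values):

    // repo        = "some-org/some-model-GGUF"
    // remote_path = "some-model.Q4_K_M.gguf"
    // model_url   = "https://huggingface.co/some-org/some-model-GGUF/resolve/main/some-model.Q4_K_M.gguf"

so the Hugging Face variant is just URL construction plus delegation to common_load_model_from_url with the local cache path.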
@@ -1491,6 +1513,66 @@ void common_batch_add(
  batch.n_tokens++;
  }

+ //
+ // Token utils
+ //
+
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+     size_t i;
+     for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+     return i;
+ }
+
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+     // check for empty sequences
+     if (a.empty() || b.empty()) {
+         return 0;
+     }
+
+     // get the lengths of the input sequences
+     size_t a_len = a.size();
+     size_t b_len = b.size();
+
+     // initialize the maximum length of the longest common subsequence (LCS)
+     size_t max_length = 0;
+
+     // use two rows instead of a 2D matrix to optimize space
+     std::vector<size_t> prev_row(b_len + 1, 0);
+     std::vector<size_t> curr_row(b_len + 1, 0);
+
+     // iterate through the elements of a
+     for (size_t i = 1; i <= a_len; i++) {
+         // iterate through the elements of b
+         for (size_t j = 1; j <= b_len; j++) {
+             // if elements at the current positions match
+             if (a[i - 1] == b[j - 1]) {
+                 // if it's the first element of either sequences, set LCS length to 1
+                 if (i == 1 || j == 1) {
+                     curr_row[j] = 1;
+                 } else {
+                     // increment LCS length by 1 compared to the previous element
+                     curr_row[j] = prev_row[j - 1] + 1;
+                 }
+
+                 // update max_length if necessary
+                 if (curr_row[j] > max_length) {
+                     max_length = curr_row[j];
+                 }
+             } else {
+                 // reset LCS length if elements don't match
+                 curr_row[j] = 0;
+             }
+         }
+
+         // update the previous row for the next iteration
+         prev_row = curr_row;
+     }
+
+     // return the maximum length of the LCS
+     return max_length;
+ }
+
  //
  // Vocab utils
  //
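
One nuance worth flagging: despite the "subsequence" wording in its comments, the recurrence resets to zero on mismatch, so common_lcs computes the longest common contiguous run (a longest common substring); together with common_lcp this supports prompt reuse, presumably for the speculative.cpp added elsewhere in this release. A quick worked example, assuming llama_tokens is the usual std::vector<llama_token> alias:

    llama_tokens a = {1, 2, 3, 9, 5, 6};
    llama_tokens b = {1, 2, 4, 9, 5, 6};

    size_t lcp = common_lcp(a, b); // == 2: shared prefix {1, 2}
    size_t lcs = common_lcs(a, b); // == 3: longest contiguous match {9, 5, 6}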
@@ -1897,218 +1979,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
  return result;
  }

- //
- // YAML utils
- //
-
- void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
-     if (data.empty()) {
-         fprintf(stream, "%s:\n", prop_name);
-         return;
-     }
-
-     fprintf(stream, "%s: [", prop_name);
-     for (size_t i = 0; i < data.size() - 1; ++i) {
-         fprintf(stream, "%e, ", data[i]);
-     }
-     fprintf(stream, "%e]\n", data.back());
- }
-
- void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
-     if (data.empty()) {
-         fprintf(stream, "%s:\n", prop_name);
-         return;
-     }
-
-     fprintf(stream, "%s: [", prop_name);
-     for (size_t i = 0; i < data.size() - 1; ++i) {
-         fprintf(stream, "%d, ", data[i]);
-     }
-     fprintf(stream, "%d]\n", data.back());
- }
-
- void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
-     std::string data_str(data == NULL ? "" : data);
-
-     if (data_str.empty()) {
-         fprintf(stream, "%s:\n", prop_name);
-         return;
-     }
-
-     size_t pos_start = 0;
-     size_t pos_found = 0;
-
-     if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
-         data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
-         data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-         data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
-         data_str = "\"" + data_str + "\"";
-         fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-         return;
-     }
-
-     if (data_str.find('\n') == std::string::npos) {
-         fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-         return;
-     }
-
-     fprintf(stream, "%s: |\n", prop_name);
-     while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
-         fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
-         pos_start = pos_found + 1;
-     }
- }
-
- void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
-                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-     lm_ggml_cpu_init(); // some ARM features are detected at runtime
-
-     const auto & sparams = params.sparams;
-
-     fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
-     fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
-     fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false");
-     fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false");
-     fprintf(stream, "cpu_has_avx_vnni: %s\n", lm_ggml_cpu_has_avx_vnni() ? "true" : "false");
-     fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false");
-     fprintf(stream, "cpu_has_avx512: %s\n", lm_ggml_cpu_has_avx512() ? "true" : "false");
-     fprintf(stream, "cpu_has_avx512_vbmi: %s\n", lm_ggml_cpu_has_avx512_vbmi() ? "true" : "false");
-     fprintf(stream, "cpu_has_avx512_vnni: %s\n", lm_ggml_cpu_has_avx512_vnni() ? "true" : "false");
-     fprintf(stream, "cpu_has_cuda: %s\n", lm_ggml_cpu_has_cuda() ? "true" : "false");
-     fprintf(stream, "cpu_has_vulkan: %s\n", lm_ggml_cpu_has_vulkan() ? "true" : "false");
-     fprintf(stream, "cpu_has_kompute: %s\n", lm_ggml_cpu_has_kompute() ? "true" : "false");
-     fprintf(stream, "cpu_has_fma: %s\n", lm_ggml_cpu_has_fma() ? "true" : "false");
-     fprintf(stream, "cpu_has_gpublas: %s\n", lm_ggml_cpu_has_gpublas() ? "true" : "false");
-     fprintf(stream, "cpu_has_neon: %s\n", lm_ggml_cpu_has_neon() ? "true" : "false");
-     fprintf(stream, "cpu_has_sve: %s\n", lm_ggml_cpu_has_sve() ? "true" : "false");
-     fprintf(stream, "cpu_has_f16c: %s\n", lm_ggml_cpu_has_f16c() ? "true" : "false");
-     fprintf(stream, "cpu_has_fp16_va: %s\n", lm_ggml_cpu_has_fp16_va() ? "true" : "false");
-     fprintf(stream, "cpu_has_riscv_v: %s\n", lm_ggml_cpu_has_riscv_v() ? "true" : "false");
-     fprintf(stream, "cpu_has_wasm_simd: %s\n", lm_ggml_cpu_has_wasm_simd() ? "true" : "false");
-     fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false");
-     fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false");
-     fprintf(stream, "cpu_has_vsx: %s\n", lm_ggml_cpu_has_vsx() ? "true" : "false");
-     fprintf(stream, "cpu_has_matmul_int8: %s\n", lm_ggml_cpu_has_matmul_int8() ? "true" : "false");
-
- #ifdef NDEBUG
-     fprintf(stream, "debug: false\n");
- #else
-     fprintf(stream, "debug: true\n");
- #endif // NDEBUG
-
-     fprintf(stream, "model_desc: %s\n", model_desc);
-     fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
- #ifdef __OPTIMIZE__
-     fprintf(stream, "optimize: true\n");
- #else
-     fprintf(stream, "optimize: false\n");
- #endif // __OPTIMIZE__
-
-     fprintf(stream, "time: %s\n", timestamp.c_str());
-
-     fprintf(stream, "\n");
-     fprintf(stream, "###############\n");
-     fprintf(stream, "# User Inputs #\n");
-     fprintf(stream, "###############\n");
-     fprintf(stream, "\n");
-
-     fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
-     fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-     fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
-     fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-     fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
-     fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
-     fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
-     fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
-     fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
-     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-     fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-     yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
-     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
-     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-     fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-     fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
-
-     yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
-     fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-     yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
-     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
-     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
-     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
-     fprintf(stream, "logit_bias:\n");
-     for (const auto & logit_bias : sparams.logit_bias) {
-         fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
-     }
-
-     fprintf(stream, "lora:\n");
-     for (auto & la : params.lora_adapters) {
-         if (la.scale == 1.0f) {
-             fprintf(stream, " - %s\n", la.path.c_str());
-         }
-     }
-     fprintf(stream, "lora_scaled:\n");
-     for (auto & la : params.lora_adapters) {
-         if (la.scale != 1.0f) {
-             fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
-         }
-     }
-     fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
-     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
-     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
-     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
-     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
-     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-     fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
-     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
-     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
-     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-     fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
-     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
-     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-     fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
-     yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
-     fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
-     fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
-     fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
-     yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
-     fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
-
-     fprintf(stream, "reverse_prompt:\n");
-     for (std::string ap : params.antiprompt) {
-         size_t pos = 0;
-         while ((pos = ap.find('\n', pos)) != std::string::npos) {
-             ap.replace(pos, 1, "\\n");
-             pos += 1;
-         }
-
-         fprintf(stream, " - %s\n", ap.c_str());
-     }
-
-     fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
-     fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-     fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
-     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-
-     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
-     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
-
-     fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
-     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-     fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
-     fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
-     fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
-     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-     fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
- }