cui-llama.rn 1.2.6 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/android/src/main/CMakeLists.txt +26 -6
- package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
- package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
- package/android/src/main/jni.cpp +228 -40
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/cpp/amx/amx.cpp +196 -0
- package/cpp/amx/amx.h +20 -0
- package/cpp/amx/common.h +101 -0
- package/cpp/amx/mmq.cpp +2524 -0
- package/cpp/amx/mmq.h +16 -0
- package/cpp/common.cpp +118 -251
- package/cpp/common.h +53 -30
- package/cpp/ggml-aarch64.c +46 -3395
- package/cpp/ggml-aarch64.h +0 -20
- package/cpp/ggml-alloc.c +6 -8
- package/cpp/ggml-backend-impl.h +33 -11
- package/cpp/ggml-backend-reg.cpp +423 -0
- package/cpp/ggml-backend.cpp +14 -676
- package/cpp/ggml-backend.h +46 -9
- package/cpp/ggml-common.h +6 -0
- package/cpp/ggml-cpu-aarch64.c +3823 -0
- package/cpp/ggml-cpu-aarch64.h +32 -0
- package/cpp/ggml-cpu-impl.h +14 -242
- package/cpp/ggml-cpu-quants.c +10835 -0
- package/cpp/ggml-cpu-quants.h +63 -0
- package/cpp/ggml-cpu.c +13971 -13720
- package/cpp/ggml-cpu.cpp +715 -0
- package/cpp/ggml-cpu.h +65 -63
- package/cpp/ggml-impl.h +285 -25
- package/cpp/ggml-metal.h +8 -8
- package/cpp/ggml-metal.m +1221 -728
- package/cpp/ggml-quants.c +189 -10681
- package/cpp/ggml-quants.h +78 -125
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +12 -0
- package/cpp/ggml.c +688 -1460
- package/cpp/ggml.h +58 -244
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-sampling.cpp +5 -2
- package/cpp/llama.cpp +409 -123
- package/cpp/llama.h +8 -4
- package/cpp/rn-llama.hpp +89 -25
- package/cpp/sampling.cpp +42 -3
- package/cpp/sampling.h +22 -1
- package/cpp/sgemm.cpp +608 -0
- package/cpp/speculative.cpp +270 -0
- package/cpp/speculative.h +28 -0
- package/cpp/unicode.cpp +11 -0
- package/ios/RNLlama.mm +43 -20
- package/ios/RNLlamaContext.h +9 -3
- package/ios/RNLlamaContext.mm +146 -33
- package/jest/mock.js +0 -1
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +4 -2
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +52 -15
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +2 -1
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +51 -15
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +122 -8
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts +5 -6
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +15 -6
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +2 -1
- package/src/NativeRNLlama.ts +135 -13
- package/src/grammar.ts +10 -8
- package/src/index.ts +104 -28
package/cpp/amx/mmq.h
ADDED
@@ -0,0 +1,16 @@
+#pragma once
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+size_t lm_ggml_backend_amx_get_alloc_size(const struct lm_ggml_tensor * tensor);
+
+void lm_ggml_backend_amx_convert_weight(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+
+void lm_ggml_backend_amx_mul_mat(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
package/cpp/common.cpp
CHANGED
@@ -542,12 +542,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
                 [](const unsigned char c) { return !std::isprint(c); }),
             detokenized.end());
 
-        buf << "\n"
-            << "
-            << "
-            << "
-            << "
-            << "
+        buf << "\n" << std::to_string(i)
+            << ", token '" << detokenized << "'"
+            << ", pos " << std::to_string(batch.pos[i])
+            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ", seq_id " << std::to_string(batch.seq_id[i][0])
+            << ", logits " << std::to_string(batch.logits[i]);
     }
 
     buf << " ]";
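For reference, the reworked trace renders each batch entry on its own line, e.g. `0, token 'Hello', pos 0, ...`. A standalone mimic of the new format (plain vectors stand in for llama_batch's parallel arrays; the tokens and values are made up):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
    // Stand-ins for llama_batch's parallel arrays; contents are illustrative.
    const std::vector<std::string> detok = {"Hello", " world"};
    const std::vector<int> pos = {0, 1}, n_seq_id = {1, 1}, seq_id = {0, 0}, logits = {0, 1};

    std::stringstream buf;
    buf << "[";
    for (size_t i = 0; i < detok.size(); i++) {
        buf << "\n" << std::to_string(i)
            << ", token '"   << detok[i] << "'"
            << ", pos "      << std::to_string(pos[i])
            << ", n_seq_id " << std::to_string(n_seq_id[i])
            << ", seq_id "   << std::to_string(seq_id[i])
            << ", logits "   << std::to_string(logits[i]);
    }
    buf << " ]";

    std::cout << buf.str() << "\n";
    return 0;
}
```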
@@ -658,7 +658,17 @@ bool fs_validate_filename(const std::string & filename) {
 
     std::u32string filename_utf32;
     try {
+#if defined(__clang__)
+        // disable C++17 deprecation warning for std::codecvt_utf8
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
         std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
         filename_utf32 = converter.from_bytes(filename);
 
         // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
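The same clang-only suppression can be exercised in isolation. A minimal sketch (C++17; the helper name and test string are illustrative) that wraps the deprecated std::codecvt_utf8 exactly as the hunk above does:

```cpp
#include <codecvt>
#include <iostream>
#include <locale>
#include <string>

// UTF-8 -> UTF-32, silencing clang's C++17 deprecation warning for
// std::codecvt_utf8 the same way the diff above does.
static std::u32string utf8_to_utf32(const std::string & s) {
#if defined(__clang__)
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
#if defined(__clang__)
#    pragma clang diagnostic pop
#endif
    return converter.from_bytes(s);
}

int main() {
    // "héllo" is 6 bytes of UTF-8 but 5 code points.
    std::cout << utf8_to_utf32("héllo").size() << "\n";  // 5
    return 0;
}
```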
@@ -835,9 +845,9 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
     } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -881,6 +891,12 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
+        llama_free_model(model);
+        return iparams;
+    }
+
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
@@ -925,9 +941,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         common_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
-    if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sparams.ignore_eos = false;
+        params.sampling.ignore_eos = false;
     }
 
     if (params.warmup) {
@@ -979,9 +995,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
     }
 }
 
-struct llama_model_params common_model_params_to_llama(const common_params & params) {
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();
 
+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
@@ -1013,6 +1032,9 @@ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
     if (s == "f16") {
         return LM_GGML_TYPE_F16;
     }
+    if (s == "bf16") {
+        return LM_GGML_TYPE_BF16;
+    }
     if (s == "q8_0") {
         return LM_GGML_TYPE_Q8_0;
     }
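With this change, "bf16" becomes an accepted KV-cache type string alongside "f16" and the quantized options. A standalone sketch of the same lookup idiom (the enum is a stand-in for the real LM_GGML_TYPE_* constants, and the throw stands in for the upstream error path on unknown input):

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

// Stand-in for ggml's type enum; only the members needed here.
enum class cache_type { F16, BF16, Q8_0 };

// Same idiom as kv_cache_type_from_str: exact match per supported name.
static cache_type cache_type_from_str(const std::string & s) {
    if (s == "f16")  { return cache_type::F16;  }
    if (s == "bf16") { return cache_type::BF16; }  // newly accepted in this release
    if (s == "q8_0") { return cache_type::Q8_0; }
    throw std::runtime_error("unsupported cache type: " + s);
}

int main() {
    std::cout << std::boolalpha
              << (cache_type_from_str("bf16") == cache_type::BF16) << "\n";  // true
    return 0;
}
```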
@@ -1340,17 +1362,17 @@ static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 }
 
 struct llama_model * common_load_model_from_url(
-    const
-    const
-    const
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
     const struct llama_model_params & params) {
     // Basic validation of the model_url
-    if (
+    if (model_url.empty()) {
         LOG_ERR("%s: invalid model_url\n", __func__);
         return NULL;
     }
 
-    if (!common_download_file(model_url,
+    if (!common_download_file(model_url, local_path, hf_token)) {
         return NULL;
     }
 
@@ -1361,9 +1383,9 @@ struct llama_model * common_load_model_from_url(
         /*.no_alloc = */ true,
         /*.ctx = */ NULL,
     };
-    auto * ctx_gguf = lm_gguf_init_from_file(
+    auto * ctx_gguf = lm_gguf_init_from_file(local_path.c_str(), lm_gguf_params);
     if (!ctx_gguf) {
-        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__,
+        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
         return NULL;
     }
 
@@ -1382,13 +1404,13 @@ struct llama_model * common_load_model_from_url(
     // Verify the first split file format
     // and extract split URL and PATH prefixes
     {
-        if (!llama_split_prefix(split_prefix, sizeof(split_prefix),
-            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__,
+        if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
             return NULL;
         }
 
-        if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-            LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+        if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+            LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
             return NULL;
         }
     }
@@ -1415,14 +1437,14 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(
+    return llama_load_model_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
-    const
-    const
-    const
-    const
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
     const struct llama_model_params & params) {
     // construct hugging face model url:
     //
@@ -1436,27 +1458,27 @@ struct llama_model * common_load_model_from_hf(
     std::string model_url = "https://huggingface.co/";
     model_url += repo;
     model_url += "/resolve/main/";
-    model_url +=
+    model_url += remote_path;
 
-    return common_load_model_from_url(model_url
+    return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
 #else
 
 struct llama_model * common_load_model_from_url(
-    const
-    const
-    const
+    const std::string & /*model_url*/,
+    const std::string & /*local_path*/,
+    const std::string & /*hf_token*/,
     const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }
 
 struct llama_model * common_load_model_from_hf(
-    const
-    const
-    const
-    const
+    const std::string & /*repo*/,
+    const std::string & /*remote_path*/,
+    const std::string & /*local_path*/,
+    const std::string & /*hf_token*/,
     const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
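The Hugging Face branch above only does string concatenation before delegating to the URL loader. A standalone sketch of that construction (the repo and file names are illustrative):

```cpp
#include <iostream>
#include <string>

// Mirrors common_load_model_from_hf's URL assembly:
//   https://huggingface.co/<repo>/resolve/main/<remote_path>
static std::string hf_model_url(const std::string & repo, const std::string & remote_path) {
    std::string model_url = "https://huggingface.co/";
    model_url += repo;
    model_url += "/resolve/main/";
    model_url += remote_path;
    return model_url;
}

int main() {
    // Illustrative repo/file; any GGUF hosted on the Hub resolves this way.
    std::cout << hf_model_url("ggml-org/models", "tinyllamas/stories260K.gguf") << "\n";
    return 0;
}
```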
@@ -1491,6 +1513,66 @@ void common_batch_add(
     batch.n_tokens++;
 }
 
+//
+// Token utils
+//
+
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+    return i;
+}
+
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
+
+    // get the lengths of the input sequences
+    size_t a_len = a.size();
+    size_t b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    size_t max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (size_t i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (size_t j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
+}
+
 //
 // Vocab utils
 //
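A quick standalone check of what these helpers compute (llama_tokens is assumed to be a vector of int32 token ids, matching llama.cpp's alias; the token values are made up). Note that common_lcs, despite the "subsequence" wording in its comments, tracks contiguous matches, i.e. the longest common substring, which is what prompt-prefix reuse needs:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

using llama_tokens = std::vector<int32_t>;  // assumption: mirrors llama.cpp's alias

// Longest common prefix, as in common_lcp above.
static size_t lcp(const llama_tokens & a, const llama_tokens & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) { i++; }
    return i;
}

// Longest common contiguous run, equivalent to what common_lcs computes.
static size_t lcs(const llama_tokens & a, const llama_tokens & b) {
    size_t best = 0;
    std::vector<size_t> prev(b.size() + 1, 0), curr(b.size() + 1, 0);
    for (size_t i = 1; i <= a.size(); i++) {
        for (size_t j = 1; j <= b.size(); j++) {
            curr[j] = (a[i - 1] == b[j - 1]) ? prev[j - 1] + 1 : 0;
            best = std::max(best, curr[j]);
        }
        std::swap(prev, curr);
    }
    return best;
}

int main() {
    const llama_tokens cached = {1, 15043, 29892, 920, 526};
    const llama_tokens prompt = {1, 15043, 29892, 825, 29915};

    std::cout << "lcp: " << lcp(cached, prompt) << "\n";  // 3, reusable prefix
    std::cout << "lcs: " << lcs(cached, prompt) << "\n";  // 3, same run at the start
    return 0;
}
```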
@@ -1897,218 +1979,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
     return result;
 }
 
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%e, ", data[i]);
-    }
-    fprintf(stream, "%e]\n", data.back());
-}
-
-void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%d, ", data[i]);
-    }
-    fprintf(stream, "%d]\n", data.back());
-}
-
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
-    std::string data_str(data == NULL ? "" : data);
-
-    if (data_str.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    size_t pos_start = 0;
-    size_t pos_found = 0;
-
-    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
-        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
-        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
-        data_str = "\"" + data_str + "\"";
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    if (data_str.find('\n') == std::string::npos) {
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    fprintf(stream, "%s: |\n", prop_name);
-    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
-        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
-        pos_start = pos_found + 1;
-    }
-}
-
-void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
-                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    lm_ggml_cpu_init(); // some ARM features are detected at runtime
-
-    const auto & sparams = params.sparams;
-
-    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
-    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n", lm_ggml_cpu_has_avx_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n", lm_ggml_cpu_has_avx512() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", lm_ggml_cpu_has_avx512_vbmi() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vnni: %s\n", lm_ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cuda: %s\n", lm_ggml_cpu_has_cuda() ? "true" : "false");
-    fprintf(stream, "cpu_has_vulkan: %s\n", lm_ggml_cpu_has_vulkan() ? "true" : "false");
-    fprintf(stream, "cpu_has_kompute: %s\n", lm_ggml_cpu_has_kompute() ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n", lm_ggml_cpu_has_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n", lm_ggml_cpu_has_gpublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n", lm_ggml_cpu_has_neon() ? "true" : "false");
-    fprintf(stream, "cpu_has_sve: %s\n", lm_ggml_cpu_has_sve() ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n", lm_ggml_cpu_has_f16c() ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n", lm_ggml_cpu_has_fp16_va() ? "true" : "false");
-    fprintf(stream, "cpu_has_riscv_v: %s\n", lm_ggml_cpu_has_riscv_v() ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n", lm_ggml_cpu_has_wasm_simd() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n", lm_ggml_cpu_has_vsx() ? "true" : "false");
-    fprintf(stream, "cpu_has_matmul_int8: %s\n", lm_ggml_cpu_has_matmul_int8() ? "true" : "false");
-
-#ifdef NDEBUG
-    fprintf(stream, "debug: false\n");
-#else
-    fprintf(stream, "debug: true\n");
-#endif // NDEBUG
-
-    fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
-#ifdef __OPTIMIZE__
-    fprintf(stream, "optimize: true\n");
-#else
-    fprintf(stream, "optimize: false\n");
-#endif // __OPTIMIZE__
-
-    fprintf(stream, "time: %s\n", timestamp.c_str());
-
-    fprintf(stream, "\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "# User Inputs #\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
-    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
-    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
-    fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
-    fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
-    fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
-    fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
-    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-    yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
-    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
-    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-    fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
-
-    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
-    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
-    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
-    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
-    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
-    fprintf(stream, "logit_bias:\n");
-    for (const auto & logit_bias : sparams.logit_bias) {
-        fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
-    }
-
-    fprintf(stream, "lora:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale == 1.0f) {
-            fprintf(stream, " - %s\n", la.path.c_str());
-        }
-    }
-    fprintf(stream, "lora_scaled:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale != 1.0f) {
-            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
-        }
-    }
-    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
-    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
-    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
-    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
-    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
-    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
-    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
-    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
-    yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
-    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
-    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
-    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
-    yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
-
-    fprintf(stream, "reverse_prompt:\n");
-    for (std::string ap : params.antiprompt) {
-        size_t pos = 0;
-        while ((pos = ap.find('\n', pos)) != std::string::npos) {
-            ap.replace(pos, 1, "\\n");
-            pos += 1;
-        }
-
-        fprintf(stream, " - %s\n", ap.c_str());
-    }
-
-    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
-    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
-    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
-
-    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-    fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
-    fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
-    fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
-    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
-}