@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
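Of the 190 files, the only diff reproduced in full below appears to be package/src/llama.cpp/common/common.cpp (entry 31 above). Its dominant change is the rename of the vendored llama.cpp "common" helpers from llama_*/gpt_* prefixes to a common_* prefix (gpt_params → common_params, llama_init_from_gpt_params → common_init_from_params, llama_tokenize → common_tokenize, and so on). The fragment below is a hedged sketch, not taken from the package, of how a caller of these helpers adapts to the rename; it assumes only the signatures visible in the diff and a llama_context obtained elsewhere.

#include <string>
#include <vector>

#include "common.h" // vendored header: package/src/llama.cpp/common/common.h

// 0.3.2 (old names):  llama_tokenize(ctx, text, add_special, parse_special)
//                     llama_token_to_piece(ctx, token, special)
// 0.3.3 (new names):  common_tokenize / common_token_to_piece, same signatures.
static std::string first_piece(llama_context * ctx, const std::string & text) {
    // tokenize with the renamed helper (signature shown in the hunk at @@ -1482,15 +1488,15 @@)
    std::vector<llama_token> toks = common_tokenize(ctx, text, /*add_special*/ true, /*parse_special*/ true);
    if (toks.empty()) {
        return "";
    }
    // convert the first token back to text with the renamed helper
    return common_token_to_piece(ctx, toks[0], /*special*/ true);
}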
@@ -12,6 +12,7 @@

  #include <algorithm>
  #include <cinttypes>
+ #include <climits>
  #include <cmath>
  #include <codecvt>
  #include <cstdarg>
@@ -23,10 +24,10 @@
  #include <regex>
  #include <sstream>
  #include <string>
+ #include <thread>
  #include <unordered_map>
  #include <unordered_set>
  #include <vector>
- #include <thread>

  #if defined(__APPLE__) && defined(__MACH__)
  #include <sys/types.h>
@@ -362,10 +363,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
  return true;
  }

- void gpt_init() {
+ void common_init() {
  llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
- if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
- gpt_log_add(gpt_log_main(), level, "%s", text);
+ if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+ common_log_add(common_log_main(), level, "%s", text);
  }
  }, NULL);

@@ -378,7 +379,7 @@ void gpt_init() {
  LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
  }

- std::string gpt_params_get_system_info(const gpt_params & params) {
+ std::string common_params_get_system_info(const common_params & params) {
  std::ostringstream os;

  os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -400,17 +401,19 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
  // String utils
  //

- std::vector<std::string> string_split(std::string input, char separator) {
- std::vector<std::string> parts;
- size_t separator_pos = input.find(separator);
- while (separator_pos != std::string::npos) {
- std::string part = input.substr(0, separator_pos);
- parts.emplace_back(part);
- input = input.substr(separator_pos + 1);
- separator_pos = input.find(separator);
- }
- parts.emplace_back(input);
- return parts;
+ std::string string_format(const char * fmt, ...) {
+ va_list ap;
+ va_list ap2;
+ va_start(ap, fmt);
+ va_copy(ap2, ap);
+ int size = vsnprintf(NULL, 0, fmt, ap);
+ GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+ std::vector<char> buf(size + 1);
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+ GGML_ASSERT(size2 == size);
+ va_end(ap2);
+ va_end(ap);
+ return std::string(buf.data(), size);
  }

  std::string string_strip(const std::string & str) {
@@ -493,7 +496,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
  first = false;
  }

- auto detokenized = llama_token_to_piece(ctx, token);
+ auto detokenized = common_token_to_piece(ctx, token);

  detokenized.erase(
  std::remove_if(
@@ -524,7 +527,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
  first = false;
  }

- auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+ auto detokenized = common_token_to_piece(ctx, batch.token[i]);

  detokenized.erase(
  std::remove_if(
@@ -819,16 +822,16 @@ std::string fs_get_cache_file(const std::string & filename) {
  //
  // Model utils
  //
- struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
- llama_init_result iparams;
- auto mparams = llama_model_params_from_gpt_params(params);
+ struct common_init_result common_init_from_params(common_params & params) {
+ common_init_result iparams;
+ auto mparams = common_model_params_to_llama(params);

  llama_model * model = nullptr;

  if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
  } else if (!params.model_url.empty()) {
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
  } else {
  model = llama_load_model_from_file(params.model.c_str(), mparams);
  }
@@ -863,7 +866,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  }
  }

- auto cparams = llama_context_params_from_gpt_params(params);
+ auto cparams = common_context_params_to_llama(params);

  llama_context * lctx = llama_new_context_with_model(model, cparams);
  if (lctx == NULL) {
@@ -876,7 +879,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
  if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);

- const auto cvec = llama_control_vector_load(params.control_vectors);
+ const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
  llama_free(lctx);
  llama_free_model(model);
@@ -900,7 +903,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
- llama_lora_adapter_container loaded_la;
+ common_lora_adapter_container loaded_la;
  loaded_la.path = la.path;
  loaded_la.scale = la.scale;
  loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -913,7 +916,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
  }
  if (!params.lora_init_without_apply) {
- llama_lora_adapters_apply(lctx, iparams.lora_adapters);
+ common_lora_adapters_apply(lctx, iparams.lora_adapters);
  }

  if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -939,7 +942,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  }

  if (llama_model_has_encoder(model)) {
- llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
  if (decoder_start_token_id == -1) {
  decoder_start_token_id = bos;
@@ -948,7 +951,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  tmp.push_back(decoder_start_token_id);
  }
  if (llama_model_has_decoder(model)) {
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
  }
  llama_kv_cache_clear(lctx);
  llama_synchronize(lctx);
@@ -961,7 +964,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  return iparams;
  }

- void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
  llama_lora_adapter_clear(ctx);
  for (auto & la : lora_adapters) {
  if (la.scale != 0.0f) {
@@ -970,7 +973,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
  }
  }

- struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+ struct llama_model_params common_model_params_to_llama(const common_params & params) {
  auto mparams = llama_model_default_params();

  if (params.n_gpu_layers != -1) {
@@ -1000,6 +1003,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
  if (s == "f16") {
  return GGML_TYPE_F16;
  }
+ if (s == "bf16") {
+ return GGML_TYPE_BF16;
+ }
  if (s == "q8_0") {
  return GGML_TYPE_Q8_0;
  }
@@ -1019,10 +1025,10 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
  return GGML_TYPE_Q5_1;
  }

- throw std::runtime_error("Invalid cache type: " + s);
+ throw std::runtime_error("Unsupported cache type: " + s);
  }

- struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+ struct llama_context_params common_context_params_to_llama(const common_params & params) {
  auto cparams = llama_context_default_params();

  cparams.n_ctx = params.n_ctx;
@@ -1031,7 +1037,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
  cparams.n_ubatch = params.n_ubatch;
  cparams.n_threads = params.cpuparams.n_threads;
  cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
- params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
  cparams.logits_all = params.logits_all;
  cparams.embeddings = params.embedding;
  cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1112,7 +1118,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
  return false;
  }

- static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
+ static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {

  // Initialize libcurl
  std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -1182,15 +1188,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
  }

  // Send a HEAD request to retrieve the etag and last-modified headers
- struct llama_load_model_from_url_headers {
+ struct common_load_model_from_url_headers {
  std::string etag;
  std::string last_modified;
  };
- llama_load_model_from_url_headers headers;
+ common_load_model_from_url_headers headers;
  {
  typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
  auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
- llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+ common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;

  static std::regex header_regex("([^:]+): (.*)\r\n");
  static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1326,7 +1332,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
  return true;
  }

- struct llama_model * llama_load_model_from_url(
+ struct llama_model * common_load_model_from_url(
  const char * model_url,
  const char * path_model,
  const char * hf_token,
@@ -1337,7 +1343,7 @@ struct llama_model * llama_load_model_from_url(
  return NULL;
  }

- if (!llama_download_file(model_url, path_model, hf_token)) {
+ if (!common_download_file(model_url, path_model, hf_token)) {
  return NULL;
  }

@@ -1390,7 +1396,7 @@ struct llama_model * llama_load_model_from_url(
  char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
  llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

- return llama_download_file(split_url, split_path, hf_token);
+ return common_download_file(split_url, split_path, hf_token);
  }, idx));
  }

@@ -1405,7 +1411,7 @@ struct llama_model * llama_load_model_from_url(
  return llama_load_model_from_file(path_model, params);
  }

- struct llama_model * llama_load_model_from_hf(
+ struct llama_model * common_load_model_from_hf(
  const char * repo,
  const char * model,
  const char * path_model,
@@ -1425,12 +1431,12 @@ struct llama_model * llama_load_model_from_hf(
  model_url += "/resolve/main/";
  model_url += model;

- return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+ return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
  }

  #else

- struct llama_model * llama_load_model_from_url(
+ struct llama_model * common_load_model_from_url(
  const char * /*model_url*/,
  const char * /*path_model*/,
  const char * /*hf_token*/,
@@ -1439,7 +1445,7 @@ struct llama_model * llama_load_model_from_url(
  return nullptr;
  }

- struct llama_model * llama_load_model_from_hf(
+ struct llama_model * common_load_model_from_hf(
  const char * /*repo*/,
  const char * /*model*/,
  const char * /*path_model*/,
@@ -1455,11 +1461,11 @@ struct llama_model * llama_load_model_from_hf(
  // Batch utils
  //

- void llama_batch_clear(struct llama_batch & batch) {
+ void common_batch_clear(struct llama_batch & batch) {
  batch.n_tokens = 0;
  }

- void llama_batch_add(
+ void common_batch_add(
  struct llama_batch & batch,
  llama_token id,
  llama_pos pos,
@@ -1482,15 +1488,15 @@ void llama_batch_add(
  // Vocab utils
  //

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
  const std::string & text,
  bool add_special,
  bool parse_special) {
- return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+ return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
  }

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_model * model,
  const std::string & text,
  bool add_special,
@@ -1509,7 +1515,7 @@ std::vector<llama_token> llama_tokenize(
  return result;
  }

- std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::string piece;
  piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
  const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1525,7 +1531,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
  return piece;
  }

- std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
  std::string text;
  text.resize(std::max(text.capacity(), tokens.size()));
  int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -1545,15 +1551,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
  // Chat template utils
  //

- bool llama_chat_verify_template(const std::string & tmpl) {
+ bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
  int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
  return res >= 0;
  }

- std::string llama_chat_apply_template(const struct llama_model * model,
+ std::string common_chat_apply_template(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & msgs,
+ const std::vector<common_chat_msg> & msgs,
  bool add_ass) {
  int alloc_size = 0;
  bool fallback = false; // indicate if we must fallback to default chatml
@@ -1595,42 +1601,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
  return formatted_chat;
  }

- std::string llama_chat_format_single(const struct llama_model * model,
+ std::string common_chat_format_single(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & past_msg,
- const llama_chat_msg & new_msg,
+ const std::vector<common_chat_msg> & past_msg,
+ const common_chat_msg & new_msg,
  bool add_ass) {
  std::ostringstream ss;
- auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
- std::vector<llama_chat_msg> chat_new(past_msg);
+ auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+ std::vector<common_chat_msg> chat_new(past_msg);
  // if the past_msg ends with a newline, we must preserve it in the formatted version
  if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
  ss << "\n";
  };
  // format chat with new_msg
  chat_new.push_back(new_msg);
- auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+ auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
  // get the diff part
  ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
  return ss.str();
  }

- std::string llama_chat_format_example(const struct llama_model * model,
+ std::string common_chat_format_example(const struct llama_model * model,
  const std::string & tmpl) {
- std::vector<llama_chat_msg> msgs = {
+ std::vector<common_chat_msg> msgs = {
  {"system", "You are a helpful assistant"},
  {"user", "Hello"},
  {"assistant", "Hi there"},
  {"user", "How are you?"},
  };
- return llama_chat_apply_template(model, tmpl, msgs, true);
+ return common_chat_apply_template(model, tmpl, msgs, true);
  }

  //
  // KV cache utils
  //

- void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1653,7 +1659,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
  printf("\n=== Done dumping\n");
  }

- void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1705,7 +1711,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
  // Embedding utils
  //

- void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
  double sum = 0.0;

  switch (embd_norm) {
@@ -1739,7 +1745,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
  }
  }

- float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
  double sum = 0.0;
  double sum1 = 0.0;
  double sum2 = 0.0;
@@ -1765,8 +1771,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
  // Control vector utils
  //

- static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
- llama_control_vector_data result = { -1, {} };
+ static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+ common_control_vector_data result = { -1, {} };

  ggml_context * ctx = nullptr;
  struct gguf_init_params meta_gguf_params = {
@@ -1850,11 +1856,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
  return result;
  }

- llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
- llama_control_vector_data result = { -1, {} };
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+ common_control_vector_data result = { -1, {} };

  for (const auto & info : load_infos) {
- auto cur = llama_control_vector_load_one(info);
+ auto cur = common_control_vector_load_one(info);

  if (cur.n_embd == -1) {
  result.n_embd = -1;
@@ -1884,211 +1890,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
  return result;
  }

- //
- // YAML utils
- //
-
- void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
- if (data.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- fprintf(stream, "%s: [", prop_name);
- for (size_t i = 0; i < data.size() - 1; ++i) {
- fprintf(stream, "%e, ", data[i]);
- }
- fprintf(stream, "%e]\n", data.back());
- }
-
- void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
- if (data.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- fprintf(stream, "%s: [", prop_name);
- for (size_t i = 0; i < data.size() - 1; ++i) {
- fprintf(stream, "%d, ", data[i]);
- }
- fprintf(stream, "%d]\n", data.back());
- }
-
- void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
- std::string data_str(data == NULL ? "" : data);
-
- if (data_str.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- size_t pos_start = 0;
- size_t pos_found = 0;
-
- if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
- data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
- data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
- data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
- data_str = "\"" + data_str + "\"";
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
- return;
- }
-
- if (data_str.find('\n') == std::string::npos) {
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
- return;
- }
-
- fprintf(stream, "%s: |\n", prop_name);
- while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
- fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
- pos_start = pos_found + 1;
- }
- }
-
- void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
- const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
- const auto & sparams = params.sparams;
-
- fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
- fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
- fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
- fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
- fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
- fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
- fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
- fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
- fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
- fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
- fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
- fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
- fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
- fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
- fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
- fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
- fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
- fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
- fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
- #ifdef NDEBUG
- fprintf(stream, "debug: false\n");
- #else
- fprintf(stream, "debug: true\n");
- #endif // NDEBUG
-
- fprintf(stream, "model_desc: %s\n", model_desc);
- fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
- #ifdef __OPTIMIZE__
- fprintf(stream, "optimize: true\n");
- #else
- fprintf(stream, "optimize: false\n");
- #endif // __OPTIMIZE__
-
- fprintf(stream, "time: %s\n", timestamp.c_str());
-
- fprintf(stream, "\n");
- fprintf(stream, "###############\n");
- fprintf(stream, "# User Inputs #\n");
- fprintf(stream, "###############\n");
- fprintf(stream, "\n");
-
- fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
- fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
- fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
- fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
- fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
- fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
- fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
- fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
- yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
- fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
- fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
- fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
- fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
-
- yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
- fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
- yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
- fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
- fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
- fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
- fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
- fprintf(stream, "logit_bias:\n");
- for (const auto & logit_bias : sparams.logit_bias) {
- fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
- }
-
- fprintf(stream, "lora:\n");
- for (auto & la : params.lora_adapters) {
- if (la.scale == 1.0f) {
- fprintf(stream, " - %s\n", la.path.c_str());
- }
- }
- fprintf(stream, "lora_scaled:\n");
- for (auto & la : params.lora_adapters) {
- if (la.scale != 1.0f) {
- fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
- }
- }
- fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
- fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
- fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
- fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
- fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
- fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
- fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
- fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
- fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
- fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
- fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
- fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
- fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
- fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
- fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
- fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
- fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
- fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
- yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
- fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
- fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
- fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
- yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
- fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
-
- fprintf(stream, "reverse_prompt:\n");
- for (std::string ap : params.antiprompt) {
- size_t pos = 0;
- while ((pos = ap.find('\n', pos)) != std::string::npos) {
- ap.replace(pos, 1, "\\n");
- pos += 1;
- }
-
- fprintf(stream, " - %s\n", ap.c_str());
- }
-
- fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
- fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
- fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
- fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
- fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
- fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-
- const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
- yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
-
- fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
- fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
- fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
- fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
- fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
- fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
- fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
- fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
- }
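Beyond the renames, two small functional changes stand out in this file: string_split(std::string, char) is replaced by a printf-style string_format helper, and kv_cache_type_from_str gains a "bf16" mapping to GGML_TYPE_BF16 (with the error message reworded to "Unsupported cache type"). The entire YAML utils block (yaml_dump_*) is removed outright, which accounts for most of the file's -281 lines. Below is a minimal usage sketch for the new helper, not taken from the package; it assumes only the signature shown in the hunk at @@ -400,17 +401,19 @@ and the vendored common.h.

#include <cstdio>
#include <string>

#include "common.h" // vendored header: package/src/llama.cpp/common/common.h

int main() {
    // string_format measures with one vsnprintf pass and writes with a second,
    // returning a std::string (see the implementation in the hunk above)
    const std::string msg = string_format("n_ctx = %d, temp = %.2f", 4096, 0.80);
    std::printf("%s\n", msg.c_str());
    return 0;
}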