@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/common/common.h
@@ -0,0 +1,324 @@
+ // Various helper functions and utilities
+
+ #pragma once
+
+ #include "llama.h"
+
+ #include "sampling.h"
+
+ #define LOG_NO_FILE_LINE_FUNCTION
+ #include "log.h"
+
+ #include <cmath>
+ #include <string>
+ #include <vector>
+ #include <random>
+ #include <thread>
+ #include <unordered_map>
+ #include <tuple>
+
+ #ifdef _WIN32
+ #define DIRECTORY_SEPARATOR '\\'
+ #else
+ #define DIRECTORY_SEPARATOR '/'
+ #endif // _WIN32
+
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+ #define print_build_info() do { \
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
+ } while(0)
+
+ // build info
+ extern int LLAMA_BUILD_NUMBER;
+ extern char const *LLAMA_COMMIT;
+ extern char const *LLAMA_COMPILER;
+ extern char const *LLAMA_BUILD_TARGET;
+
+ struct llama_control_vector_load_info;
+
+ int get_math_cpu_count();
+ int32_t get_num_physical_cores();
+
+ //
+ // CLI argument parsing
+ //
+
+ struct gpt_params {
+ uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
+
+ int32_t n_threads = get_math_cpu_count();
+ int32_t n_threads_draft = -1;
+ int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+ int32_t n_threads_batch_draft = -1;
+ int32_t n_predict = -1; // new tokens to predict
+ int32_t n_ctx = 512; // context size
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
+ int32_t n_draft = 5; // number of tokens to draft during speculative decoding
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+ int32_t n_parallel = 1; // number of parallel sequences to decode
+ int32_t n_sequences = 1; // number of sequences to decode
+ float p_split = 0.1f; // speculative decoding split probability
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+ llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+ int32_t n_beams = 0; // if non-zero then use beam search of given width.
+ int32_t grp_attn_n = 1; // group-attention factor
+ int32_t grp_attn_w = 512; // group-attention width
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+ float rope_freq_base = 0.0f; // RoPE base frequency
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+ float yarn_beta_fast = 32.0f; // YaRN low correction dim
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
+ float defrag_thold = -1.0f; // KV cache defragmentation threshold
+
+ ggml_backend_sched_eval_callback cb_eval = nullptr;
+ void * cb_eval_user_data = nullptr;
+
+ ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
+
+ llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+ llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+
+ // sampling parameters
+ struct llama_sampling_params sparams;
+
+ std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+ std::string model_draft = ""; // draft model for speculative decoding
+ std::string model_alias = "unknown"; // model alias
+ std::string model_url = ""; // model url to download
+ std::string hf_repo = ""; // HF repo
+ std::string hf_file = ""; // HF file
+ std::string prompt = "";
+ std::string prompt_file = ""; // store the external prompt file name
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+ std::string input_prefix = ""; // string to prefix user inputs with
+ std::string input_suffix = ""; // string to suffix user inputs with
+ std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+ std::string logdir = ""; // directory in which to save YAML log files
+ std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
+ std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
+ std::string logits_file = ""; // file for saving *all* logits
+
+ std::vector<llama_model_kv_override> kv_overrides;
+
+ // TODO: avoid tuple, use struct
+ std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+ std::string lora_base = ""; // base model path for the lora adapter
+
+ std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+ int32_t control_vector_layer_start = -1; // layer range for control vector
+ int32_t control_vector_layer_end = -1; // layer range for control vector
+
+ int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+ int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+ // (which is more convenient to use for plotting)
+ //
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+ size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
+ bool kl_divergence = false; // compute KL-divergence
+
+ bool random_prompt = false; // do not randomize prompt if none provided
+ bool use_color = false; // use color to distinguish generations and inputs
+ bool interactive = false; // interactive mode
+ bool chatml = false; // chatml mode (used for models trained on chatml syntax)
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
+
+ bool embedding = false; // get only sentence embedding
+ bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+ bool interactive_first = false; // wait for user input immediately
+ bool multiline_input = false; // reverse the usage of `\`
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
+
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
+ bool ignore_eos = false; // ignore generated EOS tokens
+ bool instruct = false; // instruction mode (used for Alpaca models)
+ bool logits_all = false; // return logits for all tokens in the batch
+ bool use_mmap = true; // use mmap for faster loads
+ bool use_mlock = false; // use mlock to keep model in memory
+ bool verbose_prompt = false; // print prompt tokens before generation
+ bool display_prompt = true; // print prompt before generation
+ bool infill = false; // use infill mode
+ bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
+ bool no_kv_offload = false; // disable KV offloading
+ bool warmup = true; // warmup run
+
+ std::string cache_type_k = "f16"; // KV cache data type for the K
+ std::string cache_type_v = "f16"; // KV cache data type for the V
+
+ // multimodal models (see examples/llava)
+ std::string mmproj = ""; // path to multimodal projector
+ std::string image = ""; // path to an image file
+ };
+
+ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+
+ std::string get_system_info(const gpt_params & params);
+
+ std::string gpt_random_prompt(std::mt19937 & rng);
+
+ void process_escapes(std::string& input);
+
+ bool validate_file_name(const std::string & filename);
+
+ //
+ // String utils
+ //
+
+ std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+ std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
+ std::vector<std::string> string_split(std::string input, char separator);
+ std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+
+ //
+ // Model utils
+ //
+
+ // TODO: avoid tuple, use struct
+ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+
+ struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+
+ struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
+ struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+
+ // Batch utils
+
+ void llama_batch_clear(struct llama_batch & batch);
+
+ void llama_batch_add(
+ struct llama_batch & batch,
+ llama_token id,
+ llama_pos pos,
+ const std::vector<llama_seq_id> & seq_ids,
+ bool logits);
+
+ //
+ // Vocab utils
+ //
+
+ // tokenizes a string into a vector of tokens
+ // should work similar to Python's `tokenizer.encode`
+ std::vector<llama_token> llama_tokenize(
+ const struct llama_context * ctx,
+ const std::string & text,
+ bool add_special,
+ bool parse_special = false);
+
+ std::vector<llama_token> llama_tokenize(
+ const struct llama_model * model,
+ const std::string & text,
+ bool add_special,
+ bool parse_special = false);
+
+ // tokenizes a token into a piece
+ // should work similar to Python's `tokenizer.id_to_piece`
+ std::string llama_token_to_piece(
+ const struct llama_context * ctx,
+ llama_token token);
+
+ // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+ // that takes into account the tokenizer type and decides how to handle the leading space
+ //
+ // detokenizes a vector of tokens into a string
+ // should work similar to Python's `tokenizer.decode`
+ // removes the leading space from the first non-BOS token
+ std::string llama_detokenize_spm(
+ llama_context * ctx,
+ const std::vector<llama_token> & tokens);
+
+ // detokenizes a vector of tokens into a string
+ // should work similar to Python's `tokenizer.decode`
+ std::string llama_detokenize_bpe(
+ llama_context * ctx,
+ const std::vector<llama_token> & tokens);
+
+ // Uses the value from the model metadata if possible, otherwise
+ // defaults to true when model type is SPM, otherwise false.
+ bool llama_should_add_bos_token(const llama_model * model);
+
+ //
+ // YAML utils
+ //
+
+ bool create_directory_with_parents(const std::string & path);
+ void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
+ void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
+ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
+ std::string get_sortable_timestamp();
+
+ void dump_non_result_info_yaml(
+ FILE * stream, const gpt_params & params, const llama_context * lctx,
+ const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
+ //
+ // KV cache utils
+ //
+
+ // Dump the KV cache view with the number of sequences per cell.
+ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+
+ // Dump the KV cache view showing individual sequences in each cell (long output).
+ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+ //
+ // Embedding utils
+ //
+
+ void llama_embd_normalize(const float * inp, float * out, int n);
+
+ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+
+ //
+ // Control vector utils
+ //
+
+ struct llama_control_vector_data {
+ int n_embd;
+
+ // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+ std::vector<float> data;
+ };
+
+ struct llama_control_vector_load_info {
+ float strength;
+
+ std::string fname;
+ };
+
+ // Load control vectors, scale each by strength, and add them together.
+ // On error, returns {-1, empty}
+ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+
+ //
+ // Split utils
+ //
+ static const char * const LLM_KV_SPLIT_NO = "split.no";
+ static const char * const LLM_KV_SPLIT_COUNT = "split.count";
+ static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
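
For context only, the sketch below shows one typical way the helpers declared in common.h above fit together: gpt_params for configuration, gpt_params_parse for CLI parsing, llama_init_from_gpt_params for model/context creation, and llama_tokenize / llama_token_to_piece for vocab round-trips. It is a minimal, hypothetical example, not a file in this package; it assumes the declarations exactly as shown plus llama_free and llama_free_model from llama.h, and the prompt value is a placeholder.

// sketch.cpp - illustrative only, not shipped in @fugood/llama.node
#include "common.h"

#include <cstdio>
#include <cstdlib>
#include <tuple>
#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;               // defaults come from the struct declared above
    params.prompt = "Hello";         // placeholder prompt; usually set via CLI args

    if (!gpt_params_parse(argc, argv, params)) {
        gpt_print_usage(argc, argv, params);
        return 1;
    }

    // llama_init_from_gpt_params returns {model, context}; both are nullptr on failure
    llama_model   * model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        die("failed to load the model");
    }

    // tokenize the prompt (add_special = true so the model's BOS handling applies)
    std::vector<llama_token> tokens = llama_tokenize(ctx, params.prompt, true);

    // print each token id together with its text piece
    for (const llama_token tok : tokens) {
        fprintf(stderr, "%6d -> '%s'\n", tok, llama_token_to_piece(ctx, tok).c_str());
    }

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}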