@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/common/common.h
@@ -4,18 +4,9 @@
 
 #include "llama.h"
 
-#include "sampling.h"
-
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
-#include <cmath>
 #include <string>
 #include <vector>
-#include <random>
-#include <thread>
-#include <unordered_map>
-#include <tuple>
+#include <sstream>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -33,6 +24,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -45,26 +45,103 @@ struct llama_control_vector_load_info;
 // CPU utils
 //
 
+struct cpu_params {
+    int n_threads = -1;
+    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool mask_valid = false; // Default: any CPU
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false; // Use strict CPU placement
+    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
 
 //
-// CLI argument parsing
+// Common params
 //
 
+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE = 0,
+    GPT_SAMPLER_TYPE_TOP_K = 1,
+    GPT_SAMPLER_TYPE_TOP_P = 2,
+    GPT_SAMPLER_TYPE_MIN_P = 3,
+    GPT_SAMPLER_TYPE_TFS_Z = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };
 
-struct gpt_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
+// sampler parameters
+struct gpt_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float tfs_z = 1.00f; // 1.0 = disabled
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
+
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
+};
 
-    int32_t n_threads = cpu_get_num_math();
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
+struct gpt_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
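
For orientation (this note and the snippet below are not part of the diff): the llama_sampling_params struct that 0.3.0 exposed through common.h is superseded by gpt_sampler_params above. A minimal sketch of configuring the new struct, using only field and enumerator names visible in this hunk; the values are arbitrary:

    #include "common.h" // declares gpt_sampler_params and gpt_sampler_type (see hunk above)

    // Illustrative only: choose a sampler chain and an optional grammar.
    static gpt_sampler_params make_sampler_params() {
        gpt_sampler_params sparams;
        sparams.seed     = 1234;
        sparams.top_k    = 40;
        sparams.top_p    = 0.95f;
        sparams.temp     = 0.8f;
        sparams.samplers = {
            GPT_SAMPLER_TYPE_TOP_K,
            GPT_SAMPLER_TYPE_TOP_P,
            GPT_SAMPLER_TYPE_TEMPERATURE,
        };
        sparams.grammar = "root ::= \"yes\" | \"no\""; // optional BNF-like grammar
        return sparams;
    }
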
@@ -91,6 +168,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
 
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
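
Aside (not part of the diff): the scalar n_threads / n_threads_draft / n_threads_batch fields removed in the previous hunk are replaced by these four cpu_params blocks. A hedged sketch of the equivalent thread configuration, using only names declared in this header:

    // Sketch only: "8 threads for generation, 16 for batch processing"
    // after the cpu_params split (was params.n_threads / params.n_threads_batch).
    static void configure_threads(gpt_params & params) {
        params.cpuparams.n_threads       = 8;
        params.cpuparams_batch.n_threads = 16; // -1 keeps the default, resolved later by postprocess_cpu_params
        params.cpuparams.priority        = GGML_SCHED_PRIO_NORMAL;
    }
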
@@ -101,33 +183,32 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    // // sampling parameters
-    struct llama_sampling_params sparams;
-
-    std::string model = ""; // model path
-    std::string model_draft = ""; // draft model for speculative decoding
-    std::string model_alias = "unknown"; // model alias
-    std::string model_url = ""; // model url to download
-    std::string hf_token = ""; // HF token
-    std::string hf_repo = ""; // HF repo
-    std::string hf_file = ""; // HF file
-    std::string prompt = "";
-    std::string prompt_file = ""; // store the external prompt file name
-    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix = ""; // string to prefix user inputs with
-    std::string input_suffix = ""; // string to suffix user inputs with
-    std::string logdir = ""; // directory in which to save YAML log files
-    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file = ""; // file for saving *all* logits
-    std::string rpc_servers = ""; // comma separated list of RPC servers
+    struct gpt_sampler_params sparams;
+
+    std::string model = ""; // model path // NOLINT
+    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+    std::string model_alias = "unknown"; // model alias // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+    std::string hf_token = ""; // HF token // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+    std::string prompt = ""; // NOLINT
+    std::string prompt_file = ""; // store the external prompt file name // NOLINT
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
+    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+    std::string logits_file = ""; // file for saving *all* logits // NOLINT
+    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
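
Aside (not part of the diff): LoRA adapters are now plain llama_lora_adapter_info records rather than (path, scale) tuples, and lora_init_without_apply lets a caller load adapters without attaching them to the context. A sketch under those declarations, with a hypothetical adapter path:

    static void add_lora(gpt_params & params) {
        // 0.3.0 used std::vector<std::tuple<std::string, float>> lora_adapter
        llama_lora_adapter_info lora;
        lora.path  = "adapters/my-style.gguf"; // hypothetical file name
        lora.scale = 0.75f;
        params.lora_adapters.push_back(lora);

        // load only; attach later via llama_lora_adapters_apply (declared further down)
        params.lora_init_without_apply = true;
    }
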
@@ -164,15 +245,15 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics
+    bool ctx_shift = true; // context shift on inifinite text generation
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
@@ -182,7 +263,7 @@ struct gpt_params {
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
+    std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -190,23 +271,24 @@ struct gpt_params {
     int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embendings
+    bool reranking = false; // enable reranking support on server
 
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1; // number of threads to process HTTP requests
+    int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname = "127.0.0.1";
-    std::string public_path = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path = ""; // NOLINT
+    std::string chat_template = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
 
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
-    std::string ssl_file_key = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT
 
     bool endpoint_slots = true;
     bool endpoint_metrics = false;
@@ -256,18 +338,22 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-};
 
-void gpt_params_handle_hf_token(gpt_params & params);
-void gpt_params_handle_model_default(gpt_params & params);
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
+};
 
-bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
-bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void gpt_init();
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
+
 //
 // String utils
 //
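
Aside (not part of the diff): the gpt_params_parse* family leaves common.h entirely (argument parsing now lives in the new common/arg.h and common/arg.cpp listed among the changed files), gpt_init() becomes the one-time setup call, and the new CPU helpers turn a user-supplied mask into the affinity array. A sketch using only declarations from this hunk; the "0xFF" mask string is illustrative:

    int main() {
        gpt_init(); // initializes logging and prints build info (per the comment above)

        gpt_params params;
        // CPU affinity mask string -> bool[GGML_MAX_N_THREADS]
        if (!parse_cpu_mask("0xFF", params.cpuparams.cpumask)) {
            return 1;
        }
        params.cpuparams.mask_valid = true;
        postprocess_cpu_params(params.cpuparams);        // fill remaining defaults
        set_process_priority(params.cpuparams.priority); // best-effort scheduling priority
        return 0;
    }
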
@@ -277,6 +363,8 @@ std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
@@ -294,6 +382,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
 //
 // Filesystem utils
 //
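
Aside (not part of the diff): the string_from overloads provide one helper for rendering values, token lists, and batches when logging, replacing the ad-hoc dump helpers from the old common/log.h. A small sketch; the printed format is not guaranteed:

    #include <cstdio>
    #include "common.h" // string_from declarations from the hunk above

    static void dump_prompt(const llama_context * ctx, const std::vector<llama_token> & prompt) {
        const std::string s = string_from(ctx, prompt); // rendered token list
        fprintf(stderr, "prompt tokens: %s\n", s.c_str());
    }
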
@@ -308,15 +401,24 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
-struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
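
Aside (not part of the diff): llama_init_from_gpt_params no longer returns a std::tuple; callers receive a llama_init_result and, when lora_init_without_apply is set, can attach the adapters themselves. A sketch of the adjusted call site, assuming llama_free / llama_free_model from llama.h are still available as in earlier releases:

    static int run(gpt_params & params) {
        // 0.3.0: std::tie(model, ctx) = llama_init_from_gpt_params(params);
        llama_init_result init = llama_init_from_gpt_params(params);
        if (init.model == nullptr || init.context == nullptr) {
            return 1;
        }

        // attach adapters now if they were only loaded (lora_init_without_apply)
        llama_lora_adapters_apply(init.context, init.lora_adapters);

        // ... generation ...

        llama_free(init.context);
        llama_free_model(init.model);
        return 0;
    }
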
@@ -361,10 +463,6 @@ std::string llama_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //
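
Aside (not part of the diff): the llama_should_add_bos_token wrapper is removed; callers are expected to read the BOS setting from the model's vocab metadata instead, presumably via llama_add_bos_token from llama.h (treat that exact name as an assumption based on upstream headers of this period):

    // Hedged sketch: query BOS handling straight from the model metadata.
    static bool prompt_needs_bos(const llama_model * model) {
        return llama_add_bos_token(model); // previously llama_should_add_bos_token(model)
    }
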
package/src/llama.cpp/common/console.cpp
@@ -94,6 +94,9 @@ namespace console {
             simple_io = true;
         }
     }
+    if (simple_io) {
+        _setmode(_fileno(stdin), _O_U8TEXT);
+    }
 #else
     // POSIX-specific console initialization
     if (!simple_io) {