@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
package/src/llama.cpp/common/common.h

@@ -24,12 +24,12 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

-struct
+struct common_lora_adapter_info {
     std::string path;
     float scale;
 };

-struct
+struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };

@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;

-struct
+struct common_control_vector_load_info;

 //
 // CPU utils
@@ -82,14 +82,17 @@ enum llama_example {
     LLAMA_EXAMPLE_COUNT,
 };

-enum
-
-
-
-
-
-
-
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE        = 0,
+    COMMON_SAMPLER_TYPE_DRY         = 1,
+    COMMON_SAMPLER_TYPE_TOP_K       = 2,
+    COMMON_SAMPLER_TYPE_TOP_P       = 3,
+    COMMON_SAMPLER_TYPE_MIN_P       = 4,
+    //COMMON_SAMPLER_TYPE_TFS_Z     = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC         = 8,
+    COMMON_SAMPLER_TYPE_INFILL      = 9,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -99,38 +102,47 @@ enum dimre_method {
 };

 // sampler parameters
-struct
+struct common_sampler_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

-    int32_t n_prev
-    int32_t n_probs
-    int32_t min_keep
-    int32_t top_k
-    float top_p
-    float min_p
-    float
-    float
-    float
-    float
-    float
-
-
-    float
-    float
-
-    float
-    float
-
-
-
-
-
-
-
-
-
-
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float xtc_probability = 0.00f; // 0.0 = disabled
+    float xtc_threshold = 0.10f; // > 0.5 disables XTC
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
+    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
+
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
     };

     std::string grammar; // optional BNF-like grammar to constrain sampling
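The hunk above renames `gpt_sampler_params` to `common_sampler_params` and adds the DRY/XTC fields. A minimal sketch of how the renamed struct might be wired into the sampler chain; the `common_sampler_*` calls are assumed to come from `common/sampling.h` (renamed the same way in this release) and the surrounding function is hypothetical, so verify against that header:

```cpp
// Hypothetical sketch, not part of the diff: build a sampler from the renamed params.
#include "common.h"
#include "sampling.h"   // assumed: declares common_sampler_init/_sample/_free

llama_token sample_one(llama_model * model, llama_context * ctx) {
    common_sampler_params sparams;            // defaults as listed in the hunk above
    sparams.temp  = 0.7f;                     // override a couple of fields
    sparams.top_k = 50;
    sparams.samplers = {                      // shorten the default sampler chain
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

    common_sampler * smpl = common_sampler_init(model, sparams);
    const llama_token id  = common_sampler_sample(smpl, ctx, -1); // sample from the last logits
    common_sampler_free(smpl);
    return id;
}
```

Fields left untouched keep the defaults shown in the hunk (for example `dry_multiplier = 0.0f`, i.e. DRY disabled).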
@@ -141,9 +153,9 @@ struct gpt_sampler_params {
     std::string print() const;
 };

-struct
+struct common_params {
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx =
+    int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -166,7 +178,7 @@ struct gpt_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold =
+    float defrag_thold = 0.1f; // KV cache defragmentation threshold

     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -183,7 +195,7 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct
+    struct common_sampler_params sparams;

     std::string model = ""; // model path // NOLINT
     std::string model_draft = ""; // draft model for speculative decoding // NOLINT
@@ -197,7 +209,6 @@ struct gpt_params {
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
     std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
-    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
@@ -208,9 +219,9 @@ struct gpt_params {
     std::vector<llama_model_kv_override> kv_overrides;

     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

-    std::vector<
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -268,21 +279,21 @@ struct gpt_params {

     // embedding
     bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep = "\n"; // separator of
+    std::string embd_sep = "\n"; // separator of embeddings
     bool reranking = false; // enable reranking support on server

     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting

     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;

     std::vector<std::string> api_keys;
@@ -290,7 +301,10 @@ struct gpt_params {
     std::string ssl_file_key = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT

-
+    // "advanced" endpoints are disabled by default for better security
+    bool webui = true;
+    bool endpoint_slots = false;
+    bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;

     bool log_json = false;
@@ -345,20 +359,31 @@ struct gpt_params {

 // call once at the start of a program if it uses libcommon
 // initializes the logging system and prints info about the build
-void
+void common_init();

-std::string
+std::string common_params_get_system_info(const common_params & params);

-bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);

 //
 // String utils
 //

-
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
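The block above introduces the `LLAMA_COMMON_ATTRIBUTE_FORMAT` macro and a printf-style `string_format` helper. A small illustration; the `print_progress` wrapper and its values are made up for the example:

```cpp
// Illustrative use of the string_format declaration shown above.
#include <cstdio>
#include <string>
#include "common.h"   // assumed: provides string_format

void print_progress(int n_done, int n_total) {
    // LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) lets GCC/Clang check the arguments
    // against the format string at compile time.
    std::string line = string_format("progress: %d/%d (%.1f%%)",
                                     n_done, n_total, 100.0 * n_done / n_total);
    std::fprintf(stderr, "%s\n", line.c_str());
}
```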
@@ -367,6 +392,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::

 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
     std::vector<T> values;
     std::istringstream str_stream(str);
     std::string token;
@@ -379,6 +405,22 @@ static std::vector<T> string_split(const std::string & str, char delim) {
     return values;
 }

+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

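The new `string_split<std::string>` specialization above splits on a single character without going through `std::istringstream`, and the generic template now rejects `std::string` via `static_assert`. An illustrative call (the input values are arbitrary), showing that empty fields are preserved:

```cpp
// Usage sketch for the specialization added in the hunk above.
#include <string>
#include <vector>
#include "common.h"   // assumed: provides string_split<std::string>

int main() {
    std::vector<std::string> parts = string_split<std::string>("a,b,,c", ',');
    // parts == {"a", "b", "", "c"} -- the empty field between the two commas is kept
    return parts.size() == 4 ? 0 : 1;
}
```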
@@ -401,29 +443,29 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

-struct
+struct common_init_result {
     struct llama_model * model = nullptr;
     struct llama_context * context = nullptr;
-    std::vector<
+    std::vector<common_lora_adapter_container> lora_adapters;
 };

-struct
+struct common_init_result common_init_from_params(common_params & params);

-struct llama_model_params
-struct llama_context_params
+struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model *
-struct llama_model *
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
-void
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

 // Batch utils

-void
+void common_batch_clear(struct llama_batch & batch);

-void
+void common_batch_add(
     struct llama_batch & batch,
     llama_token id,
     llama_pos pos,
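The hunk above renames the model/context initialization helpers to `common_init_result` / `common_init_from_params`. A hedged sketch of the new call sequence (the `run` wrapper, model path, and parameter values are placeholders, not taken from the diff):

```cpp
// Hypothetical sketch: load a model and context through the renamed init helpers.
#include "common.h"

int run() {
    common_init();                                       // initialize logging (declared earlier in this header)

    common_params params;
    params.model     = "models/7B/ggml-model-f16.gguf";  // placeholder path
    params.n_ctx     = 8192;
    params.n_predict = 128;

    common_init_result init = common_init_from_params(params);
    if (init.model == nullptr || init.context == nullptr) {
        return 1;                                        // loading failed
    }

    // ... decode with init.context ...

    llama_free(init.context);
    llama_free_model(init.model);
    return 0;
}
```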
@@ -436,13 +478,13 @@ void llama_batch_add(

 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
     bool add_special,
     bool parse_special = false);

-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
     const std::string & text,
     bool add_special,
@@ -450,7 +492,7 @@ std::vector<llama_token> llama_tokenize(

 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string
+std::string common_token_to_piece(
     const struct llama_context * ctx,
     llama_token token,
     bool special = true);
@@ -458,7 +500,7 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string
+std::string common_detokenize(
     llama_context * ctx,
     const std::vector<llama_token> & tokens,
     bool special = true);
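The three hunks above rename the tokenizer helpers (`llama_tokenize`, `llama_token_to_piece`, `llama_detokenize` per the hunk context lines) to their `common_*` counterparts. A round-trip sketch using the declarations as shown; the `dump_tokens` wrapper and its inputs are illustrative:

```cpp
// Tokenize, print each piece, then detokenize, using the renamed helpers above.
#include <cstdio>
#include <string>
#include <vector>
#include "common.h"

void dump_tokens(llama_context * ctx, const std::string & text) {
    // add_special = true lets the vocab prepend BOS/etc. as the model expects
    std::vector<llama_token> tokens = common_tokenize(ctx, text, /*add_special=*/true);

    for (llama_token t : tokens) {
        std::printf("%6d -> '%s'\n", t, common_token_to_piece(ctx, t).c_str());
    }

    // detokenizing should reproduce the original text (modulo special tokens)
    std::printf("detokenized: %s\n", common_detokenize(ctx, tokens).c_str());
}
```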
@@ -468,31 +510,31 @@ std::string llama_detokenize(
 //

 // same with llama_chat_message, but uses std::string
-struct
+struct common_chat_msg {
     std::string role;
     std::string content;
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool
+bool common_chat_verify_template(const std::string & tmpl);

 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string
+std::string common_chat_apply_template(const struct llama_model * model,
     const std::string & tmpl,
-    const std::vector<
+    const std::vector<common_chat_msg> & chat,
     bool add_ass);

 // Format single message, while taking into account the position of that message in chat history
-std::string
+std::string common_chat_format_single(const struct llama_model * model,
     const std::string & tmpl,
-    const std::vector<
-    const
+    const std::vector<common_chat_msg> & past_msg,
+    const common_chat_msg & new_msg,
     bool add_ass);

 // Returns an example of formatted chat
-std::string
+std::string common_chat_format_example(const struct llama_model * model,
     const std::string & tmpl);

 //
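The hunk above renames the chat-template helpers and the `common_chat_msg` struct. A short sketch of applying a template with the signature as declared; the messages are illustrative, and passing an empty `tmpl` falls back to the model's built-in template (or chatml) per the header comments:

```cpp
// Build a prompt from a chat history using the renamed helper declared above.
#include <string>
#include <vector>
#include "common.h"

std::string build_prompt(const llama_model * model) {
    std::vector<common_chat_msg> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "What is the capital of France?" },
    };
    // add_ass = true appends the assistant prefix so generation continues as the assistant
    return common_chat_apply_template(model, /*tmpl=*/"", chat, /*add_ass=*/true);
}
```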
@@ -500,31 +542,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //

 // Dump the KV cache view with the number of sequences per cell.
-void
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
 //

-void
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

-float
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

 //
 // Control vector utils
 //

-struct
+struct common_control_vector_data {
     int n_embd;

     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };

-struct
+struct common_control_vector_load_info {
     float strength;

     std::string fname;
@@ -532,7 +574,7 @@ struct llama_control_vector_load_info {

 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);

 //
 // Split utils
@@ -541,15 +583,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);