@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
package/src/llama.cpp/common/sampling.cpp

@@ -98,8 +98,8 @@ struct ring_buffer {
     std::vector<T> data;
 };
 
-struct gpt_sampler {
-    gpt_sampler_params params;
+struct common_sampler {
+    common_sampler_params params;
 
     struct llama_sampler * grmr;
     struct llama_sampler * chain;
@@ -125,26 +125,28 @@ struct gpt_sampler {
     }
 };
 
-std::string gpt_sampler_params::print() const {
+std::string common_sampler_params::print() const {
    char result[1024];
 
    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
-            top_k, tfs_z, top_p, min_p, typ_p, temp,
+            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
            mirostat, mirostat_eta, mirostat_tau);
 
    return std::string(result);
 }
 
-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
    lparams.no_perf = params.no_perf;
 
-    auto * result = new gpt_sampler {
+    auto * result = new common_sampler {
        /* .params = */ params,
        /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
        /* .chain = */ llama_sampler_chain_init(lparams),
@@ -171,60 +173,60 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
                params.penalize_nl,
                params.ignore_eos));
 
-    if (params.temp > 0.0f) {
-        if (params.mirostat == 0) {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case GPT_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+    if (params.mirostat == 0) {
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char*> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto& str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
+                        }
+
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
                     break;
-                    case GPT_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TFS_Z:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
            }
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
-        } else if (params.mirostat == 1) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
-        } else if (params.mirostat == 2) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
-        } else {
-            GGML_ASSERT(false && "unknown mirostat version");
        }
+        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+    } else if (params.mirostat == 1) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+    } else if (params.mirostat == 2) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
-        if (params.n_probs > 0) {
-            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
-            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
-            //
-            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
-            // it is much faster, since we avoid sorting all tokens and should give a good approximation
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
+        GGML_ASSERT(false && "unknown mirostat version");
    }
 
    return result;
 }
 
-void gpt_sampler_free(struct gpt_sampler * gsmpl) {
+void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);
 
@@ -234,7 +236,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
    }
 }
 
-void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }
@@ -244,14 +246,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
    gsmpl->prev.push_back(token);
 }
 
-void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
+void common_sampler_reset(struct common_sampler * gsmpl) {
    llama_sampler_reset(gsmpl->grmr);
 
    llama_sampler_reset(gsmpl->chain);
 }
 
-struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
-    return new gpt_sampler {
+struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
+    return new common_sampler {
        /* .params = */ gsmpl->params,
        /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain = */ llama_sampler_clone(gsmpl->chain),
@@ -261,7 +263,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
    };
 }
 
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
    // TODO: measure grammar performance
 
    if (gsmpl) {
@@ -272,7 +274,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
    }
 }
 
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    gsmpl->set_logits(ctx, idx);
 
    auto & grmr = gsmpl->grmr;
@@ -318,21 +320,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
    return cur_p.data[cur_p.selected].id;
 }
 
-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
    return llama_sampler_get_seed(gsmpl->chain);
 }
 
 // helpers
 
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
    return &gsmpl->cur_p;
 }
 
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
+llama_token common_sampler_last(const struct common_sampler * gsmpl) {
    return gsmpl->prev.rat(0);
 }
 
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
+std::string common_sampler_print(const struct common_sampler * gsmpl) {
    std::string result = "logits ";
 
    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
@@ -343,7 +345,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
    return result;
 }
 
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
    n = std::min(n, (int) gsmpl->prev.size());
 
    if (n <= 0) {
@@ -358,63 +360,67 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
 
        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
 
-        result += llama_token_to_piece(ctx_main, id);
+        result += common_token_to_piece(ctx_main, id);
    }
 
    return result;
 }
 
-char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
+char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
-        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
-        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
-        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_DRY:         return 'd';
+        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
+        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
        default : return '?';
    }
 }
 
-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
-        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
-        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_DRY:         return "dry";
+        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
+        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
        default : return "";
    }
 }
 
-std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
-        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
+std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+        { "dry",         COMMON_SAMPLER_TYPE_DRY },
+        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
+        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
    };
 
    // since samplers names are written multiple ways
    // make it ready for both system names and input names
-    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
-        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
-        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
-        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
+    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
+        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
    };
 
-    std::vector<gpt_sampler_type> samplers;
+    std::vector<common_sampler_type> samplers;
    samplers.reserve(names.size());
 
    for (const auto & name : names) {
@@ -434,17 +440,19 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
    return samplers;
 }
 
-std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE },
+std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, common_sampler_type> sampler_name_map = {
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
    };
 
-    std::vector<gpt_sampler_type> samplers;
+    std::vector<common_sampler_type> samplers;
    samplers.reserve(chars.size());
 
    for (const auto & c : chars) {
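The sampling.cpp hunks above are dominated by the upstream rename of the `gpt_`-prefixed sampling helpers to `common_` ones, plus the new DRY, XTC and infill samplers wired into the chain. As a rough, non-authoritative sketch of what the rename means for code built against 0.3.2 — the `next_token()` wrapper itself is hypothetical, not part of llama.node; only the `common_*`/`gpt_*` names are taken from the hunks above:

```cpp
// Migration sketch only: next_token() is a hypothetical helper.
#include "sampling.h" // vendored at package/src/llama.cpp/common/sampling.h

llama_token next_token(struct common_sampler * smpl, struct llama_context * ctx, int idx) {
    // was: llama_token id = gpt_sampler_sample(smpl, ctx, idx, false);
    const llama_token id = common_sampler_sample(smpl, ctx, idx, /* grammar_first = */ false);

    // was: gpt_sampler_accept(smpl, id, true);
    common_sampler_accept(smpl, id, /* accept_grammar = */ true);

    return id;
}
```

The lifecycle calls rename the same way: `gpt_sampler_init`/`gpt_sampler_free` become `common_sampler_init`/`common_sampler_free`.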
package/src/llama.cpp/common/sampling.h

@@ -7,7 +7,7 @@
 #include <string>
 #include <vector>
 
-// gpt_sampler extends llama_sampler with additional functionality:
+// common_sampler extends llama_sampler with additional functionality:
 //
 // - grammar support
 // - custom sampler logic based on the parameters
@@ -23,30 +23,30 @@
 // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
 // grammar constraints are applied to the full vocabulary and the token is resampled.
 //
-// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
+// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
 // be moved into the core llama library.
 //
-// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
+// For convenience, the common_sampler also maintains a container with the current candidate tokens.
 // This can be used to access the probabilities of the rest of the non-sampled tokens.
 //
 // TODO: measure grammar performance
 //
 
-struct gpt_sampler;
+struct common_sampler;
 
 // llama_sampler API overloads
 
-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
 
-void gpt_sampler_free(struct gpt_sampler * gsmpl);
+void common_sampler_free(struct common_sampler * gsmpl);
 
 // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
-void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
-struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
+void                   common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+void                   common_sampler_reset (struct common_sampler * gsmpl);
+struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 
 // arguments can be nullptr to skip printing
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
 // extended sampling implementation:
 //
@@ -58,26 +58,26 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 // if grammar_first is true, the grammar is applied before the samplers (slower)
 // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
 //
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
 
 // get the last accepted token
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
+llama_token common_sampler_last(const struct common_sampler * gsmpl);
 
 // print the sampler chain into a string
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
+std::string common_sampler_print(const struct common_sampler * gsmpl);
 
 // get a string representation of the last accepted tokens
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
 
-char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
+char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
 
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
+std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
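The header diff above also shows where the new samplers surface in the public API. A minimal sketch of turning them on, assuming `common_sampler_params` is default-constructible and carries the DRY/XTC fields seen in the sampling.cpp hunks (the numeric values below are illustrative, not upstream defaults):

```cpp
#include "sampling.h"

struct common_sampler * make_sampler(struct llama_model * model) {
    struct common_sampler_params sparams; // assumed default-constructible; fields per the hunks above

    // the chain applies samplers in this order (see the loop in common_sampler_init)
    sparams.samplers = common_sampler_types_from_names(
        { "dry", "top_k", "xtc", "temperature" }, /* allow_alt_names = */ true);

    sparams.dry_multiplier     = 0.8f;  // illustrative; 0.0f is assumed to leave DRY inert
    sparams.dry_base           = 1.75f;
    sparams.dry_allowed_length = 2;
    sparams.dry_penalty_last_n = 256;

    sparams.xtc_probability = 0.5f;     // illustrative values; both fields appear
    sparams.xtc_threshold   = 0.1f;     // in the reworked print() hunk above

    return common_sampler_init(model, sparams);
}
```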
package/src/llama.cpp/docs/build.md

@@ -186,18 +186,16 @@ The following compilation options are also available to tweak performance:
 
 | Option | Legal values | Default | Description |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
 | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
 | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
 | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
 | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
 
 ### MUSA
 
+This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+
 - Using `make`:
   ```bash
   make GGML_MUSA=1
@@ -209,6 +207,12 @@ The following compilation options are also available to tweak performance:
   cmake --build build --config Release
   ```
 
+The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
 ### hipBLAS
 
 This provides BLAS acceleration on HIP-supported AMD GPUs.
@@ -222,7 +226,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
   ```bash
   HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
       && cmake --build build --config Release -- -j 16
   ```
 On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
@@ -239,7 +243,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
   ```bash
   HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
   HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
       && cmake --build build -- -j 16
   ```
@@ -251,7 +255,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
   ```bash
   set PATH=%HIP_PATH%\bin;%PATH%
-  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
   cmake --build build
   ```
 Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -260,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
-The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
-| Option | Legal values | Default | Description |
-|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 
 ### Vulkan
 
@@ -274,9 +271,9 @@ The following compilation options are also available to tweak performance (yes,
 
 #### w64devkit
 
-Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
+Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
 
-Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows)
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
 
 Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
 ```sh
@@ -294,6 +291,29 @@ EOF
 ```
 Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
 
+#### Git Bash MINGW64
+
+Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
+
+Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`
+
+Download and install [`CMake`](https://cmake.org/download/) with the default settings
+
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
+
+Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands
+
+```
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+Now you can load the model in conversation mode using `Vulkan`
+
+```
+build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+```
+
 #### MSYS2
 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
 ```sh
@@ -367,7 +387,7 @@ cmake --build build --config release
 
 You can test with:
 
-`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
 
 If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
 ```bash
package/src/llama.cpp/examples/CMakeLists.txt

@@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
     add_subdirectory(cvector-generator)
-    add_subdirectory(baby-llama)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
     add_subdirectory(convert-llama2c-to-ggml)
@@ -49,6 +48,7 @@ else()
    endif()
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
+    add_subdirectory(simple-chat)
    add_subdirectory(speculative)
    add_subdirectory(tokenize)
 endif()