@fugood/llama.node 0.3.6 → 0.3.8
This diff compares the publicly released contents of the two package versions as published to their registry. It is provided for informational purposes only.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +3 -1
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +61 -6
- package/src/LlamaContext.h +1 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
@@ -97,14 +97,17 @@ int main(int argc, char ** argv) {
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
-    const
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

@@ -147,7 +150,7 @@ int main(int argc, char ** argv) {
     // check if the last token is SEP
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() !=
+        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
             LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
             LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }

@@ -180,7 +183,7 @@ int main(int argc, char ** argv) {
     }
 
     // allocate output
-    const int n_embd =
+    const int n_embd = llama_model_n_embd(model);
     std::vector<float> embeddings(n_embd_count * n_embd, 0);
     float * emb = embeddings.data();
 
@@ -316,8 +319,6 @@ int main(int argc, char ** argv) {
 
     // clean up
     llama_batch_free(batch);
-    llama_free(ctx);
-    llama_free_model(model);
     llama_backend_free();
 
     return 0;
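The hunks above show the pattern this release adopts across the vendored examples: `common_init_from_params` now hands back owning smart pointers (hence the `.get()` calls and the dropped `llama_free`/`llama_free_model`), and model metadata is read through the `llama_vocab` handle. A minimal sketch of that pattern, assuming the bundled `common.h`/`llama.h` headers; the function name and the logging are illustrative only:

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>

// Minimal sketch of the updated init pattern (params is assumed to have been
// filled in by the usual argument parsing).
static int run_sketch(common_params & params) {
    common_init_result llama_init = common_init_from_params(params);

    llama_model   * model = llama_init.model.get();   // the result now owns both handles
    llama_context * ctx   = llama_init.context.get();
    if (model == NULL || ctx == NULL) {
        fprintf(stderr, "%s: failed to init\n", __func__);
        return 1;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    const int n_ctx_train = llama_model_n_ctx_train(model);
    const int n_embd      = llama_model_n_embd(model);
    const int n_ctx       = llama_n_ctx(ctx);

    fprintf(stderr, "n_ctx_train = %d, n_ctx = %d, n_embd = %d, bos = %d\n",
            n_ctx_train, n_ctx, n_embd, llama_vocab_bos(vocab));

    // no llama_free / llama_free_model here: llama_init releases both on scope exit
    return 0;
}
```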
@@ -127,7 +127,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 
 static bool run(llama_context * ctx, const common_params & params) {
-    const
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
 
     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
@@ -162,8 +165,9 @@ int main(int argc, char ** argv) {
     // init
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
         return 1;

@@ -184,9 +188,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     return 0;
@@ -1,12 +1,13 @@
-#include "arg.h"
-#include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "gguf.h"
+
+#include "arg.h"
+#include "common.h"
 
 #include <map>
 #include <vector>
 #include <string>
-#include <thread>
 #include <fstream>
 
 static bool g_verbose = false;

@@ -128,7 +129,7 @@ struct lora_merge_ctx {
 
     lora_merge_ctx(
             std::string & base_fname,
-            std::vector<
+            std::vector<common_adapter_lora_info> & lora_files,
             std::string & outfile,
             int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors

@@ -265,8 +266,8 @@ struct lora_merge_ctx {
             fout.write((const char *)data.data(), data.size());
         }
 
-        printf("%s : merged %
-        printf("%s : wrote %
+        printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged);
+        printf("%s : wrote %zu tensors to output file\n", __func__, trans.size());
     }
 
     void copy_tensor(struct ggml_tensor * base) {

@@ -352,7 +353,7 @@ struct lora_merge_ctx {
                 const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
                 delta = ggml_scale(ctx0, delta, scale);
                 cur = ggml_add(ctx0, delta, cur);
-                printf("%s : + merging from adapter[%
+                printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
                 printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
             }
             cur = ggml_cast(ctx0, cur, out->type);
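The two printf fixes above switch the tensor counts to the `%zu` conversion for `size_t`. A standalone illustration (plain C++, no llama.cpp dependency; the values are made up):

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> trans(42);
    size_t n_merged = 7;

    // %zu is the portable conversion for size_t; %d or %ld would be wrong on
    // platforms where size_t does not match the width of int or long.
    printf("merged %zu tensors with lora adapters\n", n_merged);
    printf("wrote %zu tensors to output file\n", trans.size());
    return 0;
}
```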
@@ -1,10 +1,9 @@
 #include "ggml.h"
+#include "gguf.h"
 
 #include <cstdio>
-#include <cinttypes>
 #include <string>
 #include <sstream>
-#include <fstream>
 #include <vector>
 
 #undef MIN

@@ -135,9 +134,10 @@ static bool gguf_ex_read_0(const std::string & fname) {
 
         for (int i = 0; i < n_tensors; ++i) {
             const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t size   = gguf_get_tensor_size  (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
 
-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
         }
     }
 
@@ -182,9 +182,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
 
         for (int i = 0; i < n_tensors; ++i) {
            const char * name   = gguf_get_tensor_name  (ctx, i);
+           const size_t size   = gguf_get_tensor_size  (ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
 
-           printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+           printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
         }
     }
 
@@ -199,7 +200,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
 
             struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
 
-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n",
+            printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
+                __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
 
             // print first 10 elements
             const float * data = (const float *) cur->data;

@@ -215,7 +217,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
                 const float * data = (const float *) cur->data;
                 for (int j = 0; j < ggml_nelements(cur); ++j) {
                     if (data[j] != 100 + i) {
-                        fprintf(stderr, "%s: tensor[%d]
+                        fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
                         gguf_free(ctx);
                         return false;
                     }

@@ -245,6 +247,8 @@ int main(int argc, char ** argv) {
         check_data = false;
     }
 
+    srand(123456);
+
    const std::string fname(argv[1]);
    const std::string mode (argv[2]);
 
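These hunks track the new standalone `gguf.h` header (added under `ggml/include/gguf.h` in this release) and the extra `gguf_get_tensor_size` call. A minimal sketch that lists the tensors of a GGUF file with that API, assuming the bundled headers; metadata is loaded without allocating tensor data:

```cpp
#include "ggml.h"
#include "gguf.h"

#include <cstdio>

// Sketch: print name, size and offset of every tensor in a GGUF file, using
// the standalone gguf.h API split out of ggml.h in this update.
int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s file.gguf\n", argv[0]);
        return 1;
    }

    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (!ctx) {
        fprintf(stderr, "failed to read %s\n", argv[1]);
        return 1;
    }

    const int n_tensors = gguf_get_n_tensors(ctx);
    for (int i = 0; i < n_tensors; ++i) {
        printf("tensor[%d]: name = %s, size = %zu, offset = %zu\n",
               i, gguf_get_tensor_name(ctx, i), gguf_get_tensor_size(ctx, i), gguf_get_tensor_offset(ctx, i));
    }

    gguf_free(ctx);
    return 0;
}
```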
@@ -1,18 +1,19 @@
+#include "ggml.h"
+#include "gguf.h"
 #include "llama.h"
 #include "common.h"
 
 #include <algorithm>
-#include <
+#include <cinttypes>
+#include <climits>
+#include <cstdio>
 #include <cstdlib>
+#include <stdexcept>
+#include <cstring>
 #include <fstream>
 #include <string>
 #include <vector>
 
-#include <stdio.h>
-#include <string.h>
-#include <climits>
-#include <stdexcept>
-
 #if defined(_WIN32)
     #include <windows.h>
     #ifndef PATH_MAX

@@ -297,7 +298,7 @@ struct split_strategy {
                 total_size += ggml_nbytes(t);
             }
             total_size = total_size / 1000 / 1000; // convert to megabytes
-            printf("split %05d: n_tensors = %
+            printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
             i_split++;
         }
     }
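The split summary now prints the `int64_t` tensor count via the `PRIi64` macro, which is why `<cinttypes>` joins the include list. A standalone illustration with placeholder values:

```cpp
#include <cinttypes>
#include <cstdio>

int main() {
    int64_t n_tensors  = 291;
    size_t  total_size = 4096;

    // PRIi64 expands to the correct printf conversion for int64_t on every
    // platform, avoiding hand-picked %ld / %lld format strings.
    printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", 1, n_tensors, total_size);
    return 0;
}
```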
@@ -11,6 +11,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     std::vector<std::vector<float>> result;
 
     const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
@@ -19,16 +20,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
         const std::string input_string = instruction + sentences[i];
 
-        std::vector<llama_token> inputs = common_tokenize(
+        std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false);
 
         const int32_t n_toks = inputs.size();
 
         // GritLM seems to have EOS = ""
         // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
-        // inputs.push_back(
+        // inputs.push_back(llama_vocab_eos(vocab));
 
         // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = common_tokenize(
+        const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();
 
 #ifdef GRIT_DEBUG
         // debug tokens - should be matching as referenced in the GritLM sample

@@ -52,7 +53,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
         llama_decode(ctx, batch);
 
         // get embedding dimensions
-        uint64_t n_embd =
+        uint64_t n_embd = llama_model_n_embd(model);
 
         // allocate embedding output
         std::vector<float> emb_unorm(n_embd, 0.0f);

@@ -97,7 +98,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
     std::string result;
 
     const llama_model * model = llama_get_model(ctx);
-
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    llama_token eos_token = llama_vocab_eos(vocab);
 
     llama_kv_cache_clear(ctx);
     llama_set_embeddings(ctx, false);

@@ -105,7 +108,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
 
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
-    std::vector<llama_token> inputs = common_tokenize(
+    std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
     int32_t i_current_token = 0;
 
     while (true) {

@@ -165,10 +168,10 @@ int main(int argc, char * argv[]) {
 
     llama_backend_init();
 
-    llama_model * model =
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
 
     // create generation context
-    llama_context * ctx =
+    llama_context * ctx = llama_init_from_model(model, cparams);
 
     auto sparams = llama_sampler_chain_default_params();
 
@@ -197,7 +200,7 @@ int main(int argc, char * argv[]) {
     const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
     const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
 
-    const int n_embd =
+    const int n_embd = llama_model_n_embd(model);
 
     const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
     const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);

@@ -219,7 +222,7 @@ int main(int argc, char * argv[]) {
 
     llama_sampler_free(smpl);
     llama_free(ctx);
-
+    llama_model_free(model);
     llama_backend_free();
 
     return 0;
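The gritlm hunks switch to the renamed lifecycle entry points (`llama_model_load_from_file`, `llama_init_from_model`, `llama_model_free`) and to tokenization through the `llama_vocab` handle. A minimal sketch under the assumption that the bundled headers expose exactly the signatures used above; the model path and prompt are placeholders:

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

// Sketch of the renamed load / init / free sequence and vocab-based tokenization.
int main() {
    llama_backend_init();

    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (!model) {
        return 1;
    }
    llama_context * ctx = llama_init_from_model(model, cparams);

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // tokenization now goes through the vocab handle rather than the context
    std::vector<llama_token> toks = common_tokenize(vocab, "hello", /*add_special*/ true, /*parse_special*/ false);
    printf("tokenized %zu tokens, n_embd = %d\n", toks.size(), llama_model_n_embd(model));

    llama_free(ctx);
    llama_model_free(model); // replaces llama_free_model, removed throughout this diff
    llama_backend_free();
    return 0;
}
```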
@@ -7,7 +7,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <sstream>
 #include <thread>
 #include <mutex>
 #include <vector>

@@ -40,7 +39,7 @@ public:
     void set_params(common_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
-    bool load_imatrix(const char *
+    bool load_imatrix(const char * fname);
 private:
     std::unordered_map<std::string, Stats> m_stats;
     common_params m_params;

@@ -429,10 +428,14 @@ static void process_logits(
 }
 
 static bool compute_imatrix(llama_context * ctx, const common_params & params) {
-    const
-
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
     const int n_ctx = llama_n_ctx(ctx);
 
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
+
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);
 
@@ -467,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     const int n_chunk_max = tokens.size() / n_ctx;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab =
+    const int n_vocab = llama_vocab_n_tokens(vocab);
     const int n_batch = params.n_batch;
 
     int count = 0;

@@ -507,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 
             // add BOS token for the first batch of each chunk
             if (add_bos && j == 0) {
-                tokens[batch_start] =
+                tokens[batch_start] = llama_vocab_bos(vocab);
             }
 
             common_batch_clear(batch);

@@ -618,14 +621,15 @@ int main(int argc, char ** argv) {
     // init
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
-    const int n_ctx_train =
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     if (params.n_ctx > n_ctx_train) {
         LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, params.n_ctx);

@@ -655,9 +659,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     return 0;
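The imatrix hunks keep the same chunked-evaluation logic but now fetch `add_bos` and the BOS id through the vocab handle. A sketch of that chunking rule, assuming the caller already tokenized the full prompt; the helper name and the decode step left as a comment are illustrative:

```cpp
#include "llama.h"

#include <algorithm>
#include <vector>

// Sketch: evaluate a long token stream in n_ctx-sized chunks, overwriting the
// first position of each chunk with BOS when the model expects one, mirroring
// the "add BOS token for the first batch of each chunk" logic above.
static void iterate_chunks(std::vector<llama_token> & tokens,
                           const llama_vocab * vocab, int n_ctx, int n_batch) {
    const bool add_bos = llama_vocab_get_add_bos(vocab);
    const int  n_chunk = (int) tokens.size() / n_ctx;

    for (int i = 0; i < n_chunk; ++i) {
        const int start = i * n_ctx;
        for (int j = 0; j < n_ctx; j += n_batch) {
            const int batch_start = start + j;
            const int batch_size  = std::min(n_ctx - j, n_batch);

            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
                tokens[batch_start] = llama_vocab_bos(vocab);
            }
            // ... build a llama_batch from tokens[batch_start, batch_start + batch_size) and decode it
            (void) batch_size;
        }
    }
}
```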
@@ -131,15 +131,17 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);
 
-    model = llama_init.model;
-    ctx = llama_init.context;
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
-    const
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
     LOG_DBG("n_ctx: %d\n", n_ctx);
 
@@ -152,28 +154,28 @@ int main(int argc, char ** argv) {
         LOG_INF("\n");
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
-    const bool add_bos =
-    GGML_ASSERT(!
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
     std::vector<llama_token> embd_inp;
     std::vector<llama_token> embd_end;
     std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
     std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
+    GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
 
-    inp_pfx.insert(inp_pfx.begin(),
-    inp_sfx.insert(inp_sfx.begin(),
+    inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
+    inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
 
     embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
     embd_end = params.spm_infill ? inp_pfx : inp_sfx;
     if (add_bos) {
-        embd_inp.insert(embd_inp.begin(),
+        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
     }
     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
-    const llama_token middle_token =
+    const llama_token middle_token = llama_vocab_fim_mid(vocab);
     if (middle_token >= 0) {
         embd_inp.push_back(middle_token);
     }

@@ -185,7 +187,7 @@ int main(int argc, char ** argv) {
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
-        embd_inp.push_back(
+        embd_inp.push_back(llama_vocab_bos(vocab));
         LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
     }
 
@@ -420,10 +422,10 @@ int main(int argc, char ** argv) {
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
             // deal with eot token in infill mode
-            if ((common_sampler_last(smpl) ==
+            if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
                     // print an eot token
-                    LOG("%s", common_token_to_piece(ctx,
+                    LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
                 }
                 LOG("\n");
                 console::set_display(console::user_input);

@@ -463,13 +465,13 @@ int main(int argc, char ** argv) {
                 std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
                 std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
 
-                inp_pfx.insert(inp_pfx.begin(),
-                inp_sfx.insert(inp_sfx.begin(),
+                inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
+                inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
 
                 embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
                 embd_end = params.spm_infill ? inp_pfx : inp_sfx;
                 if (add_bos) {
-                    embd_inp.insert(embd_inp.begin(),
+                    embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
                 }
                 embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
@@ -484,7 +486,7 @@ int main(int argc, char ** argv) {
                 is_interacting = false;
             }
             // deal with end of generation tokens in interactive mode
-            else if (
+            else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
                 LOG_DBG("found EOS token\n");
 
                 if (params.interactive) {

@@ -500,7 +502,7 @@ int main(int argc, char ** argv) {
 
             if (params.input_prefix_bos) {
                 LOG_DBG("adding input prefix BOS token\n");
-                embd_inp.push_back(
+                embd_inp.push_back(llama_vocab_bos(vocab));
             }
 
             std::string buffer;

@@ -563,7 +565,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of generation
-        if (!embd.empty() &&
+        if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
            break;
        }
 
@@ -575,15 +577,12 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.interactive && n_remain <= 0) {
-        LOG("%s", common_token_to_piece(ctx,
+        LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
    }
 
    LOG("\n");
    common_perf_print(ctx, smpl);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
    common_sampler_free(smpl);
    llama_backend_free();
 
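The infill hunks assemble the fill-in-the-middle prompt from the new `llama_vocab_fim_*` special tokens. A sketch of that layout, factored into a standalone helper (the function name is illustrative); `spm_infill` flips the order of the two halves, matching the code above:

```cpp
#include "llama.h"

#include <vector>

// Sketch of the FIM prompt layout: <FIM_PRE> prefix <FIM_SUF> suffix <FIM_MID>,
// with an optional leading BOS and the two halves swapped for SPM-style infill.
// inp_pfx / inp_sfx are the already-tokenized prefix and suffix.
static std::vector<llama_token> build_infill_prompt(const llama_vocab * vocab,
                                                    std::vector<llama_token> inp_pfx,
                                                    std::vector<llama_token> inp_sfx,
                                                    bool spm_infill) {
    inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
    inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));

    std::vector<llama_token> embd_inp = spm_infill ? inp_sfx : inp_pfx;
    std::vector<llama_token> embd_end = spm_infill ? inp_pfx : inp_sfx;

    if (llama_vocab_get_add_bos(vocab)) {
        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
    }
    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

    const llama_token middle_token = llama_vocab_fim_mid(vocab);
    if (middle_token >= 0) {
        embd_inp.push_back(middle_token);
    }
    return embd_inp;
}
```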
@@ -683,7 +683,7 @@ struct cmd_params_instance {
     bool cpu_strict;
     int poll;
     int n_gpu_layers;
-    std::string
+    std::string rpc_servers_str;
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;

@@ -696,8 +696,37 @@ struct cmd_params_instance {
         llama_model_params mparams = llama_model_default_params();
 
         mparams.n_gpu_layers = n_gpu_layers;
-        if (!
-
+        if (!rpc_servers_str.empty()) {
+            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
+
+            // add RPC devices
+            if (!rpc_servers.empty()) {
+                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+                if (!rpc_reg) {
+                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
+                    exit(1);
+                }
+
+                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+                if (!ggml_backend_rpc_add_device_fn) {
+                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
+                    exit(1);
+                }
+                static std::vector<ggml_backend_dev_t> devices;
+                devices.clear();
+                for (const std::string & server : rpc_servers) {
+                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+                    if (dev) {
+                        devices.push_back(dev);
+                    } else {
+                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                        exit(1);
+                    }
+                }
+                devices.push_back(nullptr);
+                mparams.devices = devices.data();
+            }
         }
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;

@@ -708,7 +737,7 @@ struct cmd_params_instance {
     }
 
     bool equal_mparams(const cmd_params_instance & other) const {
-        return model == other.model && n_gpu_layers == other.n_gpu_layers &&
+        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
               tensor_split == other.tensor_split;
     }

@@ -1401,7 +1430,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
    llama_set_n_threads(ctx, n_threads, n_threads);
 
    const llama_model * model = llama_get_model(ctx);
-    const
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
 
    std::vector<llama_token> tokens(n_batch);
 
@@ -1409,7 +1439,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
 
    while (n_processed < n_prompt) {
        int n_tokens = std::min(n_prompt - n_processed, n_batch);
-        tokens[0] = n_processed == 0 &&
+        tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
        for (int i = 1; i < n_tokens; i++) {
            tokens[i] = std::rand() % n_vocab;
        }

@@ -1424,9 +1454,10 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);
 
    const llama_model * model = llama_get_model(ctx);
-    const
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
 
-    llama_token token =
+    llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
 
    for (int i = 0; i < n_gen; i++) {
        llama_decode(ctx, llama_batch_get_one(&token, 1));

@@ -1526,10 +1557,10 @@ int main(int argc, char ** argv) {
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
-
+                llama_model_free(lmodel);
            }
 
-            lmodel =
+            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
            if (lmodel == NULL) {
                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                return 1;

@@ -1537,10 +1568,10 @@ int main(int argc, char ** argv) {
            prev_inst = &inst;
        }
 
-        llama_context * ctx =
+        llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
-
+            llama_model_free(lmodel);
            return 1;
        }
 
@@ -1626,7 +1657,7 @@ int main(int argc, char ** argv) {
        ggml_threadpool_free_fn(threadpool);
    }
 
-
+    llama_model_free(lmodel);
 
    if (p) {
        p->print_footer();