@fugood/llama.node 0.3.7 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -0
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +156 -6
- package/src/LlamaContext.h +5 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6

package/src/llama.cpp/common/common.h:

@@ -2,7 +2,7 @@
 
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -24,13 +24,11 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct
+struct common_adapter_lora_info {
     std::string path;
     float scale;
-};
 
-struct
-    struct llama_lora_adapter * adapter;
+    struct llama_adapter_lora * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -105,6 +103,12 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
+enum common_conversation_mode {
+    COMMON_CONVERSATION_MODE_DISABLED = 0,
+    COMMON_CONVERSATION_MODE_ENABLED = 1,
+    COMMON_CONVERSATION_MODE_AUTO = 2,
+};
+
 // sampling parameters
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -180,6 +184,8 @@ struct common_params_vocoder {
 
     std::string model = ""; // model path // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
+
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
 struct common_params {
@@ -242,14 +248,13 @@ struct common_params {
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
-    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using
-    std::vector<
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -277,7 +282,6 @@ struct common_params {
     bool special = false; // enable special token output
     bool interactive = false; // interactive mode
     bool interactive_first = false; // wait for user input immediately
-    bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
 
@@ -303,6 +307,8 @@ struct common_params {
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
+    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
@@ -456,6 +462,11 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }
 
+static bool string_ends_with(const std::string & str,
+                             const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
@@ -478,10 +489,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
+// note: defines object's lifetime
 struct common_init_result {
-
-
-
+    llama_model_ptr model;
+    llama_context_ptr context;
+
+    std::vector<llama_adapter_lora_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -501,9 +514,12 @@ struct llama_model * common_load_model_from_hf(
     const std::string & local_path,
     const std::string & hf_token,
     const struct llama_model_params & params);
+std::pair<std::string, std::string> common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & hf_token);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
 //
 // Batch utils
@@ -541,7 +557,7 @@ std::vector<llama_token> common_tokenize(
     bool parse_special = false);
 
 std::vector<llama_token> common_tokenize(
-    const struct
+    const struct llama_vocab * vocab,
     const std::string & text,
     bool add_special,
     bool parse_special = false);
@@ -553,11 +569,21 @@ std::string common_token_to_piece(
     llama_token token,
     bool special = true);
 
+std::string common_token_to_piece(
+    const struct llama_vocab * vocab,
+    llama_token token,
+    bool special = true);
+
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string common_detokenize(
-
+    const struct llama_context * ctx,
+    const std::vector<llama_token> & tokens,
+    bool special = true);
+
+std::string common_detokenize(
+    const struct llama_vocab * vocab,
     const std::vector<llama_token> & tokens,
     bool special = true);
 
@@ -571,6 +597,9 @@ struct common_chat_msg {
     std::string content;
 };
 
+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
 
@@ -637,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
 
-
-
-
+namespace {
+
+const char * const LLM_KV_SPLIT_NO = "split.no";
+const char * const LLM_KV_SPLIT_COUNT = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
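
The common.h hunks above rename the LoRA helper types (common_adapter_lora_info, llama_adapter_lora) and turn common_init_result into an owning struct built on the smart pointers from llama-cpp.h. Below is a minimal sketch of how a caller might consume the reworked struct; the run() wrapper, its error handling, and the include paths are illustrative assumptions, not code shipped in this package.

```cpp
// Sketch only: consuming the new common_init_result, which now owns the model and
// context through llama_model_ptr / llama_context_ptr from llama-cpp.h.
#include "common.h"     // assumed include path for llama.cpp's common helpers
#include "llama-cpp.h"

int run(common_params & params) {
    common_init_result init = common_init_from_params(params);

    // raw, non-owning views; their lifetime is tied to `init`
    llama_model   * model = init.model.get();
    llama_context * ctx   = init.context.get();
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // LoRA adapters are now described by common_adapter_lora_info (path, scale, ptr)
    common_set_adapter_lora(ctx, params.lora_adapters);

    // ... inference ...

    return 0; // the smart pointers free the context and model when `init` goes out of scope
}
```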

package/src/llama.cpp/common/ngram-cache.cpp:

@@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
     common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
-        return
+        return LLAMA_TOKEN_NULL;
     }
     const common_ngram_cache_part part_static = part_static_it->second;
 
     int max_count_static = 0;
     int sum_count_static = 0;
-    llama_token max_token =
+    llama_token max_token = LLAMA_TOKEN_NULL;
 
     for (std::pair<llama_token, int> token_count_static : part_static) {
         const llama_token token = token_count_static.first;
@@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
     }
 
     if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
-        return
+        return LLAMA_TOKEN_NULL;
     }
     if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
-        return
+        return LLAMA_TOKEN_NULL;
     }
     return max_token;
 }
@@ -98,9 +98,9 @@ static llama_token try_draft(
     common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {
 
-    llama_token drafted_token =
+    llama_token drafted_token = LLAMA_TOKEN_NULL;
 
-    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token ==
+    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
         const common_ngram ngram_primary = ngrams_primary[i];
 
         common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@@ -112,7 +112,7 @@ static llama_token try_draft(
         int max_count_primary = 0;
         int max_count_static = 0;
         int sum_count_primary = 0;
-        llama_token max_token =
+        llama_token max_token = LLAMA_TOKEN_NULL;
 
         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;
@@ -154,7 +154,7 @@ void common_ngram_cache_draft(
     }
 
     while ((int) draft.size()-1 < n_draft) {
-        llama_token drafted_token =
+        llama_token drafted_token = LLAMA_TOKEN_NULL;
 
         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
         common_ngram ngram_static;
@@ -177,17 +177,17 @@ void common_ngram_cache_draft(
             }
             ngrams_cd.push_back(ngram_cd);
         }
-        if (drafted_token ==
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
         }
-        if (drafted_token ==
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
         }
-        if (drafted_token ==
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_static, ngram_static);
         }
 
-        if (drafted_token ==
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             break;
         }
 

package/src/llama.cpp/common/ngram-cache.h:

@@ -17,13 +17,13 @@ struct common_ngram {
 
     common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] =
+            tokens[i] = LLAMA_TOKEN_NULL;
         }
     }
 
     common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = i < ngram_size ? input[i] :
+            tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
         }
     }
 
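
The ngram-cache hunks are a mechanical switch to the named LLAMA_TOKEN_NULL sentinel from llama.h. The hedged sketch below restates the drafting fallback pattern from common_ngram_cache_draft in isolation; try_context, try_dynamic and try_static are hypothetical stand-ins for the real try_draft calls.

```cpp
// Sketch of the sentinel-based fallback chain: each cache is tried in turn and
// LLAMA_TOKEN_NULL (declared in llama.h) marks "no draft found".
#include "llama.h"

llama_token draft_one(llama_token (*try_context)(),
                      llama_token (*try_dynamic)(),
                      llama_token (*try_static)()) {
    llama_token drafted = LLAMA_TOKEN_NULL;
    if (drafted == LLAMA_TOKEN_NULL) { drafted = try_context(); }
    if (drafted == LLAMA_TOKEN_NULL) { drafted = try_dynamic(); }
    if (drafted == LLAMA_TOKEN_NULL) { drafted = try_static(); }
    return drafted; // still LLAMA_TOKEN_NULL -> the caller stops drafting
}
```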

package/src/llama.cpp/common/sampling.cpp:

@@ -113,7 +113,10 @@ struct common_sampler {
     void set_logits(struct llama_context * ctx, int idx) {
         const auto * logits = llama_get_logits_ith(ctx, idx);
 
-        const
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+
+        const int n_vocab = llama_vocab_n_tokens(vocab);
 
         cur.resize(n_vocab);
 
@@ -142,13 +145,15 @@ std::string common_params_sampling::print() const {
 }
 
 struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
     lparams.no_perf = params.no_perf;
 
     auto * result = new common_sampler {
         /* .params = */ params,
-        /* .grmr = */ llama_sampler_init_grammar(
+        /* .grmr = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
         /* .chain = */ llama_sampler_chain_init(lparams),
         /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur = */ {},
@@ -157,7 +162,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     llama_sampler_chain_add(result->chain,
             llama_sampler_init_logit_bias(
-
+                llama_vocab_n_tokens(vocab),
                 params.logit_bias.size(),
                 params.logit_bias.data()));
 
@@ -172,7 +177,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     c_breakers.push_back(str.c_str());
                 }
 
-                llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
             }
             break;
         case COMMON_SAMPLER_TYPE_TOP_K:
@@ -194,7 +199,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
             break;
         case COMMON_SAMPLER_TYPE_INFILL:
-            llama_sampler_chain_add(result->chain, llama_sampler_init_infill (
+            llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
             break;
         case COMMON_SAMPLER_TYPE_PENALTIES:
             llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
@@ -206,7 +211,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
     } else if (params.mirostat == 1) {
         llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
     } else if (params.mirostat == 2) {
         llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
         llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
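
The sampling changes route vocabulary queries through the new llama_vocab handle instead of the model. Below is a hedged sketch of that lookup chain (context -> model -> vocab) as used when building a candidate list; make_candidates is an illustrative helper, not an API of this package.

```cpp
// Sketch only: the vocab lookup chain now used by the sampler, assuming a valid
// llama_context created elsewhere.
#include "llama.h"
#include <vector>

static std::vector<llama_token_data> make_candidates(llama_context * ctx, int idx) {
    const float * logits = llama_get_logits_ith(ctx, idx);

    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const int n_vocab = llama_vocab_n_tokens(vocab);

    std::vector<llama_token_data> cur(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cur[id] = llama_token_data{ id, logits[id], 0.0f };
    }
    return cur;
}
```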

package/src/llama.cpp/common/speculative.cpp:

@@ -79,10 +79,13 @@ bool common_speculative_are_compatible(
     const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
     const struct llama_model * model_dft = llama_get_model(ctx_dft);
 
-    const
+    const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
+    const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
+
+    const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
     LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
 
-    const bool vocab_type_dft = llama_vocab_type(
+    const bool vocab_type_dft = llama_vocab_type(vocab_dft);
     LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
 
     if (vocab_type_tgt != vocab_type_dft) {
@@ -91,34 +94,34 @@ bool common_speculative_are_compatible(
         return false;
     }
 
-    if (
-
-
-
-        LOG_ERR("%s: draft
-        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__,
-        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__,
+    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
+        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
+        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
+        LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
         return false;
     }
 
     {
-        const int n_vocab_tgt =
-        const int n_vocab_dft =
+        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
+        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
 
         const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
 
         if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
             LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
                     "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
-                    __func__, n_vocab_tgt,
+                    __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
             return false;
         }
 
         for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
-            const char * token_text_tgt =
-            const char * token_text_dft =
+            const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
+            const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
             if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-                LOG_ERR("%s: draft
+                LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
                         "token %d content differs - target '%s', draft '%s'\n", __func__, i,
                         common_token_to_piece(ctx_tgt, i).c_str(),
                         common_token_to_piece(ctx_dft, i).c_str());
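
The speculative-decoding compatibility check now reads special-token properties from the two llama_vocab handles. A condensed, hedged sketch of that check, under the assumption that both vocab pointers were obtained via llama_model_get_vocab as in the hunk above:

```cpp
// Sketch only: both vocabularies must agree on the BOS/EOS ids and on whether
// those tokens are auto-added, otherwise drafting is rejected.
#include "llama.h"

static bool special_tokens_match(const llama_vocab * vocab_tgt, const llama_vocab * vocab_dft) {
    return llama_vocab_get_add_bos(vocab_tgt) == llama_vocab_get_add_bos(vocab_dft) &&
           llama_vocab_get_add_eos(vocab_tgt) == llama_vocab_get_add_eos(vocab_dft) &&
           llama_vocab_bos(vocab_tgt)         == llama_vocab_bos(vocab_dft)         &&
           llama_vocab_eos(vocab_tgt)         == llama_vocab_eos(vocab_dft);
}
```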

package/src/llama.cpp/docs/build.md:

@@ -127,6 +127,8 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
 
 This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
 
+If you are using Fedora (using Fedora Workstation, or an 'Atomic' variant such as Silverblue), or would like to set up CUDA in a toolbox, please consider our [Fedora CUDA guide](./cuda-fedora.md). Unfortunately, the process is not as simple as one might expect.
+
 - Using `CMake`:
 
   ```bash

package/src/llama.cpp/examples/batched/batched.cpp:

@@ -41,17 +41,19 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model =
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = common_tokenize(
+    tokens_list = common_tokenize(vocab, params.prompt, true);
 
     const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
 
@@ -62,7 +64,7 @@ int main(int argc, char ** argv) {
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_predict, n_parallel);
 
-    llama_context * ctx =
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
 
     auto sparams = llama_sampler_chain_default_params();
     sparams.no_perf = false;
@@ -120,8 +122,8 @@ int main(int argc, char ** argv) {
     }
 
     llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-    if (decoder_start_token_id ==
-        decoder_start_token_id =
+    if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+        decoder_start_token_id = llama_vocab_bos(vocab);
     }
 
     common_batch_clear(batch);
@@ -174,7 +176,7 @@ int main(int argc, char ** argv) {
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
 
             // is it an end of generation? -> mark the stream as finished
-            if (
+            if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
                 LOG("\n");
                 if (n_parallel > 1) {
@@ -236,7 +238,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler_free(smpl);
     llama_free(ctx);
-
+    llama_model_free(model);
 
     llama_backend_free();
 
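
The example programs migrate to the renamed lifecycle entry points: llama_model_load_from_file, llama_init_from_model and llama_model_free. A self-contained sketch of that lifecycle follows; the model path, default parameters and the load_and_free wrapper are placeholders for illustration.

```cpp
// Sketch only: load a model, create a context, query the vocab, then release both
// with the entry points this release switches to.
#include "llama.h"

int load_and_free(const char * model_path) {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(model_path, mparams);
    if (model == NULL) {
        return 1;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);
    (void) vocab; // e.g. llama_vocab_bos(vocab), llama_vocab_is_eog(vocab, tok)

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_model(model, cparams);

    if (ctx != NULL) {
        llama_free(ctx);
    }
    llama_model_free(model); // replaces the older llama_free_model
    return 0;
}
```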

package/src/llama.cpp/examples/batched-bench/batched-bench.cpp:

@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model =
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -50,7 +50,7 @@ int main(int argc, char ** argv) {
     // ensure enough sequences are available
     ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
 
-    llama_context * ctx =
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
 
     if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
     llama_batch_free(batch);
 
     llama_free(ctx);
-
+    llama_model_free(model);
 
     llama_backend_free();
 

package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp:

@@ -1,4 +1,6 @@
 #include "ggml.h"
+#include "gguf.h"
+
 #include "llama.h"
 #include "common.h"
 #include "log.h"
@@ -434,12 +436,12 @@ static void print_matrix(struct ggml_tensor * probs) {
     }
 }
 
-struct
+struct my_llama_file {
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
 
-
+    my_llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
             size = 0;
@@ -500,7 +502,7 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    ~
+    ~my_llama_file() {
         if (fp) {
             std::fclose(fp);
         }
@@ -508,7 +510,7 @@ struct llama_file {
 };
 
 static bool is_ggml_file(const char * filename) {
-
+    my_llama_file file(filename, "rb");
     if (file.size < 4) {
         return false;
     }
@@ -576,7 +578,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
     } else {
         // assume llama2.c vocabulary
         LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
-
+        my_llama_file file(filename, "rb");
         if (!file.fp) {
             die_fmt("%s: %s", strerror(errno), filename);
         }
@@ -689,8 +691,8 @@ static void save_as_llama_model(
     gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
-    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID,
-    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID,
+    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);
 
     gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
     gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
@@ -909,7 +911,7 @@ int main(int argc, char ** argv) {
     load_vocab(params.fn_vocab_model, &config, &vocab);
 
     struct my_llama_model model;
-    model.hparams.n_vocab = config.vocab_size; //
+    model.hparams.n_vocab = config.vocab_size; //llama_vocab_n_vocab(lctx);
     model.hparams.n_ctx = params.n_ctx;
     model.hparams.n_embd = config.dim; //params.n_embd;
     model.hparams.n_ff = config.hidden_dim;
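
This file now includes gguf.h explicitly because the GGUF reader/writer was split out of ggml.h into its own public header in this release (see ggml/include/gguf.h and ggml/src/gguf.cpp in the file list). Below is a hedged sketch of writing GGUF metadata through that header; the keys, values and the write_meta helper are illustrative only.

```cpp
// Sketch only: create an empty GGUF context, set a couple of u32 metadata keys,
// and write just the metadata to disk.
#include "ggml.h"
#include "gguf.h"

void write_meta(const char * out_path) {
    struct gguf_context * ctx = gguf_init_empty();

    gguf_set_val_u32(ctx, "tokenizer.ggml.bos_token_id", 1);
    gguf_set_val_u32(ctx, "tokenizer.ggml.eos_token_id", 2);

    gguf_write_to_file(ctx, out_path, /*only_meta=*/true);
    gguf_free(ctx);
}
```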

package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp:

@@ -1,7 +1,9 @@
+#include "ggml.h"
+#include "gguf.h"
+
 #include "arg.h"
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "pca.hpp"
 #include "mean.hpp"
 
@@ -271,7 +273,9 @@ struct tokenized_prompt {
     size_t max_seq_len;
 
     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
-        const
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const bool add_bos = llama_vocab_get_add_bos(vocab);
         tokens_pos = common_tokenize(ctx, pos, add_bos, true);
         tokens_neg = common_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@@ -415,12 +419,13 @@ int main(int argc, char ** argv) {
     // load the model to get hparams
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
     // int n_ctx = llama_n_ctx(ctx);
-    int n_layers =
-    int n_embd =
+    int n_layers = llama_model_n_layer(model);
+    int n_embd = llama_model_n_embd(model);
+
     // get model hint param (a.k.a model arch name)
     char model_hint[128];
     llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
@@ -474,8 +479,6 @@ int main(int argc, char ** argv) {
 
     // done with the model, we can now free it to make gain some memory
     printf("Done evaluate prompts, unload model...\n");
-    llama_free(ctx);
-    llama_free_model(model);
 
     bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
 

package/src/llama.cpp/examples/cvector-generator/mean.hpp:

@@ -15,7 +15,7 @@ static void run(
     for (size_t il = 0; il < v_input.size(); ++il) {
         // prepare output vector
         struct ggml_tensor * ctrl_out = v_output[il];
-        ggml_format_name(ctrl_out, "direction.%
+        ggml_format_name(ctrl_out, "direction.%zu", il+1);
 
         // calculate mean vector
         struct ggml_tensor * t_layer = v_input[il];

package/src/llama.cpp/examples/cvector-generator/pca.hpp:

@@ -302,7 +302,7 @@ static void run_pca(
 
         // prepare output vector
         struct ggml_tensor * ctrl_out = v_output[il];
-        ggml_format_name(ctrl_out, "direction.%
+        ggml_format_name(ctrl_out, "direction.%zu", il+1);
 
         // run power_iteration
         params.i_layer = il;