@fugood/llama.node 0.0.1-alpha.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +36 -7
- package/README.md +9 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/lib/binding.js +1 -1
- package/lib/binding.ts +5 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +18 -1
- package/src/common.hpp +11 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
Selected hunks from the diff follow, grouped by file.

package/src/llama.cpp/common/common.h:

@@ -31,6 +31,8 @@
     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)
 
+#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const *LLAMA_COMMIT;
@@ -86,13 +88,13 @@ struct gpt_params {
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
-    llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
 
     // // sampling parameters
     struct llama_sampling_params sparams;
 
-    std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model = ""; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
@@ -133,11 +135,12 @@ struct gpt_params {
     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
 
-    bool kl_divergence = false; // compute KL-divergence
+    bool kl_divergence = false; // compute KL divergence
 
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+    bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@@ -148,6 +151,7 @@ struct gpt_params {
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
+    bool flash_attn = false; // flash attention
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
@@ -161,15 +165,20 @@
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
+    bool check_tensors = false; // validate tensor data
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = "";
-    std::string image = "";
+    std::string mmproj = ""; // path to multimodal projector
+    std::vector<std::string> image; // path to image file(s)
 };
 
+void gpt_params_handle_model_default(gpt_params & params);
+
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -193,6 +202,7 @@ bool validate_file_name(const std::string & filename);
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
+std::string string_strip(const std::string & str);
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
 
 //
@@ -237,11 +247,12 @@ std::vector<llama_token> llama_tokenize(
         bool add_special,
         bool parse_special = false);
 
-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
         const struct llama_context * ctx,
-        llama_token token);
+        llama_token token,
+        bool special = true);
 
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 // that takes into account the tokenizer type and decides how to handle the leading space
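The `llama_token_to_piece` change above adds a `special` flag that controls whether special/control tokens are rendered. A minimal sketch of how a caller might use the new parameter (the `tokens_to_text` helper is illustrative, not part of this package):

```cpp
#include <string>
#include <vector>
#include "common.h" // llama_token_to_piece() as declared in the hunk above

// Sketch: detokenize a sequence; special = true also renders control tokens
// (useful for debugging), special = false hides them from user-facing output.
static std::string tokens_to_text(const llama_context * ctx,
                                  const std::vector<llama_token> & tokens,
                                  bool special) {
    std::string text;
    for (const llama_token tok : tokens) {
        text += llama_token_to_piece(ctx, tok, special);
    }
    return text;
}
```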
package/src/llama.cpp/common/log.h:

@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 // USE LOG() INSTEAD
 //
-#if !defined(_MSC_VER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
 #define LOG_IMPL(str, ...) \
 do { \
     if (LOG_TARGET != nullptr) \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 // USE LOG_TEE() INSTEAD
 //
-#if !defined(_MSC_VER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
 #define LOG_TEE_IMPL(str, ...) \
 do { \
     if (LOG_TARGET != nullptr) \
package/src/llama.cpp/common/sampling.cpp:

@@ -1,4 +1,6 @@
+#define LLAMA_API_INTERNAL
 #include "sampling.h"
+#include <random>
 
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
     struct llama_sampling_context * result = new llama_sampling_context();
@@ -33,6 +35,10 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->prev.resize(params.n_prev);
 
+    result->n_considered = 0;
+
+    llama_sampling_set_rng_seed(result, params.seed);
+
     return result;
 }
 
@@ -60,6 +66,14 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
+    ctx->n_considered = 0;
+}
+
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = std::random_device{}();
+    }
+    ctx->rng.seed(seed);
 }
 
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
@@ -203,7 +217,7 @@ static llama_token llama_sampling_sample_impl(
 
     sampler_queue(ctx_main, params, cur_p, min_keep);
 
-    id = llama_sample_token(ctx_main, &cur_p);
+    id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
 
     //{
    //    const int n_top = 10;
@@ -242,6 +256,8 @@ static llama_token llama_sampling_sample_impl(
         }
     }
 
+    ctx_sampling->n_considered = cur_p.size;
+
     return id;
 }
 
package/src/llama.cpp/common/sampling.h:

@@ -4,9 +4,10 @@
 
 #include "grammar-parser.h"
 
+#include <random>
 #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
 // sampler types
 enum class llama_sampler_type : char {
@@ -20,25 +21,26 @@ enum class llama_sampler_type : char {
 
 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t n_prev = 64;
-    int32_t n_probs = 0;
-    int32_t min_keep = 0;
-    int32_t top_k = 40;
-    float top_p = 0.95f;
-    float min_p = 0.05f;
-    float tfs_z = 1.00f;
-    float typical_p = 1.00f;
-    float temp = 0.80f;
-    float dynatemp_range = 0.00f;
-    float dynatemp_exponent = 1.00f;
-    int32_t penalty_last_n = 64;
-    float penalty_repeat = 1.00f;
-    float penalty_freq = 0.00f;
-    float penalty_present = 0.00f;
-    int32_t mirostat = 0;
-    float mirostat_tau = 5.00f;
-    float mirostat_eta = 0.10f;
-    bool penalize_nl = false;
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float tfs_z = 1.00f; // 1.0 = disabled
+    float typical_p = 1.00f; // 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
 
     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
@@ -79,6 +81,9 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token> prev;
     std::vector<llama_token_data> cur;
+    size_t n_considered;
+
+    std::mt19937 rng;
 };
 
 #include "common.h"
@@ -93,6 +98,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);
 
+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
 
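The sampling hunks above move the RNG into `llama_sampling_context` and expose a `seed` parameter plus `llama_sampling_set_rng_seed`. A hedged sketch of requesting reproducible sampling with the new fields (only the declarations shown in these hunks are assumed; the helper name is illustrative):

```cpp
#include "sampling.h" // llama_sampling_params, llama_sampling_init, llama_sampling_set_rng_seed

// Sketch: build a sampling context whose token draws are reproducible.
static llama_sampling_context * make_seeded_sampler(uint32_t seed) {
    llama_sampling_params sparams;
    sparams.seed = seed; // new field; LLAMA_DEFAULT_SEED falls back to std::random_device
    return llama_sampling_init(sparams); // seeds the context's internal std::mt19937
}
```

The same context can later be re-seeded in place with `llama_sampling_set_rng_seed(ctx_sampling, seed)` instead of being recreated.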
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp:

@@ -32,7 +32,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
 
     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
         printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
         printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
         return 1 ;
@@ -41,6 +41,7 @@
     int n_kv_max = 2048;
     int n_batch = 2048;
     int n_ubatch = 512;
+    bool flash_attn = false;
     int is_pp_shared = 0;
     int n_gpu_layers = 0;
 
@@ -66,23 +67,27 @@
     }
 
     if (argc >= 6) {
-        is_pp_shared = std::atoi(argv[5]);
+        flash_attn = std::atoi(argv[5]);
     }
 
     if (argc >= 7) {
-        n_gpu_layers = std::atoi(argv[6]);
+        is_pp_shared = std::atoi(argv[6]);
     }
 
     if (argc >= 8) {
-        n_pp = parse_list(argv[7]);
+        n_gpu_layers = std::atoi(argv[7]);
     }
 
     if (argc >= 9) {
-        n_tg = parse_list(argv[8]);
+        n_pp = parse_list(argv[8]);
     }
 
     if (argc >= 10) {
-        n_pl = parse_list(argv[9]);
+        n_tg = parse_list(argv[9]);
+    }
+
+    if (argc >= 11) {
+        n_pl = parse_list(argv[10]);
     }
 
     // init LLM
@@ -108,10 +113,11 @@
 
     llama_context_params ctx_params = llama_context_default_params();
 
-    ctx_params.seed     = 1234;
-    ctx_params.n_ctx    = n_kv_max;
-    ctx_params.n_batch  = n_batch;
-    ctx_params.n_ubatch = n_ubatch;
+    ctx_params.seed       = 1234;
+    ctx_params.n_ctx      = n_kv_max;
+    ctx_params.n_batch    = n_batch;
+    ctx_params.n_ubatch   = n_ubatch;
+    ctx_params.flash_attn = flash_attn;
 
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -169,7 +175,7 @@
     }
 
     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");
 
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
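The new `[FATTN]` positional argument maps onto the `flash_attn` field of `llama_context_params`. A small sketch of enabling it programmatically, mirroring the assignments in the hunk above (the helper name is illustrative):

```cpp
#include "llama.h"

// Sketch: create a context with flash attention enabled.
static llama_context * new_ctx_with_fattn(llama_model * model, int n_kv_max) {
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx      = n_kv_max;
    ctx_params.n_batch    = 2048;
    ctx_params.n_ubatch   = 512;
    ctx_params.flash_attn = true; // field exercised by this release
    return llama_new_context_with_model(model, ctx_params);
}
```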
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp:

@@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
     float v;
     if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
+        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
     } else if (type == GGML_TYPE_F32) {
-        v = *(float *) data + i;
+        v = *(float *) &data[i];
     } else if (type == GGML_TYPE_I32) {
-        v = (float) *(int32_t *) data + i;
+        v = (float) *(int32_t *) &data[i];
     } else if (type == GGML_TYPE_I16) {
-        v = (float) *(int16_t *) data + i;
+        v = (float) *(int16_t *) &data[i];
     } else if (type == GGML_TYPE_I8) {
-        v = (float) *(int8_t *) data + i;
+        v = (float) *(int8_t *) &data[i];
     } else {
         GGML_ASSERT(false);
     }
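The eval-callback fix switches the reads to `&data[i]`: `data` is a byte pointer and `i` a byte offset, so the bytes at offset `i` must be reinterpreted as the element type before dereferencing. A standalone illustration of the distinction (not taken from the diff):

```cpp
#include <cstddef>
#include <cstdint>

// data points at raw tensor bytes; i is a byte offset into that buffer.
static float read_f32_at(const uint8_t * data, size_t i) {
    return *(const float *) &data[i];     // reinterpret the bytes at offset i
    // *(const float *) data + i          // would read element 0 and add i to its value
    // (const float *) data + i           // would advance by i elements, not i bytes
}
```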
package/src/llama.cpp/examples/finetune/finetune.cpp:

@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
 
     auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
             return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
         } else if (a->type == GGML_TYPE_F32) {
             return ggml_add(ctx, a, b);
package/src/llama.cpp/examples/gguf-split/gguf-split.cpp:

@@ -32,6 +32,7 @@ struct split_params {
     int n_split_tensors = 128;
     std::string input;
     std::string output;
+    bool no_tensor_first_split = false;
     bool dry_run = false;
 };
 
@@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) {
     printf(" --merge merge multiple GGUF to a single GGUF\n");
     printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
     printf(" --split-max-size N(M|G) max size per split\n");
+    printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
     printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
     printf("\n");
 }
@@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
             arg_found = true;
             params.dry_run = true;
         }
+        if (arg == "--no-tensor-first-split") {
+            arg_found = true;
+            params.no_tensor_first_split = true;
+        }
 
         if (is_op_set) {
             throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@@ -200,10 +206,10 @@
         // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
         int i_split = -1;
         struct gguf_context * ctx_out = NULL;
-        auto new_ctx_out = [&]() {
+        auto new_ctx_out = [&](bool allow_no_tensors) {
             i_split++;
             if (ctx_out != NULL) {
-                if (gguf_get_n_tensors(ctx_out) == 0) {
+                if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
                     fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
                     exit(EXIT_FAILURE);
                 }
@@ -220,7 +226,12 @@
         };
 
         // initialize ctx_out for the first split
-        new_ctx_out();
+        new_ctx_out(false);
+
+        // skip first split if no_tensor_first_split is set
+        if (params.no_tensor_first_split) {
+            new_ctx_out(true);
+        }
 
         // process tensors one by one
         size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@@ -230,7 +241,7 @@
             size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
             size_t next_tensors_size = curr_tensors_size + n_bytes;
             if (should_split(i, next_tensors_size)) {
-                new_ctx_out();
+                new_ctx_out(false);
                 curr_tensors_size = n_bytes;
             } else {
                 curr_tensors_size = next_tensors_size;
package/src/llama.cpp/examples/imatrix/imatrix.cpp:

@@ -19,10 +19,12 @@
 
 struct Stats {
     std::vector<float> values;
+    std::vector<int> counts;
     int ncall = 0;
 };
 
 struct StatParams {
+    std::string dataset;
     std::string ofile = "imatrix.dat";
     int n_output_frequency = 10;
     int verbosity = 1;
@@ -46,7 +48,7 @@ private:
     std::vector<float> m_src1_data;
     std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
     //
-    void save_imatrix(const char * file_name) const;
+    void save_imatrix(const char * file_name, const char * dataset) const;
     void keep_imatrix(int ncall) const;
 };
 
@@ -120,12 +122,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         auto & e = m_stats[wname];
 
         ++e.ncall;
-        // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
-        // using the following line, we can correct for that if needed by replacing the line above with:
-        //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
 
         if (e.values.empty()) {
             e.values.resize(src1->ne[0]*n_as, 0);
+            e.counts.resize(src1->ne[0]*n_as, 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
@@ -152,6 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
                 for (int j = 0; j < (int)src1->ne[0]; ++j) {
                     e.values[e_start + j] += x[j]*x[j];
+                    e.counts[e_start + j]++;
                 }
             }
         }
@@ -169,6 +170,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         auto& e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
+            e.counts.resize(src1->ne[0], 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
@@ -182,6 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             const float * x = data + row * src1->ne[0];
             for (int j = 0; j < (int)src1->ne[0]; ++j) {
                 e.values[j] += x[j]*x[j];
+                e.counts[j]++;
             }
         }
         if (e.ncall > m_last_call) {
@@ -199,7 +202,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }
 
 void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
 }
 
 void IMatrixCollector::keep_imatrix(int ncall) const {
@@ -207,24 +210,39 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
     if (file_name.empty()) file_name = "imatrix.dat";
     file_name += ".at_";
     file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str());
+    save_imatrix(file_name.c_str(), m_params.dataset.c_str());
 }
 
-void IMatrixCollector::save_imatrix(const char * fname) const {
+void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
     std::ofstream out(fname, std::ios::binary);
     int n_entries = m_stats.size();
-    out.write((const char*)&n_entries, sizeof(n_entries));
-    for (auto& p : m_stats) {
+    out.write((const char *) &n_entries, sizeof(n_entries));
+    for (const auto & p : m_stats) {
         int len = p.first.size();
-        out.write((const char*)&len, sizeof(len));
+        out.write((const char *) &len, sizeof(len));
         out.write(p.first.c_str(), len);
-        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
+        out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
         int nval = p.second.values.size();
-        out.write((const char*)&nval, sizeof(nval));
-        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
+        out.write((const char *) &nval, sizeof(nval));
+        if (nval > 0) {
+            std::vector<float> tmp(nval);
+            for (int i = 0; i < nval; i++) {
+                tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
+            }
+            out.write((const char*)tmp.data(), nval*sizeof(float));
+        }
     }
+
+    // Write the number of call the matrix was computed with
+    out.write((const char *) &m_last_call, sizeof(m_last_call));
+
+    // Write the dataset name at the end of the file to later on specify it in quantize
+    int n_dataset = strlen(dataset);
+    out.write((const char *) &n_dataset, sizeof(n_dataset));
+    out.write(dataset, n_dataset);
+
     if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
     }
 }
 
@@ -260,14 +278,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
            imatrix_data = {};
            return false;
        }
-        e.values.resize(nval);
-        in.read((char*)e.values.data(), nval*sizeof(float));
+
+        // When re-called from load_imatrix() with add set, this will already be created.
+        if (e.values.empty()) {
+            e.values.resize(nval, 0);
+            e.counts.resize(nval, 0);
+        }
+
+        std::vector<float> tmp(nval);
+        in.read((char*)tmp.data(), nval*sizeof(float));
        if (in.fail()) {
            printf("%s: failed reading data for entry %d\n",__func__,i);
            imatrix_data = {};
            return false;
        }
-        e.ncall = ncall;
+
+        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
+        for (int i = 0; i < nval; i++) {
+            e.values[i] += tmp[i];
+            e.counts[i] += ncall;
+        }
+        e.ncall += ncall;
+
     }
     return true;
 }
@@ -547,6 +579,29 @@ int main(int argc, char ** argv) {
        }
    }
 
+    gpt_params params;
+    params.n_batch = 512;
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
+        return 1;
+    }
+
+    params.logits_all = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+    print_build_info();
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    sparams.dataset = params.prompt_file;
     g_collector.set_parameters(std::move(sparams));
 
     if (!combine_files.empty()) {
@@ -585,28 +640,6 @@ int main(int argc, char ** argv) {
        }
    }
 
-    gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }
-
-    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
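With the `counts` and `dataset` additions, `save_imatrix()` now writes count-normalized values and appends the chunk count and the dataset name to the file. A hedged sketch of a reader for that layout, derived only from the write calls in the hunks above (error handling trimmed; this is not the loader shipped with the package):

```cpp
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>

struct imatrix_entry {
    int ncall = 0;
    std::vector<float> values;
};

// Sketch: parse the imatrix file layout implied by save_imatrix() above.
static bool read_imatrix(const char * fname,
                         std::unordered_map<std::string, imatrix_entry> & data,
                         int & last_call, std::string & dataset) {
    std::ifstream in(fname, std::ios::binary);
    if (!in) return false;

    int n_entries = 0;
    in.read((char *) &n_entries, sizeof(n_entries));
    for (int i = 0; i < n_entries && in.good(); ++i) {
        int len = 0;
        in.read((char *) &len, sizeof(len));
        std::string name(len, '\0');
        in.read(&name[0], len);

        imatrix_entry e;
        in.read((char *) &e.ncall, sizeof(e.ncall));
        int nval = 0;
        in.read((char *) &nval, sizeof(nval));
        e.values.resize(nval);
        in.read((char *) e.values.data(), nval * sizeof(float));
        data[name] = std::move(e);
    }

    // trailing fields introduced in this release
    in.read((char *) &last_call, sizeof(last_call));
    int n_dataset = 0;
    in.read((char *) &n_dataset, sizeof(n_dataset));
    dataset.assign(n_dataset, '\0');
    in.read(&dataset[0], n_dataset);
    return !in.fail();
}
```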