@fugood/llama.node 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/LlamaContext.cpp +2 -2
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +82 -54
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +748 -754
- package/src/llama.cpp/common/common.h +49 -41
- package/src/llama.cpp/common/grammar-parser.cpp +10 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +92 -10
- package/src/llama.cpp/common/sampling.h +6 -1
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +29 -17
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
- package/src/llama.cpp/examples/server/server.cpp +33 -25
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +2 -3
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +13 -3
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3715 -2050
- package/src/llama.cpp/ggml-rpc.cpp +1155 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +119 -673
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +203 -224
- package/src/llama.cpp/ggml.c +1208 -1483
- package/src/llama.cpp/ggml.h +71 -46
- package/src/llama.cpp/llama.cpp +1374 -938
- package/src/llama.cpp/llama.h +22 -6
- package/src/llama.cpp/requirements.txt +0 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/common/common.h

@@ -27,7 +27,7 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

 #define print_build_info() do { \
-    fprintf(stderr, "%s: build = %d (%s)\n",
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)

@@ -35,14 +35,18 @@

 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const *LLAMA_COMMIT;
-extern char const *LLAMA_COMPILER;
-extern char const *LLAMA_BUILD_TARGET;
+extern char const * LLAMA_COMMIT;
+extern char const * LLAMA_COMPILER;
+extern char const * LLAMA_BUILD_TARGET;

 struct llama_control_vector_load_info;

-
-
+//
+// CPU utils
+//
+
+int32_t cpu_get_num_physical_cores();
+int32_t cpu_get_num_math();

 //
 // CLI argument parsing
@@ -51,7 +55,7 @@ int32_t get_num_physical_cores();
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

-    int32_t n_threads =
+    int32_t n_threads = cpu_get_num_math();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
@@ -82,6 +86,7 @@ struct gpt_params {
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
+    std::string rpc_servers = ""; // comma separated list of RPC servers

     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -140,6 +145,8 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+    bool special = false; // enable special token output
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
@@ -177,33 +184,34 @@ struct gpt_params {

 void gpt_params_handle_model_default(gpt_params & params);

-bool
-
-bool
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
+bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

-
+std::string gpt_params_get_system_info(const gpt_params & params);

-
-
-
+//
+// String utils
+//

-std::string
+std::vector<std::string> string_split(std::string input, char separator);

-
+std::string string_strip(const std::string & str);
+std::string string_get_sortable_timestamp();
+std::string string_random_prompt(std::mt19937 & rng);

-bool
+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+void string_process_escapes(std::string & input);

 //
-//
+// Filesystem utils
 //

-
-
-
-std::string
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+bool fs_validate_filename(const std::string & filename);
+bool fs_create_directory_with_parents(const std::string & path);
+
+std::string fs_get_cache_directory();

 //
 // Model utils
@@ -274,29 +282,15 @@ std::string llama_detokenize_bpe(
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);

-//
-// YAML utils
-//
-
-bool create_directory_with_parents(const std::string & path);
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
-std::string get_sortable_timestamp();
-
-void dump_non_result_info_yaml(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
 //
 // KV cache utils
 //

 // Dump the KV cache view with the number of sequences per cell.
-void
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
@@ -330,6 +324,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 //
 // Split utils
 //
+
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
+void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+void yaml_dump_non_result_info(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
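For context, the header above regroups the old free-standing helpers under prefixed families (`cpu_*`, `string_*`, `fs_*`, `yaml_*`). The following is a minimal sketch (not part of the package) of how a caller would use the renamed declarations; the `main()` scaffolding and example strings are illustrative assumptions only.

```cpp
// Illustrative only: exercises the renamed helpers declared in the new common.h.
#include "common.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    // old: get_sortable_timestamp() / create_directory_with_parents()
    // new: string_get_sortable_timestamp() / fs_create_directory_with_parents()
    const std::string stamp = string_get_sortable_timestamp();

    const std::string cache_dir = fs_get_cache_directory();
    if (!fs_create_directory_with_parents(cache_dir)) {
        fprintf(stderr, "failed to create %s\n", cache_dir.c_str());
        return 1;
    }

    // string utils keep their old behavior under the new string_ prefix
    std::vector<std::string> parts = string_split("a,b,c", ',');
    std::string prompt = string_strip("  hello  \n");
    string_process_escapes(prompt); // expands escape sequences such as "\\n" in place

    printf("%s: %zu parts, prompt '%s'\n", stamp.c_str(), parts.size(), prompt.c_str());
    return 0;
}
```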
package/src/llama.cpp/common/grammar-parser.cpp

@@ -26,7 +26,7 @@ namespace grammar_parser {

 static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
     uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-    auto result = state.symbol_ids.
+    auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
     return result.first->second;
 }

@@ -142,6 +142,9 @@ namespace grammar_parser {
             pos++;
             last_sym_start = out_elements.size();
             while (*pos != '"') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
             }
             last_sym_start = out_elements.size();
             while (*pos != ']') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {

                 out_elements.push_back({type, char_pair.first});
                 if (pos[0] == '-' && pos[1] != ']') {
+                    if (!pos[1]) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
                     auto endchar_pair = parse_char(pos + 1);
                     pos = endchar_pair.second;
                     out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
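The three new guards make the GBNF parser report "unexpected end of input" instead of scanning past the end of the buffer when a quoted literal or character range is left unterminated. A hedged sketch (not part of the package) of exercising that path, assuming the `grammar_parser::parse()` entry point from `common/grammar-parser.h`:

```cpp
// Sketch only: assumes grammar_parser::parse(const char *) and parse_state
// from common/grammar-parser.h; the grammar string is an illustrative example.
#include "grammar-parser.h"

#include <cstdio>
#include <exception>

int main() {
    // Unterminated quoted literal: the added checks raise
    // "unexpected end of input" inside the parser.
    const char * broken_grammar = "root ::= \"unterminated";

    try {
        grammar_parser::parse_state state = grammar_parser::parse(broken_grammar);
        // Depending on how parse() surfaces the error (it may catch internally
        // and return an empty state), also check the parsed result.
        if (state.rules.empty()) {
            fprintf(stderr, "grammar rejected: no rules parsed\n");
        }
    } catch (const std::exception & err) {
        fprintf(stderr, "grammar rejected: %s\n", err.what());
    }
    return 0;
}
```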
package/src/llama.cpp/common/json-schema-to-grammar.cpp

@@ -272,7 +272,7 @@ private:
         if (literal.empty()) {
             return false;
         }
-        ret.
+        ret.emplace_back(literal, true);
         literal.clear();
         return true;
     };
@@ -298,7 +298,7 @@ private:
     while (i < length) {
         char c = sub_pattern[i];
         if (c == '.') {
-            seq.
+            seq.emplace_back(get_dot(), false);
             i++;
         } else if (c == '(') {
             i++;
@@ -307,7 +307,7 @@ private:
                     _warnings.push_back("Unsupported pattern syntax");
                 }
             }
-            seq.
+            seq.emplace_back("(" + to_rule(transform()) + ")", false);
         } else if (c == ')') {
             i++;
             if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -331,9 +331,9 @@ private:
             }
             square_brackets += ']';
             i++;
-            seq.
+            seq.emplace_back(square_brackets, false);
         } else if (c == '|') {
-            seq.
+            seq.emplace_back("|", false);
             i++;
         } else if (c == '*' || c == '+' || c == '?') {
             seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@@ -417,7 +417,7 @@ private:
             }
         }
         if (!literal.empty()) {
-            seq.
+            seq.emplace_back(literal, true);
         }
     }
 }
package/src/llama.cpp/common/log.h

@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
     #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
     #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
-    #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+    #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
 #endif
 #else
     #define LOG_FLF_FMT "%s"
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
     #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
     #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
-    #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+    #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
 #endif
 #else
     #define LOG_TEE_FLF_FMT "%s"
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Main LOG macro.
 // behaves like printf, and supports arguments the exact same way.
 //
-#
+#if !defined(_MSC_VER) || defined(__clang__)
     #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
     #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Secondary target can be changed just like LOG_TARGET
 // by defining LOG_TEE_TARGET
 //
-#
+#if !defined(_MSC_VER) || defined(__clang__)
     #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
     #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif

 // LOG macro variants with auto endline.
-#
+#if !defined(_MSC_VER) || defined(__clang__)
     #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
     #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
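The two `(long)__LINE__` edits exist because this branch pairs `__LINE__` with a `%5ld` (long) conversion while `__LINE__` expands to an `int`; the cast keeps the format and argument matched. A tiny self-contained illustration of the same mismatch, using a hypothetical `DEMO_LOG` macro that is not part of llama.cpp:

```cpp
#include <cstdio>

// __LINE__ expands to an int, but "%5ld" consumes a long: without the explicit
// cast the fprintf call has undefined behavior where int and long differ in size.
#define DEMO_LOG(msg) \
    fprintf(stderr, "[%24s:%5ld][%24s] %s\n", __FILE__, (long)__LINE__, __func__, msg)

int main() {
    DEMO_LOG("hello from the demo logger");
    return 0;
}
```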
package/src/llama.cpp/common/sampling.cpp

@@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

     result->prev.resize(params.n_prev);

-    result->
+    result->n_valid = 0;

     llama_sampling_set_rng_seed(result, params.seed);

@@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
-    ctx->
+    ctx->n_valid = 0;
 }

 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
     std::string result = "CFG -> Penalties ";
     if (params.mirostat == 0) {
         for (auto sampler_type : params.samplers_sequence) {
-            const auto sampler_type_name =
+            const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
             if (!sampler_type_name.empty()) {
                 result += "-> " + sampler_type_name + " ";
             }
@@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
     return result;
 }

+std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
+    switch (sampler_type) {
+        case llama_sampler_type::TOP_K:       return "top_k";
+        case llama_sampler_type::TFS_Z:       return "tfs_z";
+        case llama_sampler_type::TYPICAL_P:   return "typical_p";
+        case llama_sampler_type::TOP_P:       return "top_p";
+        case llama_sampler_type::MIN_P:       return "min_p";
+        case llama_sampler_type::TEMPERATURE: return "temperature";
+        default : return "";
+    }
+}
+
+std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+        {"top_k",       llama_sampler_type::TOP_K},
+        {"top_p",       llama_sampler_type::TOP_P},
+        {"typical_p",   llama_sampler_type::TYPICAL_P},
+        {"min_p",       llama_sampler_type::MIN_P},
+        {"tfs_z",       llama_sampler_type::TFS_Z},
+        {"temperature", llama_sampler_type::TEMPERATURE}
+    };
+
+    // since samplers names are written multiple ways
+    // make it ready for both system names and input names
+    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+        {"top-k",     llama_sampler_type::TOP_K},
+        {"top-p",     llama_sampler_type::TOP_P},
+        {"nucleus",   llama_sampler_type::TOP_P},
+        {"typical-p", llama_sampler_type::TYPICAL_P},
+        {"typical",   llama_sampler_type::TYPICAL_P},
+        {"min-p",     llama_sampler_type::MIN_P},
+        {"tfs-z",     llama_sampler_type::TFS_Z},
+        {"tfs",       llama_sampler_type::TFS_Z},
+        {"temp",      llama_sampler_type::TEMPERATURE}
+    };
+
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names.size());
+    for (const auto & name : names)
+    {
+        auto sampler_item = sampler_canonical_name_map.find(name);
+        if (sampler_item != sampler_canonical_name_map.end())
+        {
+            sampler_types.push_back(sampler_item->second);
+        }
+        else
+        {
+            if (allow_alt_names)
+            {
+                sampler_item = sampler_alt_name_map.find(name);
+                if (sampler_item != sampler_alt_name_map.end())
+                {
+                    sampler_types.push_back(sampler_item->second);
+                }
+            }
+        }
+    }
+    return sampler_types;
+}
+
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
+    std::unordered_map<char, llama_sampler_type> sampler_name_map {
+        {'k', llama_sampler_type::TOP_K},
+        {'p', llama_sampler_type::TOP_P},
+        {'y', llama_sampler_type::TYPICAL_P},
+        {'m', llama_sampler_type::MIN_P},
+        {'f', llama_sampler_type::TFS_Z},
+        {'t', llama_sampler_type::TEMPERATURE}
+    };
+
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names_string.size());
+    for (const auto & c : names_string) {
+        const auto sampler_item = sampler_name_map.find(c);
+        if (sampler_item != sampler_name_map.end()) {
+            sampler_types.push_back(sampler_item->second);
+        }
+    }
+    return sampler_types;
+}
+
 // no reasons to expose this function in header
 static void sampler_queue(
                    struct llama_context * ctx_main,
@@ -179,7 +260,7 @@ static llama_token llama_sampling_sample_impl(
                   struct llama_context * ctx_main,
                   struct llama_context * ctx_cfg,
                   const int idx,
-                  bool is_resampling) {
+                  bool is_resampling) {
     const llama_sampling_params & params = ctx_sampling->params;

     const float temp = params.temp;
@@ -188,8 +269,8 @@ static llama_token llama_sampling_sample_impl(
     const float mirostat_eta = params.mirostat_eta;

     std::vector<float> original_logits;
-    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx,
-    if (!is_resampling) {
+    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
+    if (ctx_sampling->grammar != NULL && !is_resampling) {
         GGML_ASSERT(!original_logits.empty());
     }
     llama_token id = 0;
@@ -252,11 +333,11 @@ static llama_token llama_sampling_sample_impl(
             // Restore logits from the copy
             std::copy(original_logits.begin(), original_logits.end(), logits);

-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);
+            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
         }
     }

-    ctx_sampling->
+    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;

     return id;
 }
@@ -285,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
     // Get a pointer to the logits
     float * logits = llama_get_logits_ith(ctx_main, idx);

-    if (
+    if (ctx_sampling->grammar != NULL && !apply_grammar) {
+        GGML_ASSERT(original_logits != NULL);
         // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
         *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
     }
@@ -342,7 +424,7 @@ llama_token llama_sampling_sample(
         struct llama_context * ctx_cfg,
         const int idx) {
     // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
+    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
 }

 llama_token_data_array llama_sampling_prepare(
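The new `llama_sampling_types_from_names` / `llama_sampling_types_from_chars` helpers map user-facing sampler names (or the single-letter shorthand) onto `llama_sampler_type` values. A usage sketch (not part of the package) built only on the declarations added in this diff:

```cpp
// Sketch only: uses the helpers introduced above (common/sampling.h).
#include "sampling.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    // mixed canonical and alternative spellings; allow_alt_names accepts "nucleus", "temp", ...
    const std::vector<std::string> names = { "top-k", "nucleus", "min_p", "temp" };
    std::vector<llama_sampler_type> seq =
        llama_sampling_types_from_names(names, /* allow_alt_names = */ true);

    // the same sequence expressed as the single-letter shorthand "kpmt"
    std::vector<llama_sampler_type> seq2 = llama_sampling_types_from_chars("kpmt");

    for (const auto type : seq) {
        printf("%s ", llama_sampling_type_to_str(type).c_str());
    }
    printf("(%zu vs %zu samplers)\n", seq.size(), seq2.size());
    return 0;
}
```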
package/src/llama.cpp/common/sampling.h

@@ -81,7 +81,7 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token> prev;
     std::vector<llama_token_data> cur;
-    size_t
+    size_t n_valid; // Number of correct top tokens with correct probabilities.

     std::mt19937 rng;
 };
@@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
 // Print sampling order into a string
 std::string llama_sampling_order_print(const llama_sampling_params & params);

+std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
+
+std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
+
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
package/src/llama.cpp/common/train.cpp

@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {

     params.custom_n_ctx = false;

-    params.use_flash =
+    params.use_flash = false;
     params.use_checkpointing = true;

     params.sample_start = "";
@@ -1380,7 +1380,7 @@ bool consume_common_train_arg(

 void finish_processing_train_args(struct train_params_common * params) {
     if (params->escape) {
-
+        string_process_escapes(params->sample_start);
     }
 }

package/src/llama.cpp/examples/embedding/embedding.cpp

@@ -49,6 +49,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }

         float * out = output + batch.seq_id[i][0] * n_embd;
+        //TODO: I would also add a parameter here to enable normalization or not.
+        /*fprintf(stdout, "unnormalized_embedding:");
+        for (int hh = 0; hh < n_embd; hh++) {
+            fprintf(stdout, "%9.6f ", embd[hh]);
+        }
+        fprintf(stdout, "\n");*/
         llama_embd_normalize(embd, out, n_embd);
     }
 }
@@ -74,7 +80,7 @@ int main(int argc, char ** argv) {

     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
     }

     llama_backend_init();
@@ -101,7 +107,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }

     // split the prompt into lines
@@ -123,10 +129,12 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }

-    //
+    // check if the last token is SEP
+    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-
+            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }

@@ -203,6 +211,7 @@ int main(int argc, char ** argv) {

     // clean up
     llama_print_timings(ctx);
+    llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp

@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {

     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
     }

     llama_backend_init();
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }

     bool OK = run(ctx, params);
package/src/llama.cpp/examples/finetune/finetune.cpp

@@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         // not capturing these, to silcence warnings
         const int rope_mode = 0;

-        return
-            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+        return ggml_rope_ext(ctx,
+            t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
             rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
         );
     };
@@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
     struct ggml_tensor * t16;
     if (enable_flash_attn) {
-
+        GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
+        //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
     } else {
         struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
         struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -598,7 +598,7 @@ int main(int argc, char ** argv) {

     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
     }

     sparams.dataset = params.prompt_file;
@@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }

     bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);