@fugood/llama.node 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/CMakeLists.txt +72 -46
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +732 -752
- package/src/llama.cpp/common/common.h +47 -41
- package/src/llama.cpp/common/grammar-parser.cpp +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +89 -7
- package/src/llama.cpp/common/sampling.h +5 -0
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +24 -16
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
- package/src/llama.cpp/examples/server/server.cpp +21 -9
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +0 -1
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +4 -0
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3700 -2041
- package/src/llama.cpp/ggml-rpc.cpp +188 -56
- package/src/llama.cpp/ggml-sycl.cpp +99 -530
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +202 -225
- package/src/llama.cpp/ggml.c +1034 -1154
- package/src/llama.cpp/ggml.h +59 -31
- package/src/llama.cpp/llama.cpp +859 -609
- package/src/llama.cpp/llama.h +19 -6
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
```diff
@@ -27,7 +27,7 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
 
 #define print_build_info() do { \
-    fprintf(stderr, "%s: build = %d (%s)\n",
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)
 
@@ -35,14 +35,18 @@
 
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const *LLAMA_COMMIT;
-extern char const *LLAMA_COMPILER;
-extern char const *LLAMA_BUILD_TARGET;
+extern char const * LLAMA_COMMIT;
+extern char const * LLAMA_COMPILER;
+extern char const * LLAMA_BUILD_TARGET;
 
 struct llama_control_vector_load_info;
 
-
-
+//
+// CPU utils
+//
+
+int32_t cpu_get_num_physical_cores();
+int32_t cpu_get_num_math();
 
 //
 // CLI argument parsing
@@ -51,7 +55,7 @@ int32_t get_num_physical_cores();
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
-    int32_t n_threads =
+    int32_t n_threads = cpu_get_num_math();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
@@ -142,6 +146,7 @@ struct gpt_params {
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
     bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+    bool special = false; // enable special token output
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
@@ -179,33 +184,34 @@ struct gpt_params {
 
 void gpt_params_handle_model_default(gpt_params & params);
 
-bool
-
-bool
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
+bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
-
+std::string gpt_params_get_system_info(const gpt_params & params);
 
-
-
-
+//
+// String utils
+//
 
-std::string
+std::vector<std::string> string_split(std::string input, char separator);
 
-
+std::string string_strip(const std::string & str);
+std::string string_get_sortable_timestamp();
+std::string string_random_prompt(std::mt19937 & rng);
 
-bool
+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+void string_process_escapes(std::string & input);
 
 //
-//
+// Filesystem utils
 //
 
-
-
-
-std::string
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+bool fs_validate_filename(const std::string & filename);
+bool fs_create_directory_with_parents(const std::string & path);
+
+std::string fs_get_cache_directory();
 
 //
 // Model utils
@@ -276,29 +282,15 @@ std::string llama_detokenize_bpe(
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);
 
-//
-// YAML utils
-//
-
-bool create_directory_with_parents(const std::string & path);
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
-std::string get_sortable_timestamp();
-
-void dump_non_result_info_yaml(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
 //
 // KV cache utils
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
 //
 // Embedding utils
@@ -332,6 +324,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 //
 // Split utils
 //
+
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
+void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+void yaml_dump_non_result_info(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
```
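The common.h hunks above are mostly a renaming of helpers into prefixed groups (cpu_*, string_*, fs_*, yaml_*) with their signatures unchanged. A minimal usage sketch against the declarations shown above, assuming a translation unit compiled and linked against the llama.cpp "common" library bundled in this package (the file name and the literal values are illustrative, not part of the diff):

```cpp
// consumer.cpp -- illustrative only; builds against llama.cpp's common.h as shipped in this package
#include "common.h"

#include <cstdio>
#include <string>

int main() {
    // thread-count helpers now live under the cpu_ prefix
    const int32_t n_math = cpu_get_num_math();
    std::printf("math threads: %d (of %d physical cores)\n", n_math, cpu_get_num_physical_cores());

    // string helpers now live under the string_ prefix
    std::string prompt = "Hello\\nWorld";
    string_process_escapes(prompt);                      // turns the literal "\n" into a real newline, in place
    for (const auto & part : string_split(prompt, '\n')) {
        std::printf("line: %s\n", string_strip(part).c_str());
    }

    // filesystem helpers now live under the fs_ prefix
    const std::string logdir = "logs/" + string_get_sortable_timestamp();
    if (!fs_create_directory_with_parents(logdir)) {
        std::fprintf(stderr, "could not create %s\n", logdir.c_str());
    }
    return 0;
}
```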
```diff
@@ -26,7 +26,7 @@ namespace grammar_parser {
 
     static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
         uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.
+        auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
         return result.first->second;
     }
 
@@ -272,7 +272,7 @@ private:
             if (literal.empty()) {
                 return false;
             }
-            ret.
+            ret.emplace_back(literal, true);
             literal.clear();
             return true;
         };
@@ -298,7 +298,7 @@ private:
         while (i < length) {
            char c = sub_pattern[i];
            if (c == '.') {
-               seq.
+               seq.emplace_back(get_dot(), false);
                i++;
            } else if (c == '(') {
                i++;
@@ -307,7 +307,7 @@ private:
                        _warnings.push_back("Unsupported pattern syntax");
                    }
                }
-               seq.
+               seq.emplace_back("(" + to_rule(transform()) + ")", false);
            } else if (c == ')') {
                i++;
                if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -331,9 +331,9 @@ private:
                }
                square_brackets += ']';
                i++;
-               seq.
+               seq.emplace_back(square_brackets, false);
            } else if (c == '|') {
-               seq.
+               seq.emplace_back("|", false);
                i++;
            } else if (c == '*' || c == '+' || c == '?') {
                seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@@ -417,7 +417,7 @@ private:
                }
            }
            if (!literal.empty()) {
-               seq.
+               seq.emplace_back(literal, true);
            }
        }
    }
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
 #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
-#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
 #endif
 #else
 #define LOG_FLF_FMT "%s"
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
 #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
-#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
 #endif
 #else
 #define LOG_TEE_FLF_FMT "%s"
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Main LOG macro.
 // behaves like printf, and supports arguments the exact same way.
 //
-#
+#if !defined(_MSC_VER) || defined(__clang__)
 #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
 #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Secondary target can be changed just like LOG_TARGET
 // by defining LOG_TEE_TARGET
 //
-#
+#if !defined(_MSC_VER) || defined(__clang__)
 #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
 #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif
 
 // LOG macro variants with auto endline.
-#
+#if !defined(_MSC_VER) || defined(__clang__)
 #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
 #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
```
```diff
@@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
     std::string result = "CFG -> Penalties ";
     if (params.mirostat == 0) {
         for (auto sampler_type : params.samplers_sequence) {
-            const auto sampler_type_name =
+            const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
             if (!sampler_type_name.empty()) {
                 result += "-> " + sampler_type_name + " ";
             }
@@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
     return result;
 }
 
+std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
+    switch (sampler_type) {
+        case llama_sampler_type::TOP_K: return "top_k";
+        case llama_sampler_type::TFS_Z: return "tfs_z";
+        case llama_sampler_type::TYPICAL_P: return "typical_p";
+        case llama_sampler_type::TOP_P: return "top_p";
+        case llama_sampler_type::MIN_P: return "min_p";
+        case llama_sampler_type::TEMPERATURE: return "temperature";
+        default : return "";
+    }
+}
+
+std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+        {"top_k", llama_sampler_type::TOP_K},
+        {"top_p", llama_sampler_type::TOP_P},
+        {"typical_p", llama_sampler_type::TYPICAL_P},
+        {"min_p", llama_sampler_type::MIN_P},
+        {"tfs_z", llama_sampler_type::TFS_Z},
+        {"temperature", llama_sampler_type::TEMPERATURE}
+    };
+
+    // since samplers names are written multiple ways
+    // make it ready for both system names and input names
+    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+        {"top-k", llama_sampler_type::TOP_K},
+        {"top-p", llama_sampler_type::TOP_P},
+        {"nucleus", llama_sampler_type::TOP_P},
+        {"typical-p", llama_sampler_type::TYPICAL_P},
+        {"typical", llama_sampler_type::TYPICAL_P},
+        {"min-p", llama_sampler_type::MIN_P},
+        {"tfs-z", llama_sampler_type::TFS_Z},
+        {"tfs", llama_sampler_type::TFS_Z},
+        {"temp", llama_sampler_type::TEMPERATURE}
+    };
+
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names.size());
+    for (const auto & name : names)
+    {
+        auto sampler_item = sampler_canonical_name_map.find(name);
+        if (sampler_item != sampler_canonical_name_map.end())
+        {
+            sampler_types.push_back(sampler_item->second);
+        }
+        else
+        {
+            if (allow_alt_names)
+            {
+                sampler_item = sampler_alt_name_map.find(name);
+                if (sampler_item != sampler_alt_name_map.end())
+                {
+                    sampler_types.push_back(sampler_item->second);
+                }
+            }
+        }
+    }
+    return sampler_types;
+}
+
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
+    std::unordered_map<char, llama_sampler_type> sampler_name_map {
+        {'k', llama_sampler_type::TOP_K},
+        {'p', llama_sampler_type::TOP_P},
+        {'y', llama_sampler_type::TYPICAL_P},
+        {'m', llama_sampler_type::MIN_P},
+        {'f', llama_sampler_type::TFS_Z},
+        {'t', llama_sampler_type::TEMPERATURE}
+    };
+
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names_string.size());
+    for (const auto & c : names_string) {
+        const auto sampler_item = sampler_name_map.find(c);
+        if (sampler_item != sampler_name_map.end()) {
+            sampler_types.push_back(sampler_item->second);
+        }
+    }
+    return sampler_types;
+}
+
 // no reasons to expose this function in header
 static void sampler_queue(
           struct llama_context * ctx_main,
@@ -179,7 +260,7 @@ static llama_token llama_sampling_sample_impl(
           struct llama_context * ctx_main,
           struct llama_context * ctx_cfg,
           const int idx,
-          bool is_resampling) {
+          bool is_resampling) {
     const llama_sampling_params & params = ctx_sampling->params;
 
     const float temp = params.temp;
@@ -188,8 +269,8 @@ static llama_token llama_sampling_sample_impl(
     const float mirostat_eta = params.mirostat_eta;
 
     std::vector<float> original_logits;
-    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx,
-    if (!is_resampling) {
+    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
+    if (ctx_sampling->grammar != NULL && !is_resampling) {
         GGML_ASSERT(!original_logits.empty());
     }
     llama_token id = 0;
@@ -252,7 +333,7 @@ static llama_token llama_sampling_sample_impl(
             // Restore logits from the copy
             std::copy(original_logits.begin(), original_logits.end(), logits);
 
-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);
+            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
         }
     }
 
@@ -285,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
     // Get a pointer to the logits
     float * logits = llama_get_logits_ith(ctx_main, idx);
 
-    if (
+    if (ctx_sampling->grammar != NULL && !apply_grammar) {
+        GGML_ASSERT(original_logits != NULL);
         // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
         *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
     }
@@ -342,7 +424,7 @@ llama_token llama_sampling_sample(
           struct llama_context * ctx_cfg,
           const int idx) {
     // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
+    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
 }
 
 llama_token_data_array llama_sampling_prepare(
@@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
 // Print sampling order into a string
 std::string llama_sampling_order_print(const llama_sampling_params & params);
 
+std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
+
+std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
+
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
```
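The sampler-name helpers added above map user-facing sampler names (long form, alternate spellings, or single characters) onto llama_sampler_type values. A minimal usage sketch, assuming a translation unit compiled against the package's common/sampling.h; the file name and the example name lists are illustrative, not part of the diff:

```cpp
// sampler_names.cpp -- illustrative only; builds against common/sampling.h as shipped in this package
#include "sampling.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    // long-form names, with the alternate spellings ("nucleus", "temp", ...) accepted
    const std::vector<std::string> names = {"top-k", "nucleus", "temp"};
    const std::vector<llama_sampler_type> seq = llama_sampling_types_from_names(names, /* allow_alt_names= */ true);

    // the same kind of sequence can be given as single characters, e.g. "kpt"
    const std::vector<llama_sampler_type> seq_chars = llama_sampling_types_from_chars("kpt");

    for (const auto type : seq) {
        std::printf("sampler: %s\n", llama_sampling_type_to_str(type).c_str());
    }
    std::printf("parsed %zu samplers from names, %zu from chars\n", seq.size(), seq_chars.size());
    return 0;
}
```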
```diff
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
 
     params.custom_n_ctx = false;
 
-    params.use_flash =
+    params.use_flash = false;
     params.use_checkpointing = true;
 
     params.sample_start = "";
@@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
 
 void finish_processing_train_args(struct train_params_common * params) {
     if (params->escape) {
-
+        string_process_escapes(params->sample_start);
     }
 }
 
@@ -80,7 +80,7 @@ int main(int argc, char ** argv) {
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
     }
 
     llama_backend_init();
@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     // split the prompt into lines
@@ -211,6 +211,7 @@ int main(int argc, char ** argv) {
 
     // clean up
     llama_print_timings(ctx);
+    llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();
@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
     }
 
     llama_backend_init();
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     bool OK = run(ctx, params);
@@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         // not capturing these, to silcence warnings
         const int rope_mode = 0;
 
-        return
-            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+        return ggml_rope_ext(ctx,
+            t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
             rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
         );
     };
@@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
     struct ggml_tensor * t16;
     if (enable_flash_attn) {
-
+        GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
+        //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
     } else {
         struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
         struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
@@ -598,7 +598,7 @@ int main(int argc, char ** argv) {
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
     }
 
     sparams.dataset = params.prompt_file;
@@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
@@ -50,9 +50,9 @@ static void write_logfile(
         return;
     }
 
-    const std::string timestamp =
+    const std::string timestamp = string_get_sortable_timestamp();
 
-    const bool success =
+    const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
         fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
             __func__, params.logdir.c_str());
@@ -70,7 +70,7 @@ static void write_logfile(
     fprintf(logfile, "binary: infill\n");
     char model_desc[128];
     llama_model_desc(model, model_desc, sizeof(model_desc));
-
+    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
 
     fprintf(logfile, "\n");
     fprintf(logfile, "######################\n");
@@ -78,8 +78,8 @@ static void write_logfile(
     fprintf(logfile, "######################\n");
     fprintf(logfile, "\n");
 
-
-
+    yaml_dump_string_multiline(logfile, "output", output.c_str());
+    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
     llama_dump_timing_info_yaml(logfile, ctx);
     fclose(logfile);
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
        LOG_TEE("\n");
-       LOG_TEE("%s\n",
+       LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }
    const bool add_bos = llama_should_add_bos_token(model);
    GGML_ASSERT(llama_add_eos_token(model) != 1);
@@ -621,8 +621,8 @@ int main(int argc, char ** argv) {
 
    if (params.escape) {
        //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
-
-
+       string_process_escapes(params.input_prefix);
+       string_process_escapes(params.input_suffix);
    }
    suff_rm_leading_spc = params.escape;
    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
@@ -195,12 +195,12 @@ static const cmd_params cmd_params_defaults = {
     /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
     /* n_gen */ {128},
-    /* n_pg */ {
+    /* n_pg */ {},
     /* n_batch */ {2048},
     /* n_ubatch */ {512},
     /* type_k */ {GGML_TYPE_F16},
     /* type_v */ {GGML_TYPE_F16},
-    /* n_threads */ {
+    /* n_threads */ {cpu_get_num_math()},
     /* n_gpu_layers */ {99},
     /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
     /* main_gpu */ {0},
@@ -12,15 +12,20 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
 
-
-
-
-
-
-
+## Fetch latest llama.cpp from GitHub
+#include(FetchContent)
+#FetchContent_Declare(
+# llama
+# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+# GIT_TAG master
+#)
+#
+## Also provides "common"
+#FetchContent_MakeAvailable(llama)
 
-#
-
+# llama.cpp CI uses the code from the current branch
+# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
+add_subdirectory(../../../../../../ build-llama)
 
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
 
-/** preprocess img and store the result in res_imgs, pad_to_square may be
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
 CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
 
 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS
 
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-
+        gpt_params_print_usage(argc, argv, params);
         show_additional_info(argc, argv);
         return 1;
     }
```