@fugood/llama.node 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/bin/darwin/arm64/default.metallib +0 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/default.metallib +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/LlamaContext.cpp +2 -2
  19. package/src/LoadSessionWorker.cpp +1 -0
  20. package/src/llama.cpp/CMakeLists.txt +72 -46
  21. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  22. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  23. package/src/llama.cpp/common/common.cpp +732 -752
  24. package/src/llama.cpp/common/common.h +47 -41
  25. package/src/llama.cpp/common/grammar-parser.cpp +1 -1
  26. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  27. package/src/llama.cpp/common/log.h +5 -5
  28. package/src/llama.cpp/common/sampling.cpp +89 -7
  29. package/src/llama.cpp/common/sampling.h +5 -0
  30. package/src/llama.cpp/common/train.cpp +2 -2
  31. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  32. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  33. package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  36. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  37. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  39. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
  40. package/src/llama.cpp/examples/llava/clip.h +1 -1
  41. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  42. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  43. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  44. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  45. package/src/llama.cpp/examples/main/main.cpp +24 -16
  46. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  47. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  48. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  49. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  50. package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
  51. package/src/llama.cpp/examples/server/server.cpp +21 -9
  52. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  53. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  54. package/src/llama.cpp/ggml-backend.c +0 -1
  55. package/src/llama.cpp/ggml-common.h +0 -54
  56. package/src/llama.cpp/ggml-cuda.h +1 -0
  57. package/src/llama.cpp/ggml-impl.h +51 -0
  58. package/src/llama.cpp/ggml-kompute.cpp +4 -0
  59. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  60. package/src/llama.cpp/ggml-quants.c +3700 -2041
  61. package/src/llama.cpp/ggml-rpc.cpp +188 -56
  62. package/src/llama.cpp/ggml-sycl.cpp +99 -530
  63. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  64. package/src/llama.cpp/ggml-vulkan.cpp +202 -225
  65. package/src/llama.cpp/ggml.c +1034 -1154
  66. package/src/llama.cpp/ggml.h +59 -31
  67. package/src/llama.cpp/llama.cpp +859 -609
  68. package/src/llama.cpp/llama.h +19 -6
  69. package/src/llama.cpp/requirements.txt +0 -1
  70. package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
  71. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  72. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  73. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  74. package/src/llama.cpp/unicode-data.h +15 -12
  75. package/src/llama.cpp/unicode.cpp +89 -111
  76. package/src/llama.cpp/unicode.h +44 -12
  77. package/src/llama.cpp/build.zig +0 -172
  78. package/src/llama.cpp/ggml-mpi.c +0 -216
  79. package/src/llama.cpp/ggml-mpi.h +0 -39
  80. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
@@ -27,7 +27,7 @@
  #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

  #define print_build_info() do { \
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
  fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
  } while(0)

@@ -35,14 +35,18 @@

  // build info
  extern int LLAMA_BUILD_NUMBER;
- extern char const *LLAMA_COMMIT;
- extern char const *LLAMA_COMPILER;
- extern char const *LLAMA_BUILD_TARGET;
+ extern char const * LLAMA_COMMIT;
+ extern char const * LLAMA_COMPILER;
+ extern char const * LLAMA_BUILD_TARGET;

  struct llama_control_vector_load_info;

- int get_math_cpu_count();
- int32_t get_num_physical_cores();
+ //
+ // CPU utils
+ //
+
+ int32_t cpu_get_num_physical_cores();
+ int32_t cpu_get_num_math();

  //
  // CLI argument parsing
@@ -51,7 +55,7 @@ int32_t get_num_physical_cores();
  struct gpt_params {
  uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

- int32_t n_threads = get_math_cpu_count();
+ int32_t n_threads = cpu_get_num_math();
  int32_t n_threads_draft = -1;
  int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
  int32_t n_threads_batch_draft = -1;
@@ -142,6 +146,7 @@ struct gpt_params {
  bool use_color = false; // use color to distinguish generations and inputs
  bool interactive = false; // interactive mode
  bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+ bool special = false; // enable special token output
  bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
  bool chatml = false; // chatml mode (used for models trained on chatml syntax)
  bool prompt_cache_all = false; // save user input and generations to prompt cache
@@ -179,33 +184,34 @@ struct gpt_params {

  void gpt_params_handle_model_default(gpt_params & params);

- bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
-
- bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
-
- bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+ bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
+ bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
+ bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

- void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+ std::string gpt_params_get_system_info(const gpt_params & params);

- bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-
- std::string get_system_info(const gpt_params & params);
+ //
+ // String utils
+ //

- std::string gpt_random_prompt(std::mt19937 & rng);
+ std::vector<std::string> string_split(std::string input, char separator);

- void process_escapes(std::string& input);
+ std::string string_strip(const std::string & str);
+ std::string string_get_sortable_timestamp();
+ std::string string_random_prompt(std::mt19937 & rng);

- bool validate_file_name(const std::string & filename);
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+ void string_process_escapes(std::string & input);

  //
- // String utils
+ // Filesystem utils
  //

- std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
- std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
- std::vector<std::string> string_split(std::string input, char separator);
- std::string string_strip(const std::string & str);
- std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+ bool fs_validate_filename(const std::string & filename);
+ bool fs_create_directory_with_parents(const std::string & path);
+
+ std::string fs_get_cache_directory();

  //
  // Model utils
@@ -276,29 +282,15 @@ std::string llama_detokenize_bpe(
  // defaults to true when model type is SPM, otherwise false.
  bool llama_should_add_bos_token(const llama_model * model);

- //
- // YAML utils
- //
-
- bool create_directory_with_parents(const std::string & path);
- void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
- void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
- void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
- std::string get_sortable_timestamp();
-
- void dump_non_result_info_yaml(
- FILE * stream, const gpt_params & params, const llama_context * lctx,
- const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
  //
  // KV cache utils
  //

  // Dump the KV cache view with the number of sequences per cell.
- void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

  // Dump the KV cache view showing individual sequences in each cell (long output).
- void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

  //
  // Embedding utils
@@ -332,6 +324,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
  //
  // Split utils
  //
+
  static const char * const LLM_KV_SPLIT_NO = "split.no";
  static const char * const LLM_KV_SPLIT_COUNT = "split.count";
  static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ //
+ // YAML utils
+ //
+
+ void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
+ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
+ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+ void yaml_dump_non_result_info(
+ FILE * stream, const gpt_params & params, const llama_context * lctx,
+ const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
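
The hunk above groups the common.h helpers into prefixed families (cpu_*, string_*, fs_*, yaml_dump_*, llama_kv_cache_dump_*). For a downstream consumer such as this package's own sources, the mapping is mechanical. A minimal, hypothetical C++ sketch of the new call sites, based only on the signatures shown in this diff (the function use_renamed_helpers and its variable names are illustrative, not part of the package):

    #include <cstdio>
    #include <string>
    #include "common.h"   // declares the renamed helpers shown above

    // Hypothetical downstream call sites after the rename (old names noted in comments).
    static void use_renamed_helpers(gpt_params & params, std::string & user_input) {
        params.n_threads = cpu_get_num_math();                                // was get_math_cpu_count()
        string_process_escapes(user_input);                                   // was process_escapes()
        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());  // was get_system_info()
        if (!fs_create_directory_with_parents(params.logdir)) {               // was create_directory_with_parents()
            fprintf(stderr, "failed to create logdir %s\n", params.logdir.c_str());
        }
    }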
@@ -26,7 +26,7 @@ namespace grammar_parser {

  static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
  uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
- auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
+ auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
  return result.first->second;
  }

@@ -272,7 +272,7 @@ private:
  if (literal.empty()) {
  return false;
  }
- ret.push_back(std::make_pair(literal, true));
+ ret.emplace_back(literal, true);
  literal.clear();
  return true;
  };
@@ -298,7 +298,7 @@ private:
  while (i < length) {
  char c = sub_pattern[i];
  if (c == '.') {
- seq.push_back(std::make_pair(get_dot(), false));
+ seq.emplace_back(get_dot(), false);
  i++;
  } else if (c == '(') {
  i++;
@@ -307,7 +307,7 @@ private:
  _warnings.push_back("Unsupported pattern syntax");
  }
  }
- seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
+ seq.emplace_back("(" + to_rule(transform()) + ")", false);
  } else if (c == ')') {
  i++;
  if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -331,9 +331,9 @@ private:
  }
  square_brackets += ']';
  i++;
- seq.push_back(std::make_pair(square_brackets, false));
+ seq.emplace_back(square_brackets, false);
  } else if (c == '|') {
- seq.push_back(std::make_pair("|", false));
+ seq.emplace_back("|", false);
  i++;
  } else if (c == '*' || c == '+' || c == '?') {
  seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@@ -417,7 +417,7 @@ private:
  }
  }
  if (!literal.empty()) {
- seq.push_back(std::make_pair(literal, true));
+ seq.emplace_back(literal, true);
  }
  }
  }
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
  #else
  #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+ #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
  #endif
  #else
  #define LOG_FLF_FMT "%s"
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
  #else
  #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+ #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
  #endif
  #else
  #define LOG_TEE_FLF_FMT "%s"
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  // Main LOG macro.
  // behaves like printf, and supports arguments the exact same way.
  //
- #ifndef _MSC_VER
+ #if !defined(_MSC_VER) || defined(__clang__)
  #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
  #else
  #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  // Secondary target can be changed just like LOG_TARGET
  // by defining LOG_TEE_TARGET
  //
- #ifndef _MSC_VER
+ #if !defined(_MSC_VER) || defined(__clang__)
  #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
  #else
  #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
  #endif

  // LOG macro variants with auto endline.
- #ifndef _MSC_VER
+ #if !defined(_MSC_VER) || defined(__clang__)
  #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
  #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
  #else
@@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
  std::string result = "CFG -> Penalties ";
  if (params.mirostat == 0) {
  for (auto sampler_type : params.samplers_sequence) {
- const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
+ const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
  if (!sampler_type_name.empty()) {
  result += "-> " + sampler_type_name + " ";
  }
@@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
  return result;
  }

+ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
+ switch (sampler_type) {
+ case llama_sampler_type::TOP_K: return "top_k";
+ case llama_sampler_type::TFS_Z: return "tfs_z";
+ case llama_sampler_type::TYPICAL_P: return "typical_p";
+ case llama_sampler_type::TOP_P: return "top_p";
+ case llama_sampler_type::MIN_P: return "min_p";
+ case llama_sampler_type::TEMPERATURE: return "temperature";
+ default : return "";
+ }
+ }
+
+ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+ std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+ {"top_k", llama_sampler_type::TOP_K},
+ {"top_p", llama_sampler_type::TOP_P},
+ {"typical_p", llama_sampler_type::TYPICAL_P},
+ {"min_p", llama_sampler_type::MIN_P},
+ {"tfs_z", llama_sampler_type::TFS_Z},
+ {"temperature", llama_sampler_type::TEMPERATURE}
+ };
+
+ // since samplers names are written multiple ways
+ // make it ready for both system names and input names
+ std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+ {"top-k", llama_sampler_type::TOP_K},
+ {"top-p", llama_sampler_type::TOP_P},
+ {"nucleus", llama_sampler_type::TOP_P},
+ {"typical-p", llama_sampler_type::TYPICAL_P},
+ {"typical", llama_sampler_type::TYPICAL_P},
+ {"min-p", llama_sampler_type::MIN_P},
+ {"tfs-z", llama_sampler_type::TFS_Z},
+ {"tfs", llama_sampler_type::TFS_Z},
+ {"temp", llama_sampler_type::TEMPERATURE}
+ };
+
+ std::vector<llama_sampler_type> sampler_types;
+ sampler_types.reserve(names.size());
+ for (const auto & name : names)
+ {
+ auto sampler_item = sampler_canonical_name_map.find(name);
+ if (sampler_item != sampler_canonical_name_map.end())
+ {
+ sampler_types.push_back(sampler_item->second);
+ }
+ else
+ {
+ if (allow_alt_names)
+ {
+ sampler_item = sampler_alt_name_map.find(name);
+ if (sampler_item != sampler_alt_name_map.end())
+ {
+ sampler_types.push_back(sampler_item->second);
+ }
+ }
+ }
+ }
+ return sampler_types;
+ }
+
+ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
+ std::unordered_map<char, llama_sampler_type> sampler_name_map {
+ {'k', llama_sampler_type::TOP_K},
+ {'p', llama_sampler_type::TOP_P},
+ {'y', llama_sampler_type::TYPICAL_P},
+ {'m', llama_sampler_type::MIN_P},
+ {'f', llama_sampler_type::TFS_Z},
+ {'t', llama_sampler_type::TEMPERATURE}
+ };
+
+ std::vector<llama_sampler_type> sampler_types;
+ sampler_types.reserve(names_string.size());
+ for (const auto & c : names_string) {
+ const auto sampler_item = sampler_name_map.find(c);
+ if (sampler_item != sampler_name_map.end()) {
+ sampler_types.push_back(sampler_item->second);
+ }
+ }
+ return sampler_types;
+ }
+
  // no reasons to expose this function in header
  static void sampler_queue(
  struct llama_context * ctx_main,
@@ -179,7 +260,7 @@ static llama_token llama_sampling_sample_impl(
  struct llama_context * ctx_main,
  struct llama_context * ctx_cfg,
  const int idx,
- bool is_resampling) { // Add a parameter to indicate if we are resampling
+ bool is_resampling) {
  const llama_sampling_params & params = ctx_sampling->params;

  const float temp = params.temp;
@@ -188,8 +269,8 @@ static llama_token llama_sampling_sample_impl(
  const float mirostat_eta = params.mirostat_eta;

  std::vector<float> original_logits;
- auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
- if (!is_resampling) {
+ auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
+ if (ctx_sampling->grammar != NULL && !is_resampling) {
  GGML_ASSERT(!original_logits.empty());
  }
  llama_token id = 0;
@@ -252,7 +333,7 @@ static llama_token llama_sampling_sample_impl(
  // Restore logits from the copy
  std::copy(original_logits.begin(), original_logits.end(), logits);

- return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
+ return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
  }
  }

@@ -285,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
  // Get a pointer to the logits
  float * logits = llama_get_logits_ith(ctx_main, idx);

- if (apply_grammar && original_logits != NULL) {
+ if (ctx_sampling->grammar != NULL && !apply_grammar) {
+ GGML_ASSERT(original_logits != NULL);
  // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
  *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
  }
@@ -342,7 +424,7 @@ llama_token llama_sampling_sample(
  struct llama_context * ctx_cfg,
  const int idx) {
  // Call the implementation function with is_resampling set to false by default
- return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
+ return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
  }

  llama_token_data_array llama_sampling_prepare(
@@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
  // Print sampling order into a string
  std::string llama_sampling_order_print(const llama_sampling_params & params);

+ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
+
+ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
+
  // this is a common sampling function used across the examples for convenience
  // it can serve as a starting point for implementing your own sampling function
  // Note: When using multiple sequences, it is the caller's responsibility to call
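
These declarations pair with the renamed implementations added in common/sampling.cpp above. A short hypothetical usage sketch, based only on the signatures and name/character maps visible in this diff (the function pick_samplers is illustrative; samplers_sequence is the llama_sampling_params member used in the llama_sampling_order_print hunk):

    #include <cstdio>
    #include "sampling.h"

    // Hypothetical: configure the sampler order by name or by single-character shorthand.
    static void pick_samplers(llama_sampling_params & sparams) {
        // canonical names always resolve; alternative spellings resolve when allow_alt_names is true
        sparams.samplers_sequence = llama_sampling_types_from_names({"top-k", "min_p", "temp"}, /* allow_alt_names= */ true);

        // shorthand characters per the map in the diff: k=top_k, f=tfs_z, y=typical_p, p=top_p, m=min_p, t=temperature
        sparams.samplers_sequence = llama_sampling_types_from_chars("kfypmt");

        for (const auto st : sparams.samplers_sequence) {
            fprintf(stderr, "%s ", llama_sampling_type_to_str(st).c_str());
        }
    }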
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {

  params.custom_n_ctx = false;

- params.use_flash = true;
+ params.use_flash = false;
  params.use_checkpointing = true;

  params.sample_start = "";
@@ -1380,7 +1380,7 @@ bool consume_common_train_arg(

  void finish_processing_train_args(struct train_params_common * params) {
  if (params->escape) {
- process_escapes(params->sample_start);
+ string_process_escapes(params->sample_start);
  }
  }

@@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
  params.prompt = "Hello my name is";
  }

- process_escapes(params.prompt);
+ string_process_escapes(params.prompt);

  // init LLM

@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {

  params.samples_start_after_nl = false;
  params.use_adam = true;
- params.use_flash = true;
+ params.use_flash = false;
  params.use_scratch = true;

  // only adam
@@ -80,7 +80,7 @@ int main(int argc, char ** argv) {

  std::mt19937 rng(params.seed);
  if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
  }

  llama_backend_init();
@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
  }

  // split the prompt into lines
@@ -211,6 +211,7 @@ int main(int argc, char ** argv) {

  // clean up
  llama_print_timings(ctx);
+ llama_batch_free(batch);
  llama_free(ctx);
  llama_free_model(model);
  llama_backend_free();
@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {

  std::mt19937 rng(params.seed);
  if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
  }

  llama_backend_init();
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
  }

  bool OK = run(ctx, params);
@@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
  // not capturing these, to silcence warnings
  const int rope_mode = 0;

- return ggml_rope_custom(ctx,
- t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+ return ggml_rope_ext(ctx,
+ t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
  rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
  );
  };
@@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
  struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
  struct ggml_tensor * t16;
  if (enable_flash_attn) {
- t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
+ GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
+ //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
  } else {
  struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
  struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
@@ -598,7 +598,7 @@ int main(int argc, char ** argv) {

  std::mt19937 rng(params.seed);
  if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
  }

  sparams.dataset = params.prompt_file;
@@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
  }

  bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
@@ -50,9 +50,9 @@ static void write_logfile(
  return;
  }

- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();

- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
  if (!success) {
  fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
  __func__, params.logdir.c_str());
@@ -70,7 +70,7 @@ static void write_logfile(
  fprintf(logfile, "binary: infill\n");
  char model_desc[128];
  llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);

  fprintf(logfile, "\n");
  fprintf(logfile, "######################\n");
@@ -78,8 +78,8 @@ static void write_logfile(
  fprintf(logfile, "######################\n");
  fprintf(logfile, "\n");

- dump_string_yaml_multiline(logfile, "output", output.c_str());
- dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+ yaml_dump_string_multiline(logfile, "output", output.c_str());
+ yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

  llama_dump_timing_info_yaml(logfile, ctx);
  fclose(logfile);
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  LOG_TEE("\n");
- LOG_TEE("%s\n", get_system_info(params).c_str());
+ LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
  }
  const bool add_bos = llama_should_add_bos_token(model);
  GGML_ASSERT(llama_add_eos_token(model) != 1);
@@ -621,8 +621,8 @@ int main(int argc, char ** argv) {

  if (params.escape) {
  //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
- process_escapes(params.input_prefix);
- process_escapes(params.input_suffix);
+ string_process_escapes(params.input_prefix);
+ string_process_escapes(params.input_suffix);
  }
  suff_rm_leading_spc = params.escape;
  if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
@@ -195,12 +195,12 @@ static const cmd_params cmd_params_defaults = {
  /* model */ {"models/7B/ggml-model-q4_0.gguf"},
  /* n_prompt */ {512},
  /* n_gen */ {128},
- /* n_pg */ {{512, 128}},
+ /* n_pg */ {},
  /* n_batch */ {2048},
  /* n_ubatch */ {512},
  /* type_k */ {GGML_TYPE_F16},
  /* type_v */ {GGML_TYPE_F16},
- /* n_threads */ {get_math_cpu_count()},
+ /* n_threads */ {cpu_get_num_math()},
  /* n_gpu_layers */ {99},
  /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
  /* main_gpu */ {0},
@@ -12,15 +12,20 @@ cmake_minimum_required(VERSION 3.22.1)
  # build script scope).
  project("llama-android")

- include(FetchContent)
- FetchContent_Declare(
- llama
- GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
- GIT_TAG master
- )
+ ## Fetch latest llama.cpp from GitHub
+ #include(FetchContent)
+ #FetchContent_Declare(
+ # llama
+ # GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+ # GIT_TAG master
+ #)
+ #
+ ## Also provides "common"
+ #FetchContent_MakeAvailable(llama)

- # Also provides "common"
- FetchContent_MakeAvailable(llama)
+ # llama.cpp CI uses the code from the current branch
+ # ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
+ add_subdirectory(../../../../../../ build-llama)

  # Creates and names a library, sets it as either STATIC
  # or SHARED, and provides the relative paths to its source code.
@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
  /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
  CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

- /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
+ /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
  CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

  CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
  #endif // LOG_DISABLE_LOGS

  if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
- gpt_print_usage(argc, argv, params);
+ gpt_params_print_usage(argc, argv, params);
  show_additional_info(argc, argv);
  return 1;
  }