@fugood/llama.node 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/LlamaContext.cpp +2 -2
  23. package/src/TokenizeWorker.cpp +1 -1
  24. package/src/llama.cpp/CMakeLists.txt +82 -54
  25. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  26. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  27. package/src/llama.cpp/common/common.cpp +748 -754
  28. package/src/llama.cpp/common/common.h +49 -41
  29. package/src/llama.cpp/common/grammar-parser.cpp +10 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  31. package/src/llama.cpp/common/log.h +5 -5
  32. package/src/llama.cpp/common/sampling.cpp +92 -10
  33. package/src/llama.cpp/common/sampling.h +6 -1
  34. package/src/llama.cpp/common/train.cpp +2 -2
  35. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  36. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  37. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
  39. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  40. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  42. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  43. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
  44. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
  45. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
  46. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  47. package/src/llama.cpp/examples/llava/clip.h +1 -1
  48. package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
  49. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  50. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  51. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  52. package/src/llama.cpp/examples/main/main.cpp +29 -17
  53. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  54. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  55. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  56. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  57. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  58. package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
  59. package/src/llama.cpp/examples/server/server.cpp +33 -25
  60. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  61. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  62. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  63. package/src/llama.cpp/ggml-backend.c +2 -3
  64. package/src/llama.cpp/ggml-common.h +0 -54
  65. package/src/llama.cpp/ggml-cuda.h +1 -0
  66. package/src/llama.cpp/ggml-impl.h +51 -0
  67. package/src/llama.cpp/ggml-kompute.cpp +13 -3
  68. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  69. package/src/llama.cpp/ggml-quants.c +3715 -2050
  70. package/src/llama.cpp/ggml-rpc.cpp +1155 -0
  71. package/src/llama.cpp/ggml-rpc.h +24 -0
  72. package/src/llama.cpp/ggml-sycl.cpp +119 -673
  73. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  74. package/src/llama.cpp/ggml-vulkan.cpp +203 -224
  75. package/src/llama.cpp/ggml.c +1208 -1483
  76. package/src/llama.cpp/ggml.h +71 -46
  77. package/src/llama.cpp/llama.cpp +1374 -938
  78. package/src/llama.cpp/llama.h +22 -6
  79. package/src/llama.cpp/requirements.txt +0 -2
  80. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
  82. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  83. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  84. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  85. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  86. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  87. package/src/llama.cpp/unicode-data.h +15 -12
  88. package/src/llama.cpp/unicode.cpp +89 -111
  89. package/src/llama.cpp/unicode.h +44 -12
  90. package/src/llama.cpp/build.zig +0 -172
  91. package/src/llama.cpp/ggml-mpi.c +0 -216
  92. package/src/llama.cpp/ggml-mpi.h +0 -39
  93. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
  94. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/common/common.h
@@ -27,7 +27,7 @@
  #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

  #define print_build_info() do { \
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
  fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
  } while(0)

@@ -35,14 +35,18 @@

  // build info
  extern int LLAMA_BUILD_NUMBER;
- extern char const *LLAMA_COMMIT;
- extern char const *LLAMA_COMPILER;
- extern char const *LLAMA_BUILD_TARGET;
+ extern char const * LLAMA_COMMIT;
+ extern char const * LLAMA_COMPILER;
+ extern char const * LLAMA_BUILD_TARGET;

  struct llama_control_vector_load_info;

- int get_math_cpu_count();
- int32_t get_num_physical_cores();
+ //
+ // CPU utils
+ //
+
+ int32_t cpu_get_num_physical_cores();
+ int32_t cpu_get_num_math();

  //
  // CLI argument parsing
@@ -51,7 +55,7 @@ int32_t get_num_physical_cores();
  struct gpt_params {
  uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

- int32_t n_threads = get_math_cpu_count();
+ int32_t n_threads = cpu_get_num_math();
  int32_t n_threads_draft = -1;
  int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
  int32_t n_threads_batch_draft = -1;
@@ -82,6 +86,7 @@ struct gpt_params {
  float yarn_beta_slow = 1.0f; // YaRN high correction dim
  int32_t yarn_orig_ctx = 0; // YaRN original context length
  float defrag_thold = -1.0f; // KV cache defragmentation threshold
+ std::string rpc_servers = ""; // comma separated list of RPC servers

  ggml_backend_sched_eval_callback cb_eval = nullptr;
  void * cb_eval_user_data = nullptr;
@@ -140,6 +145,8 @@ struct gpt_params {
  bool random_prompt = false; // do not randomize prompt if none provided
  bool use_color = false; // use color to distinguish generations and inputs
  bool interactive = false; // interactive mode
+ bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+ bool special = false; // enable special token output
  bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
  bool chatml = false; // chatml mode (used for models trained on chatml syntax)
  bool prompt_cache_all = false; // save user input and generations to prompt cache
@@ -177,33 +184,34 @@ struct gpt_params {

  void gpt_params_handle_model_default(gpt_params & params);

- bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
-
- bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
-
- bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+ bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
+ bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
+ bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

- void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+ std::string gpt_params_get_system_info(const gpt_params & params);

- bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-
- std::string get_system_info(const gpt_params & params);
+ //
+ // String utils
+ //

- std::string gpt_random_prompt(std::mt19937 & rng);
+ std::vector<std::string> string_split(std::string input, char separator);

- void process_escapes(std::string& input);
+ std::string string_strip(const std::string & str);
+ std::string string_get_sortable_timestamp();
+ std::string string_random_prompt(std::mt19937 & rng);

- bool validate_file_name(const std::string & filename);
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+ void string_process_escapes(std::string & input);

  //
- // String utils
+ // Filesystem utils
  //

- std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
- std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
- std::vector<std::string> string_split(std::string input, char separator);
- std::string string_strip(const std::string & str);
- std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+ bool fs_validate_filename(const std::string & filename);
+ bool fs_create_directory_with_parents(const std::string & path);
+
+ std::string fs_get_cache_directory();

  //
  // Model utils
@@ -274,29 +282,15 @@ std::string llama_detokenize_bpe(
  // defaults to true when model type is SPM, otherwise false.
  bool llama_should_add_bos_token(const llama_model * model);

- //
- // YAML utils
- //
-
- bool create_directory_with_parents(const std::string & path);
- void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
- void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
- void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
- std::string get_sortable_timestamp();
-
- void dump_non_result_info_yaml(
- FILE * stream, const gpt_params & params, const llama_context * lctx,
- const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
  //
  // KV cache utils
  //

  // Dump the KV cache view with the number of sequences per cell.
- void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

  // Dump the KV cache view showing individual sequences in each cell (long output).
- void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

  //
  // Embedding utils
@@ -330,6 +324,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
  //
  // Split utils
  //
+
  static const char * const LLM_KV_SPLIT_NO = "split.no";
  static const char * const LLM_KV_SPLIT_COUNT = "split.count";
  static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ //
+ // YAML utils
+ //
+
+ void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
+ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
+ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+ void yaml_dump_non_result_info(
+ FILE * stream, const gpt_params & params, const llama_context * lctx,
+ const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
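Most of the helpers declared in common.h are renamed above into prefixed groups (cpu_*, string_*, fs_*, yaml_*, gpt_params_*, llama_kv_cache_*). The following sketch is illustrative only: migrate_example and its body are hypothetical, and the include path is an assumption, but every new name it calls appears in the hunks above.

    // Illustrative migration sketch, assuming the common.h declarations shown above.
    #include <cstdio>
    #include "common.h" // include path is an assumption; adjust to the build setup

    void migrate_example(gpt_params & params) {
        // get_math_cpu_count()             -> cpu_get_num_math()
        int n_threads = cpu_get_num_math();

        // process_escapes(params.prompt)   -> string_process_escapes(params.prompt)
        string_process_escapes(params.prompt);

        // validate_file_name(params.model) -> fs_validate_filename(params.model)
        bool ok = fs_validate_filename(params.model);

        // get_system_info(params)          -> gpt_params_get_system_info(params)
        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());

        (void) n_threads; (void) ok; // silence unused-variable warnings in this sketch
    }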
package/src/llama.cpp/common/grammar-parser.cpp
@@ -26,7 +26,7 @@ namespace grammar_parser {

  static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
  uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
- auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
+ auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
  return result.first->second;
  }

@@ -142,6 +142,9 @@ namespace grammar_parser {
  pos++;
  last_sym_start = out_elements.size();
  while (*pos != '"') {
+ if (!*pos) {
+ throw std::runtime_error("unexpected end of input");
+ }
  auto char_pair = parse_char(pos);
  pos = char_pair.second;
  out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
  }
  last_sym_start = out_elements.size();
  while (*pos != ']') {
+ if (!*pos) {
+ throw std::runtime_error("unexpected end of input");
+ }
  auto char_pair = parse_char(pos);
  pos = char_pair.second;
  enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {

  out_elements.push_back({type, char_pair.first});
  if (pos[0] == '-' && pos[1] != ']') {
+ if (!pos[1]) {
+ throw std::runtime_error("unexpected end of input");
+ }
  auto endchar_pair = parse_char(pos + 1);
  pos = endchar_pair.second;
  out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
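All three guards added above target the same failure mode: the scanner looks for a closing delimiter and previously walked past the NUL terminator when the grammar text was truncated. Below is a minimal standalone sketch of the pattern, not the llama.cpp parser itself; scan_until is a hypothetical helper.

    #include <cstdio>
    #include <stdexcept>

    // Scan a NUL-terminated buffer for a closing delimiter; truncated input now
    // raises an error instead of reading past the end of the string.
    static const char * scan_until(const char * pos, char delim) {
        while (*pos != delim) {
            if (!*pos) {
                throw std::runtime_error("unexpected end of input");
            }
            ++pos;
        }
        return pos;
    }

    int main() {
        try {
            scan_until("unterminated literal", '"'); // no closing quote in the input
        } catch (const std::runtime_error & e) {
            std::fprintf(stderr, "%s\n", e.what()); // prints: unexpected end of input
        }
        return 0;
    }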
package/src/llama.cpp/common/json-schema-to-grammar.cpp
@@ -272,7 +272,7 @@ private:
  if (literal.empty()) {
  return false;
  }
- ret.push_back(std::make_pair(literal, true));
+ ret.emplace_back(literal, true);
  literal.clear();
  return true;
  };
@@ -298,7 +298,7 @@ private:
  while (i < length) {
  char c = sub_pattern[i];
  if (c == '.') {
- seq.push_back(std::make_pair(get_dot(), false));
+ seq.emplace_back(get_dot(), false);
  i++;
  } else if (c == '(') {
  i++;
@@ -307,7 +307,7 @@ private:
  _warnings.push_back("Unsupported pattern syntax");
  }
  }
- seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
+ seq.emplace_back("(" + to_rule(transform()) + ")", false);
  } else if (c == ')') {
  i++;
  if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -331,9 +331,9 @@ private:
  }
  square_brackets += ']';
  i++;
- seq.push_back(std::make_pair(square_brackets, false));
+ seq.emplace_back(square_brackets, false);
  } else if (c == '|') {
- seq.push_back(std::make_pair("|", false));
+ seq.emplace_back("|", false);
  i++;
  } else if (c == '*' || c == '+' || c == '?') {
  seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@@ -417,7 +417,7 @@ private:
  }
  }
  if (!literal.empty()) {
- seq.push_back(std::make_pair(literal, true));
+ seq.emplace_back(literal, true);
  }
  }
  }
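All six changes in this file are the same mechanical substitution: instead of building a std::pair with std::make_pair and then copying or moving it into the vector, the pair is constructed in place. A tiny standalone illustration of the difference:

    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<std::pair<std::string, bool>> seq;
        seq.push_back(std::make_pair(std::string("literal"), true)); // builds a temporary pair, then moves it in
        seq.emplace_back("literal", true);                           // constructs the pair directly inside the vector
        return 0;
    }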
package/src/llama.cpp/common/log.h
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
  #else
  #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+ #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
  #endif
  #else
  #define LOG_FLF_FMT "%s"
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
  #else
  #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+ #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
  #endif
  #else
  #define LOG_TEE_FLF_FMT "%s"
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  // Main LOG macro.
  // behaves like printf, and supports arguments the exact same way.
  //
- #ifndef _MSC_VER
+ #if !defined(_MSC_VER) || defined(__clang__)
  #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
  #else
  #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  // Secondary target can be changed just like LOG_TARGET
  // by defining LOG_TEE_TARGET
  //
- #ifndef _MSC_VER
+ #if !defined(_MSC_VER) || defined(__clang__)
  #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
  #else
  #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
  #endif

  // LOG macro variants with auto endline.
- #ifndef _MSC_VER
+ #if !defined(_MSC_VER) || defined(__clang__)
  #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
  #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
  #else
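The (long)__LINE__ casts keep the variadic argument in step with the %5ld conversion in the format string (__LINE__ expands to an int constant, which is the wrong type for %ld), and the widened #if lets clang-based compilers that define _MSC_VER take the standard variadic branch. A minimal illustration of the format/argument pairing, independent of the LOG macros:

    #include <cstdio>

    int main() {
        // "%5ld" consumes a long; __LINE__ expands to an int constant, so the
        // explicit cast keeps the vararg type in sync with the format string.
        std::printf("[%24s:%5ld][%24s] hello\n", __FILE__, (long)__LINE__, __func__);
        return 0;
    }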
package/src/llama.cpp/common/sampling.cpp
@@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

  result->prev.resize(params.n_prev);

- result->n_considered = 0;
+ result->n_valid = 0;

  llama_sampling_set_rng_seed(result, params.seed);

@@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

  std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
  ctx->cur.clear();
- ctx->n_considered = 0;
+ ctx->n_valid = 0;
  }

  void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
  std::string result = "CFG -> Penalties ";
  if (params.mirostat == 0) {
  for (auto sampler_type : params.samplers_sequence) {
- const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
+ const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
  if (!sampler_type_name.empty()) {
  result += "-> " + sampler_type_name + " ";
  }
@@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
  return result;
  }

+ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
+ switch (sampler_type) {
+ case llama_sampler_type::TOP_K: return "top_k";
+ case llama_sampler_type::TFS_Z: return "tfs_z";
+ case llama_sampler_type::TYPICAL_P: return "typical_p";
+ case llama_sampler_type::TOP_P: return "top_p";
+ case llama_sampler_type::MIN_P: return "min_p";
+ case llama_sampler_type::TEMPERATURE: return "temperature";
+ default : return "";
+ }
+ }
+
+ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+ std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+ {"top_k", llama_sampler_type::TOP_K},
+ {"top_p", llama_sampler_type::TOP_P},
+ {"typical_p", llama_sampler_type::TYPICAL_P},
+ {"min_p", llama_sampler_type::MIN_P},
+ {"tfs_z", llama_sampler_type::TFS_Z},
+ {"temperature", llama_sampler_type::TEMPERATURE}
+ };
+
+ // since samplers names are written multiple ways
+ // make it ready for both system names and input names
+ std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+ {"top-k", llama_sampler_type::TOP_K},
+ {"top-p", llama_sampler_type::TOP_P},
+ {"nucleus", llama_sampler_type::TOP_P},
+ {"typical-p", llama_sampler_type::TYPICAL_P},
+ {"typical", llama_sampler_type::TYPICAL_P},
+ {"min-p", llama_sampler_type::MIN_P},
+ {"tfs-z", llama_sampler_type::TFS_Z},
+ {"tfs", llama_sampler_type::TFS_Z},
+ {"temp", llama_sampler_type::TEMPERATURE}
+ };
+
+ std::vector<llama_sampler_type> sampler_types;
+ sampler_types.reserve(names.size());
+ for (const auto & name : names)
+ {
+ auto sampler_item = sampler_canonical_name_map.find(name);
+ if (sampler_item != sampler_canonical_name_map.end())
+ {
+ sampler_types.push_back(sampler_item->second);
+ }
+ else
+ {
+ if (allow_alt_names)
+ {
+ sampler_item = sampler_alt_name_map.find(name);
+ if (sampler_item != sampler_alt_name_map.end())
+ {
+ sampler_types.push_back(sampler_item->second);
+ }
+ }
+ }
+ }
+ return sampler_types;
+ }
+
+ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
+ std::unordered_map<char, llama_sampler_type> sampler_name_map {
+ {'k', llama_sampler_type::TOP_K},
+ {'p', llama_sampler_type::TOP_P},
+ {'y', llama_sampler_type::TYPICAL_P},
+ {'m', llama_sampler_type::MIN_P},
+ {'f', llama_sampler_type::TFS_Z},
+ {'t', llama_sampler_type::TEMPERATURE}
+ };
+
+ std::vector<llama_sampler_type> sampler_types;
+ sampler_types.reserve(names_string.size());
+ for (const auto & c : names_string) {
+ const auto sampler_item = sampler_name_map.find(c);
+ if (sampler_item != sampler_name_map.end()) {
+ sampler_types.push_back(sampler_item->second);
+ }
+ }
+ return sampler_types;
+ }
+
  // no reasons to expose this function in header
  static void sampler_queue(
  struct llama_context * ctx_main,
@@ -179,7 +260,7 @@ static llama_token llama_sampling_sample_impl(
  struct llama_context * ctx_main,
  struct llama_context * ctx_cfg,
  const int idx,
- bool is_resampling) { // Add a parameter to indicate if we are resampling
+ bool is_resampling) {
  const llama_sampling_params & params = ctx_sampling->params;

  const float temp = params.temp;
@@ -188,8 +269,8 @@ static llama_token llama_sampling_sample_impl(
  const float mirostat_eta = params.mirostat_eta;

  std::vector<float> original_logits;
- auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
- if (!is_resampling) {
+ auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
+ if (ctx_sampling->grammar != NULL && !is_resampling) {
  GGML_ASSERT(!original_logits.empty());
  }
  llama_token id = 0;
@@ -252,11 +333,11 @@ static llama_token llama_sampling_sample_impl(
  // Restore logits from the copy
  std::copy(original_logits.begin(), original_logits.end(), logits);

- return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
+ return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
  }
  }

- ctx_sampling->n_considered = cur_p.size;
+ ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;

  return id;
  }
@@ -285,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
  // Get a pointer to the logits
  float * logits = llama_get_logits_ith(ctx_main, idx);

- if (apply_grammar && original_logits != NULL) {
+ if (ctx_sampling->grammar != NULL && !apply_grammar) {
+ GGML_ASSERT(original_logits != NULL);
  // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
  *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
  }
@@ -342,7 +424,7 @@ llama_token llama_sampling_sample(
  struct llama_context * ctx_cfg,
  const int idx) {
  // Call the implementation function with is_resampling set to false by default
- return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
+ return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
  }

  llama_token_data_array llama_sampling_prepare(
package/src/llama.cpp/common/sampling.h
@@ -81,7 +81,7 @@ struct llama_sampling_context {
  // TODO: replace with ring-buffer
  std::vector<llama_token> prev;
  std::vector<llama_token_data> cur;
- size_t n_considered;
+ size_t n_valid; // Number of correct top tokens with correct probabilities.

  std::mt19937 rng;
  };
@@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
  // Print sampling order into a string
  std::string llama_sampling_order_print(const llama_sampling_params & params);

+ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
+
+ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
+
  // this is a common sampling function used across the examples for convenience
  // it can serve as a starting point for implementing your own sampling function
  // Note: When using multiple sequences, it is the caller's responsibility to call
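A hedged usage sketch for the helpers newly declared above. The include path is an assumption, and the sampler-order string "kfypmt" is just an example of the single-character encoding handled by llama_sampling_types_from_chars.

    // Sketch only: assumes the common/sampling.h declarations shown above.
    #include <cstdio>
    #include <string>
    #include <vector>
    #include "sampling.h" // include path is an assumption; adjust to the build setup

    int main() {
        // single-character form, e.g. a CLI sampler-order string such as "kfypmt"
        std::vector<llama_sampler_type> seq = llama_sampling_types_from_chars("kfypmt");

        // long-name form; allow_alt_names = true also accepts spellings such as "top-k" or "temp"
        std::vector<llama_sampler_type> named =
            llama_sampling_types_from_names({"top-k", "min_p", "temp"}, /* allow_alt_names= */ true);

        for (auto t : seq) {
            std::printf("%s\n", llama_sampling_type_to_str(t).c_str());
        }
        (void) named;
        return 0;
    }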
package/src/llama.cpp/common/train.cpp
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {

  params.custom_n_ctx = false;

- params.use_flash = true;
+ params.use_flash = false;
  params.use_checkpointing = true;

  params.sample_start = "";
@@ -1380,7 +1380,7 @@ bool consume_common_train_arg(

  void finish_processing_train_args(struct train_params_common * params) {
  if (params->escape) {
- process_escapes(params->sample_start);
+ string_process_escapes(params->sample_start);
  }
  }

package/src/llama.cpp/examples/CMakeLists.txt
@@ -49,4 +49,7 @@ else()
  add_subdirectory(server)
  endif()
  add_subdirectory(export-lora)
+ if (LLAMA_RPC)
+ add_subdirectory(rpc)
+ endif()
  endif()
package/src/llama.cpp/examples/batched/batched.cpp
@@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
  params.prompt = "Hello my name is";
  }

- process_escapes(params.prompt);
+ string_process_escapes(params.prompt);

  // init LLM

package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {

  params.samples_start_after_nl = false;
  params.use_adam = true;
- params.use_flash = true;
+ params.use_flash = false;
  params.use_scratch = true;

  // only adam
package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -49,6 +49,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
  }

  float * out = output + batch.seq_id[i][0] * n_embd;
+ //TODO: I would also add a parameter here to enable normalization or not.
+ /*fprintf(stdout, "unnormalized_embedding:");
+ for (int hh = 0; hh < n_embd; hh++) {
+ fprintf(stdout, "%9.6f ", embd[hh]);
+ }
+ fprintf(stdout, "\n");*/
  llama_embd_normalize(embd, out, n_embd);
  }
  }
@@ -74,7 +80,7 @@ int main(int argc, char ** argv) {

  std::mt19937 rng(params.seed);
  if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
  }

  llama_backend_init();
@@ -101,7 +107,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
  }

  // split the prompt into lines
@@ -123,10 +129,12 @@ int main(int argc, char ** argv) {
  inputs.push_back(inp);
  }

- // add SEP if not present
+ // check if the last token is SEP
+ // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
  for (auto & inp : inputs) {
  if (inp.empty() || inp.back() != llama_token_sep(model)) {
- inp.push_back(llama_token_sep(model));
+ fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+ fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
  }
  }

@@ -203,6 +211,7 @@ int main(int argc, char ** argv) {

  // clean up
  llama_print_timings(ctx);
+ llama_batch_free(batch);
  llama_free(ctx);
  llama_free_model(model);
  llama_backend_free();
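The final hunk pairs the example's llama_batch with a llama_batch_free call so its buffers are released before the context and model are torn down. A minimal sketch of that pairing, assuming the llama_batch_init / llama_batch_free API from llama.h; batch_lifetime_example is hypothetical.

    // Sketch only: pairs llama_batch_init with llama_batch_free as the hunk above does.
    #include "llama.h"

    void batch_lifetime_example(int32_t n_tokens) {
        // token batch (embd = 0) with room for one sequence id per token slot
        llama_batch batch = llama_batch_init(n_tokens, /* embd= */ 0, /* n_seq_max= */ 1);

        // ... fill batch.token / batch.pos / batch.seq_id and run llama_decode() ...

        llama_batch_free(batch); // releases the buffers allocated by llama_batch_init
    }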
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp
@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {

  std::mt19937 rng(params.seed);
  if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
  }

  llama_backend_init();
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
  }

  bool OK = run(ctx, params);
package/src/llama.cpp/examples/finetune/finetune.cpp
@@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
  // not capturing these, to silcence warnings
  const int rope_mode = 0;

- return ggml_rope_custom(ctx,
- t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+ return ggml_rope_ext(ctx,
+ t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
  rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
  );
  };
@@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
  struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
  struct ggml_tensor * t16;
  if (enable_flash_attn) {
- t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
+ GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
+ //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
  } else {
  struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
  struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
package/src/llama.cpp/examples/imatrix/imatrix.cpp
@@ -598,7 +598,7 @@ int main(int argc, char ** argv) {

  std::mt19937 rng(params.seed);
  if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
  }

  sparams.dataset = params.prompt_file;
@@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
  }

  bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);