@fugood/llama.node 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/LlamaContext.cpp +2 -2
  23. package/src/TokenizeWorker.cpp +1 -1
  24. package/src/llama.cpp/CMakeLists.txt +82 -54
  25. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  26. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  27. package/src/llama.cpp/common/common.cpp +748 -754
  28. package/src/llama.cpp/common/common.h +49 -41
  29. package/src/llama.cpp/common/grammar-parser.cpp +10 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  31. package/src/llama.cpp/common/log.h +5 -5
  32. package/src/llama.cpp/common/sampling.cpp +92 -10
  33. package/src/llama.cpp/common/sampling.h +6 -1
  34. package/src/llama.cpp/common/train.cpp +2 -2
  35. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  36. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  37. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
  39. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  40. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  42. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  43. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
  44. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
  45. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
  46. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  47. package/src/llama.cpp/examples/llava/clip.h +1 -1
  48. package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
  49. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  50. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  51. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  52. package/src/llama.cpp/examples/main/main.cpp +29 -17
  53. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  54. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  55. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  56. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  57. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  58. package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
  59. package/src/llama.cpp/examples/server/server.cpp +33 -25
  60. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  61. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  62. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  63. package/src/llama.cpp/ggml-backend.c +2 -3
  64. package/src/llama.cpp/ggml-common.h +0 -54
  65. package/src/llama.cpp/ggml-cuda.h +1 -0
  66. package/src/llama.cpp/ggml-impl.h +51 -0
  67. package/src/llama.cpp/ggml-kompute.cpp +13 -3
  68. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  69. package/src/llama.cpp/ggml-quants.c +3715 -2050
  70. package/src/llama.cpp/ggml-rpc.cpp +1155 -0
  71. package/src/llama.cpp/ggml-rpc.h +24 -0
  72. package/src/llama.cpp/ggml-sycl.cpp +119 -673
  73. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  74. package/src/llama.cpp/ggml-vulkan.cpp +203 -224
  75. package/src/llama.cpp/ggml.c +1208 -1483
  76. package/src/llama.cpp/ggml.h +71 -46
  77. package/src/llama.cpp/llama.cpp +1374 -938
  78. package/src/llama.cpp/llama.h +22 -6
  79. package/src/llama.cpp/requirements.txt +0 -2
  80. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
  82. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  83. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  84. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  85. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  86. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  87. package/src/llama.cpp/unicode-data.h +15 -12
  88. package/src/llama.cpp/unicode.cpp +89 -111
  89. package/src/llama.cpp/unicode.h +44 -12
  90. package/src/llama.cpp/build.zig +0 -172
  91. package/src/llama.cpp/ggml-mpi.c +0 -216
  92. package/src/llama.cpp/ggml-mpi.h +0 -39
  93. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
  94. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
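
Most of the source-level churn in this release comes from syncing the bundled llama.cpp, and the diff below covers package/src/llama.cpp/common/common.cpp, where the upstream helpers were regrouped and renamed with cpu_*, string_*, fs_*, gpt_params_* and llama_sampling_* prefixes. As a minimal, hypothetical sketch (not shipped in this package) of what the renames mean for code that includes common.h, old and new call sites line up roughly like this:

    // Hypothetical example (not from the package): calling the renamed
    // helpers from common/common.h after the 0.2.2 llama.cpp sync.
    #include "common.h"   // bundled llama.cpp common helpers
    #include <cstdio>

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {   // entry point is unchanged
            return 1;
        }

        params.n_threads = cpu_get_num_math();         // was: get_math_cpu_count()
        string_process_escapes(params.prompt);         // was: process_escapes()

        // was: get_system_info()
        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
        return 0;
    }

The same pattern applies to the filesystem helpers (validate_file_name becomes fs_validate_filename, create_directory_with_parents becomes fs_create_directory_with_parents), as visible in the hunks below.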
@@ -73,7 +73,11 @@
 
  using json = nlohmann::ordered_json;
 
- int32_t get_num_physical_cores() {
+ //
+ // CPU utils
+ //
+
+ int32_t cpu_get_num_physical_cores() {
  #ifdef __linux__
  // enumerate the set of thread siblings, num entries is num cores
  std::unordered_set<std::string> siblings;
@@ -142,9 +146,9 @@ static bool is_running_on_efficiency_core(void) {
  return core_type == intel_atom;
  }
 
- static int count_math_cpus(int cpu_count) {
+ static int cpu_count_math_cpus(int n_cpu) {
  int result = 0;
- for (int cpu = 0; cpu < cpu_count; ++cpu) {
+ for (int cpu = 0; cpu < n_cpu; ++cpu) {
  if (pin_cpu(cpu)) {
  return -1;
  }
@@ -162,16 +166,16 @@ static int count_math_cpus(int cpu_count) {
  /**
  * Returns number of CPUs on system that are useful for math.
  */
- int get_math_cpu_count() {
+ int32_t cpu_get_num_math() {
  #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
- int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
- if (cpu_count < 1) {
- return get_num_physical_cores();
+ int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+ if (n_cpu < 1) {
+ return cpu_get_num_physical_cores();
  }
  if (is_hybrid_cpu()) {
  cpu_set_t affinity;
  if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
- int result = count_math_cpus(cpu_count);
+ int result = cpu_count_math_cpus(n_cpu);
  pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
  if (result > 0) {
  return result;
@@ -179,108 +183,103 @@ int get_math_cpu_count() {
  }
  }
  #endif
- return get_num_physical_cores();
+ return cpu_get_num_physical_cores();
  }
 
- void process_escapes(std::string & input) {
- std::size_t input_len = input.length();
- std::size_t output_idx = 0;
+ //
+ // CLI argument parsing
+ //
 
- for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
- if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
- switch (input[++input_idx]) {
- case 'n': input[output_idx++] = '\n'; break;
- case 'r': input[output_idx++] = '\r'; break;
- case 't': input[output_idx++] = '\t'; break;
- case '\'': input[output_idx++] = '\''; break;
- case '\"': input[output_idx++] = '\"'; break;
- case '\\': input[output_idx++] = '\\'; break;
- case 'x':
- // Handle \x12, etc
- if (input_idx + 2 < input_len) {
- const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
- char *err_p = nullptr;
- const long val = std::strtol(x, &err_p, 16);
- if (err_p == x + 2) {
- input_idx += 2;
- input[output_idx++] = char(val);
- break;
- }
- }
- // fall through
- default: input[output_idx++] = '\\';
- input[output_idx++] = input[input_idx]; break;
+ void gpt_params_handle_model_default(gpt_params & params) {
+ if (!params.hf_repo.empty()) {
+ // short-hand to avoid specifying --hf-file -> default it to --model
+ if (params.hf_file.empty()) {
+ if (params.model.empty()) {
+ throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
  }
- } else {
- input[output_idx++] = input[input_idx];
+ params.hf_file = params.model;
+ } else if (params.model.empty()) {
+ std::string cache_directory = fs_get_cache_directory();
+ const bool success = fs_create_directory_with_parents(cache_directory);
+ if (!success) {
+ throw std::runtime_error("failed to create cache directory: " + cache_directory);
+ }
+ params.model = cache_directory + string_split(params.hf_file, '/').back();
+ }
+ } else if (!params.model_url.empty()) {
+ if (params.model.empty()) {
+ auto f = string_split(params.model_url, '#').front();
+ f = string_split(f, '?').front();
+ f = string_split(f, '/').back();
+ params.model = "models/" + f;
  }
+ } else if (params.model.empty()) {
+ params.model = DEFAULT_MODEL_PATH;
  }
+ }
 
- input.resize(output_idx);
+ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
+ bool invalid_param = false;
+ std::string arg;
+ const std::string arg_prefix = "--";
+ llama_sampling_params & sparams = params.sparams;
+
+ for (int i = 1; i < argc; i++) {
+ arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
+ }
+ if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
+ throw std::invalid_argument("error: unknown argument: " + arg);
+ }
+ if (invalid_param) {
+ throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+ }
+ }
+
+ if (params.prompt_cache_all &&
+ (params.interactive || params.interactive_first ||
+ params.instruct)) {
+
+ throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+ }
+
+ gpt_params_handle_model_default(params);
+
+ if (params.escape) {
+ string_process_escapes(params.prompt);
+ string_process_escapes(params.input_prefix);
+ string_process_escapes(params.input_suffix);
+ string_process_escapes(sparams.cfg_negative_prompt);
+ for (auto & antiprompt : params.antiprompt) {
+ string_process_escapes(antiprompt);
+ }
+ }
+
+ if (!params.kv_overrides.empty()) {
+ params.kv_overrides.emplace_back();
+ params.kv_overrides.back().key[0] = 0;
+ }
+
+ return true;
  }
 
  bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  bool result = true;
  try {
  if (!gpt_params_parse_ex(argc, argv, params)) {
- gpt_print_usage(argc, argv, gpt_params());
+ gpt_params_print_usage(argc, argv, gpt_params());
  exit(0);
  }
  }
  catch (const std::invalid_argument & ex) {
  fprintf(stderr, "%s\n", ex.what());
- gpt_print_usage(argc, argv, gpt_params());
+ gpt_params_print_usage(argc, argv, gpt_params());
  exit(1);
  }
  return result;
  }
 
- bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
- const char * sep = strchr(data, '=');
- if (sep == nullptr || sep - data >= 128) {
- fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
- return false;
- }
- llama_model_kv_override kvo;
- std::strncpy(kvo.key, data, sep - data);
- kvo.key[sep - data] = 0;
- sep++;
- if (strncmp(sep, "int:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
- kvo.val_i64 = std::atol(sep);
- } else if (strncmp(sep, "float:", 6) == 0) {
- sep += 6;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
- kvo.val_f64 = std::atof(sep);
- } else if (strncmp(sep, "bool:", 5) == 0) {
- sep += 5;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
- if (std::strcmp(sep, "true") == 0) {
- kvo.val_bool = true;
- } else if (std::strcmp(sep, "false") == 0) {
- kvo.val_bool = false;
- } else {
- fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
- return false;
- }
- } else if (strncmp(sep, "str:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
- if (strlen(sep) > 127) {
- fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
- return false;
- }
- strncpy(kvo.val_str, sep, 127);
- kvo.val_str[127] = '\0';
- } else {
- fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
- return false;
- }
- overrides.emplace_back(std::move(kvo));
- return true;
- }
-
  bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
  llama_sampling_params & sparams = params.sparams;
 
@@ -546,7 +545,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  const auto sampler_names = string_split(argv[i], ';');
- sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
+ sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
  return true;
  }
  if (arg == "--sampling-seq") {
@@ -554,7 +553,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  invalid_param = true;
  return true;
  }
- sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
+ sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
  return true;
  }
  if (arg == "--top-p") {
@@ -901,6 +900,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  params.interactive = true;
  return true;
  }
+ if (arg == "--interactive-specials") {
+ params.interactive_specials = true;
+ return true;
+ }
+ if (arg == "--special") {
+ params.special = true;
+ return true;
+ }
  if (arg == "--embedding") {
  params.embedding = true;
  return true;
@@ -1056,6 +1063,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  #endif // GGML_USE_CUDA_SYCL_VULKAN
  return true;
  }
+ if (arg == "--rpc") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.rpc_servers = argv[i];
+ return true;
+ }
  if (arg == "--no-mmap") {
  params.use_mmap = false;
  return true;
@@ -1228,7 +1243,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return true;
  }
  if (arg == "-h" || arg == "--help") {
- gpt_print_usage(argc, argv, gpt_params());
+ gpt_params_print_usage(argc, argv, gpt_params());
  exit(0);
  }
  if (arg == "--version") {
@@ -1299,7 +1314,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  invalid_param = true;
  return true;
  }
- if (!parse_kv_override(argv[i], params.kv_overrides)) {
+ if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
  fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
  invalid_param = true;
  return true;
@@ -1333,85 +1348,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  return false;
  }
 
- void gpt_params_handle_model_default(gpt_params & params) {
- if (!params.hf_repo.empty()) {
- // short-hand to avoid specifying --hf-file -> default it to --model
- if (params.hf_file.empty()) {
- if (params.model.empty()) {
- throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
- }
- params.hf_file = params.model;
- } else if (params.model.empty()) {
- params.model = "models/" + string_split(params.hf_file, '/').back();
- }
- } else if (!params.model_url.empty()) {
- if (params.model.empty()) {
- auto f = string_split(params.model_url, '#').front();
- f = string_split(f, '?').front();
- f = string_split(f, '/').back();
- params.model = "models/" + f;
- }
- } else if (params.model.empty()) {
- params.model = DEFAULT_MODEL_PATH;
- }
- }
-
- bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
- bool invalid_param = false;
- std::string arg;
- const std::string arg_prefix = "--";
- llama_sampling_params & sparams = params.sparams;
-
- for (int i = 1; i < argc; i++) {
- arg = argv[i];
- if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
- std::replace(arg.begin(), arg.end(), '_', '-');
- }
-
- if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
- throw std::invalid_argument("error: unknown argument: " + arg);
- }
- }
-
- if (invalid_param) {
- throw std::invalid_argument("error: invalid parameter for argument: " + arg);
- }
-
- if (params.prompt_cache_all &&
- (params.interactive || params.interactive_first ||
- params.instruct)) {
-
- throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
- }
-
- gpt_params_handle_model_default(params);
-
- if (params.escape) {
- process_escapes(params.prompt);
- process_escapes(params.input_prefix);
- process_escapes(params.input_suffix);
- process_escapes(sparams.cfg_negative_prompt);
- for (auto & antiprompt : params.antiprompt) {
- process_escapes(antiprompt);
- }
- }
-
- if (!params.kv_overrides.empty()) {
- params.kv_overrides.emplace_back();
- params.kv_overrides.back().key[0] = 0;
- }
-
- return true;
- }
-
- void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  const llama_sampling_params & sparams = params.sparams;
 
  std::string sampler_type_chars;
  std::string sampler_type_names;
  for (const auto sampler_type : sparams.samplers_sequence) {
  sampler_type_chars += static_cast<char>(sampler_type);
- sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
+ sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
  }
  sampler_type_names.pop_back();
 
@@ -1422,6 +1366,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" -h, --help show this help message and exit\n");
  printf(" --version show version and build info\n");
  printf(" -i, --interactive run in interactive mode\n");
+ printf(" --special special tokens output enabled\n");
+ printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
  printf(" --interactive-first run in interactive mode and wait for input right away\n");
  printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
  printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@@ -1554,6 +1500,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
  printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
  }
+ printf(" --rpc SERVERS comma separated list of RPC servers\n");
  printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
  printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
  printf(" -gan N, --grp-attn-n N\n");
@@ -1606,7 +1553,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  #endif // LOG_DISABLE_LOGS
  }
 
- std::string get_system_info(const gpt_params & params) {
+ std::string gpt_params_get_system_info(const gpt_params & params) {
  std::ostringstream os;
 
  os << "system_info: n_threads = " << params.n_threads;
@@ -1618,7 +1565,52 @@ std::string get_system_info(const gpt_params & params) {
  return os.str();
  }
 
- std::string gpt_random_prompt(std::mt19937 & rng) {
+ //
+ // String utils
+ //
+
+ std::vector<std::string> string_split(std::string input, char separator) {
+ std::vector<std::string> parts;
+ size_t separator_pos = input.find(separator);
+ while (separator_pos != std::string::npos) {
+ std::string part = input.substr(0, separator_pos);
+ parts.emplace_back(part);
+ input = input.substr(separator_pos + 1);
+ separator_pos = input.find(separator);
+ }
+ parts.emplace_back(input);
+ return parts;
+ }
+
+ std::string string_strip(const std::string & str) {
+ size_t start = 0;
+ size_t end = str.size();
+ while (start < end && std::isspace(str[start])) {
+ start++;
+ }
+ while (end > start && std::isspace(str[end - 1])) {
+ end--;
+ }
+ return str.substr(start, end - start);
+ }
+
+ std::string string_get_sortable_timestamp() {
+ using clock = std::chrono::system_clock;
+
+ const clock::time_point current_time = clock::now();
+ const time_t as_time_t = clock::to_time_t(current_time);
+ char timestamp_no_ns[100];
+ std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
+
+ const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+ current_time.time_since_epoch() % 1000000000).count();
+ char timestamp_ns[11];
+ snprintf(timestamp_ns, 11, "%09" PRId64, ns);
+
+ return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
+ }
+
+ std::string string_random_prompt(std::mt19937 & rng) {
  const int r = rng() % 10;
  switch (r) {
  case 0: return "So";
@@ -1636,11 +1628,98 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
  GGML_UNREACHABLE();
  }
 
- // Validate if a filename is safe to use
- // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
- bool validate_file_name(const std::string & filename) {
- if (!filename.length()) {
- // Empty filename invalid
+ void string_process_escapes(std::string & input) {
+ std::size_t input_len = input.length();
+ std::size_t output_idx = 0;
+
+ for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
+ if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
+ switch (input[++input_idx]) {
+ case 'n': input[output_idx++] = '\n'; break;
+ case 'r': input[output_idx++] = '\r'; break;
+ case 't': input[output_idx++] = '\t'; break;
+ case '\'': input[output_idx++] = '\''; break;
+ case '\"': input[output_idx++] = '\"'; break;
+ case '\\': input[output_idx++] = '\\'; break;
+ case 'x':
+ // Handle \x12, etc
+ if (input_idx + 2 < input_len) {
+ const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
+ char *err_p = nullptr;
+ const long val = std::strtol(x, &err_p, 16);
+ if (err_p == x + 2) {
+ input_idx += 2;
+ input[output_idx++] = char(val);
+ break;
+ }
+ }
+ // fall through
+ default: input[output_idx++] = '\\';
+ input[output_idx++] = input[input_idx]; break;
+ }
+ } else {
+ input[output_idx++] = input[input_idx];
+ }
+ }
+
+ input.resize(output_idx);
+ }
+
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+ const char * sep = strchr(data, '=');
+ if (sep == nullptr || sep - data >= 128) {
+ fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+ return false;
+ }
+ llama_model_kv_override kvo;
+ std::strncpy(kvo.key, data, sep - data);
+ kvo.key[sep - data] = 0;
+ sep++;
+ if (strncmp(sep, "int:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+ kvo.val_i64 = std::atol(sep);
+ } else if (strncmp(sep, "float:", 6) == 0) {
+ sep += 6;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+ kvo.val_f64 = std::atof(sep);
+ } else if (strncmp(sep, "bool:", 5) == 0) {
+ sep += 5;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+ if (std::strcmp(sep, "true") == 0) {
+ kvo.val_bool = true;
+ } else if (std::strcmp(sep, "false") == 0) {
+ kvo.val_bool = false;
+ } else {
+ fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ } else if (strncmp(sep, "str:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+ if (strlen(sep) > 127) {
+ fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+ return false;
+ }
+ strncpy(kvo.val_str, sep, 127);
+ kvo.val_str[127] = '\0';
+ } else {
+ fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ overrides.emplace_back(std::move(kvo));
+ return true;
+ }
+
+ //
+ // Filesystem utils
+ //
+
+ // Validate if a filename is safe to use
+ // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
+ bool fs_validate_filename(const std::string & filename) {
+ if (!filename.length()) {
+ // Empty filename invalid
  return false;
  }
  if (filename.length() > 255) {
@@ -1707,181 +1786,260 @@ bool validate_file_name(const std::string & filename) {
  return true;
  }
 
- //
- // String utils
- //
+ // returns true if successful, false otherwise
+ bool fs_create_directory_with_parents(const std::string & path) {
+ #ifdef _WIN32
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+ std::wstring wpath = converter.from_bytes(path);
 
- std::vector<std::string> string_split(std::string input, char separator) {
- std::vector<std::string> parts;
- size_t separator_pos = input.find(separator);
- while (separator_pos != std::string::npos) {
- std::string part = input.substr(0, separator_pos);
- parts.emplace_back(part);
- input = input.substr(separator_pos + 1);
- separator_pos = input.find(separator);
+ // if the path already exists, check whether it's a directory
+ const DWORD attributes = GetFileAttributesW(wpath.c_str());
+ if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+ return true;
  }
- parts.emplace_back(input);
- return parts;
- }
 
- std::string string_strip(const std::string & str) {
- size_t start = 0;
- size_t end = str.size();
- while (start < end && std::isspace(str[start])) {
- start++;
- }
- while (end > start && std::isspace(str[end - 1])) {
- end--;
- }
- return str.substr(start, end - start);
- }
+ size_t pos_slash = 0;
 
- std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
- std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
- {"top_k", llama_sampler_type::TOP_K},
- {"top_p", llama_sampler_type::TOP_P},
- {"typical_p", llama_sampler_type::TYPICAL_P},
- {"min_p", llama_sampler_type::MIN_P},
- {"tfs_z", llama_sampler_type::TFS_Z},
- {"temperature", llama_sampler_type::TEMPERATURE}
- };
+ // process path from front to back, procedurally creating directories
+ while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
+ const std::wstring subpath = wpath.substr(0, pos_slash);
+ const wchar_t * test = subpath.c_str();
 
- // since samplers names are written multiple ways
- // make it ready for both system names and input names
- std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
- {"top-k", llama_sampler_type::TOP_K},
- {"top-p", llama_sampler_type::TOP_P},
- {"nucleus", llama_sampler_type::TOP_P},
- {"typical-p", llama_sampler_type::TYPICAL_P},
- {"typical", llama_sampler_type::TYPICAL_P},
- {"min-p", llama_sampler_type::MIN_P},
- {"tfs-z", llama_sampler_type::TFS_Z},
- {"tfs", llama_sampler_type::TFS_Z},
- {"temp", llama_sampler_type::TEMPERATURE}
- };
+ const bool success = CreateDirectoryW(test, NULL);
+ if (!success) {
+ const DWORD error = GetLastError();
 
- std::vector<llama_sampler_type> sampler_types;
- sampler_types.reserve(names.size());
- for (const auto & name : names)
- {
- auto sampler_item = sampler_canonical_name_map.find(name);
- if (sampler_item != sampler_canonical_name_map.end())
- {
- sampler_types.push_back(sampler_item->second);
- }
- else
- {
- if (allow_alt_names)
- {
- sampler_item = sampler_alt_name_map.find(name);
- if (sampler_item != sampler_alt_name_map.end())
- {
- sampler_types.push_back(sampler_item->second);
+ // if the path already exists, ensure that it's a directory
+ if (error == ERROR_ALREADY_EXISTS) {
+ const DWORD attributes = GetFileAttributesW(subpath.c_str());
+ if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+ return false;
  }
+ } else {
+ return false;
  }
  }
+
+ pos_slash += 1;
  }
- return sampler_types;
- }
 
- std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
- std::unordered_map<char, llama_sampler_type> sampler_name_map {
- {'k', llama_sampler_type::TOP_K},
- {'p', llama_sampler_type::TOP_P},
- {'y', llama_sampler_type::TYPICAL_P},
- {'m', llama_sampler_type::MIN_P},
- {'f', llama_sampler_type::TFS_Z},
- {'t', llama_sampler_type::TEMPERATURE}
- };
+ return true;
+ #else
+ // if the path already exists, check whether it's a directory
+ struct stat info;
+ if (stat(path.c_str(), &info) == 0) {
+ return S_ISDIR(info.st_mode);
+ }
+
+ size_t pos_slash = 1; // skip leading slashes for directory creation
+
+ // process path from front to back, procedurally creating directories
+ while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
+ const std::string subpath = path.substr(0, pos_slash);
+ struct stat info;
 
- std::vector<llama_sampler_type> sampler_types;
- sampler_types.reserve(names_string.size());
- for (const auto & c : names_string) {
- const auto sampler_item = sampler_name_map.find(c);
- if (sampler_item != sampler_name_map.end()) {
- sampler_types.push_back(sampler_item->second);
+ // if the path already exists, ensure that it's a directory
+ if (stat(subpath.c_str(), &info) == 0) {
+ if (!S_ISDIR(info.st_mode)) {
+ return false;
+ }
+ } else {
+ // create parent directories
+ const int ret = mkdir(subpath.c_str(), 0755);
+ if (ret != 0) {
+ return false;
+ }
  }
+
+ pos_slash += 1;
  }
- return sampler_types;
+
+ return true;
+ #endif // _WIN32
  }
 
- std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
- switch (sampler_type) {
- case llama_sampler_type::TOP_K: return "top_k";
- case llama_sampler_type::TFS_Z: return "tfs_z";
- case llama_sampler_type::TYPICAL_P: return "typical_p";
- case llama_sampler_type::TOP_P: return "top_p";
- case llama_sampler_type::MIN_P: return "min_p";
- case llama_sampler_type::TEMPERATURE: return "temperature";
- default : return "";
+ std::string fs_get_cache_directory() {
+ std::string cache_directory = "";
+ auto ensure_trailing_slash = [](std::string p) {
+ // Make sure to add trailing slash
+ if (p.back() != DIRECTORY_SEPARATOR) {
+ p += DIRECTORY_SEPARATOR;
+ }
+ return p;
+ };
+ if (getenv("LLAMA_CACHE")) {
+ cache_directory = std::getenv("LLAMA_CACHE");
+ } else {
+ #ifdef __linux__
+ if (std::getenv("XDG_CACHE_HOME")) {
+ cache_directory = std::getenv("XDG_CACHE_HOME");
+ } else {
+ cache_directory = std::getenv("HOME") + std::string("/.cache/");
+ }
+ #elif defined(__APPLE__)
+ cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
+ #elif defined(_WIN32)
+ cache_directory = std::getenv("LOCALAPPDATA");
+ #endif // __linux__
+ cache_directory = ensure_trailing_slash(cache_directory);
+ cache_directory += "llama.cpp";
  }
+ return ensure_trailing_slash(cache_directory);
  }
 
+
  //
  // Model utils
  //
 
- struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
- auto mparams = llama_model_default_params();
+ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+ auto mparams = llama_model_params_from_gpt_params(params);
 
- if (params.n_gpu_layers != -1) {
- mparams.n_gpu_layers = params.n_gpu_layers;
- }
- mparams.main_gpu = params.main_gpu;
- mparams.split_mode = params.split_mode;
- mparams.tensor_split = params.tensor_split;
- mparams.use_mmap = params.use_mmap;
- mparams.use_mlock = params.use_mlock;
- mparams.check_tensors = params.check_tensors;
- if (params.kv_overrides.empty()) {
- mparams.kv_overrides = NULL;
+ llama_model * model = nullptr;
+
+ if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+ } else if (!params.model_url.empty()) {
+ model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
  } else {
- GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
- mparams.kv_overrides = params.kv_overrides.data();
+ model = llama_load_model_from_file(params.model.c_str(), mparams);
  }
 
- return mparams;
- }
-
- static ggml_type kv_cache_type_from_str(const std::string & s) {
- if (s == "f32") {
- return GGML_TYPE_F32;
- }
- if (s == "f16") {
- return GGML_TYPE_F16;
- }
- if (s == "q8_0") {
- return GGML_TYPE_Q8_0;
- }
- if (s == "q4_0") {
- return GGML_TYPE_Q4_0;
- }
- if (s == "q4_1") {
- return GGML_TYPE_Q4_1;
- }
- if (s == "iq4_nl") {
- return GGML_TYPE_IQ4_NL;
- }
- if (s == "q5_0") {
- return GGML_TYPE_Q5_0;
- }
- if (s == "q5_1") {
- return GGML_TYPE_Q5_1;
+ if (model == NULL) {
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ return std::make_tuple(nullptr, nullptr);
  }
 
- throw std::runtime_error("Invalid cache type: " + s);
- }
+ auto cparams = llama_context_params_from_gpt_params(params);
 
- struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
- auto cparams = llama_context_default_params();
+ llama_context * lctx = llama_new_context_with_model(model, cparams);
+ if (lctx == NULL) {
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
 
- cparams.n_ctx = params.n_ctx;
- cparams.n_seq_max = params.n_parallel;
- cparams.n_batch = params.n_batch;
- cparams.n_ubatch = params.n_ubatch;
- cparams.n_threads = params.n_threads;
- cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
- cparams.seed = params.seed;
+ if (!params.control_vectors.empty()) {
+ if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+
+ const auto cvec = llama_control_vector_load(params.control_vectors);
+ if (cvec.n_embd == -1) {
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ int err = llama_control_vector_apply(lctx,
+ cvec.data.data(),
+ cvec.data.size(),
+ cvec.n_embd,
+ params.control_vector_layer_start,
+ params.control_vector_layer_end);
+ if (err) {
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+ }
+
+ for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+ const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
+ float lora_scale = std::get<1>(params.lora_adapter[i]);
+ int err = llama_model_apply_lora_from_file(model,
+ lora_adapter.c_str(),
+ lora_scale,
+ ((i > 0) || params.lora_base.empty())
+ ? NULL
+ : params.lora_base.c_str(),
+ params.n_threads);
+ if (err != 0) {
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+ }
+
+ if (params.ignore_eos) {
+ params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+ }
+
+ if (params.warmup) {
+ LOG("warming up the model with an empty run\n");
+
+ std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ llama_kv_cache_clear(lctx);
+ llama_synchronize(lctx);
+ llama_reset_timings(lctx);
+ }
+
+ return std::make_tuple(model, lctx);
+ }
+
+ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+ auto mparams = llama_model_default_params();
+
+ if (params.n_gpu_layers != -1) {
+ mparams.n_gpu_layers = params.n_gpu_layers;
+ }
+ mparams.rpc_servers = params.rpc_servers.c_str();
+ mparams.main_gpu = params.main_gpu;
+ mparams.split_mode = params.split_mode;
+ mparams.tensor_split = params.tensor_split;
+ mparams.use_mmap = params.use_mmap;
+ mparams.use_mlock = params.use_mlock;
+ mparams.check_tensors = params.check_tensors;
+ if (params.kv_overrides.empty()) {
+ mparams.kv_overrides = NULL;
+ } else {
+ GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+ mparams.kv_overrides = params.kv_overrides.data();
+ }
+
+ return mparams;
+ }
+
+ static ggml_type kv_cache_type_from_str(const std::string & s) {
+ if (s == "f32") {
+ return GGML_TYPE_F32;
+ }
+ if (s == "f16") {
+ return GGML_TYPE_F16;
+ }
+ if (s == "q8_0") {
+ return GGML_TYPE_Q8_0;
+ }
+ if (s == "q4_0") {
+ return GGML_TYPE_Q4_0;
+ }
+ if (s == "q4_1") {
+ return GGML_TYPE_Q4_1;
+ }
+ if (s == "iq4_nl") {
+ return GGML_TYPE_IQ4_NL;
+ }
+ if (s == "q5_0") {
+ return GGML_TYPE_Q5_0;
+ }
+ if (s == "q5_1") {
+ return GGML_TYPE_Q5_1;
+ }
+
+ throw std::runtime_error("Invalid cache type: " + s);
+ }
+
+ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+ auto cparams = llama_context_default_params();
+
+ cparams.n_ctx = params.n_ctx;
+ cparams.n_seq_max = params.n_parallel;
+ cparams.n_batch = params.n_batch;
+ cparams.n_ubatch = params.n_ubatch;
+ cparams.n_threads = params.n_threads;
+ cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ cparams.seed = params.seed;
  cparams.logits_all = params.logits_all;
  cparams.embeddings = params.embedding;
  cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1905,27 +2063,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
  return cparams;
  }
 
- void llama_batch_clear(struct llama_batch & batch) {
- batch.n_tokens = 0;
- }
-
- void llama_batch_add(
- struct llama_batch & batch,
- llama_token id,
- llama_pos pos,
- const std::vector<llama_seq_id> & seq_ids,
- bool logits) {
- batch.token [batch.n_tokens] = id;
- batch.pos [batch.n_tokens] = pos;
- batch.n_seq_id[batch.n_tokens] = seq_ids.size();
- for (size_t i = 0; i < seq_ids.size(); ++i) {
- batch.seq_id[batch.n_tokens][i] = seq_ids[i];
- }
- batch.logits [batch.n_tokens] = logits;
-
- batch.n_tokens++;
- }
-
  #ifdef LLAMA_USE_CURL
 
  static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -2256,90 +2393,29 @@ struct llama_model * llama_load_model_from_hf(
 
  #endif // LLAMA_USE_CURL
 
- std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
- auto mparams = llama_model_params_from_gpt_params(params);
-
- llama_model * model = nullptr;
-
- if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
- } else if (!params.model_url.empty()) {
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
- } else {
- model = llama_load_model_from_file(params.model.c_str(), mparams);
- }
-
- if (model == NULL) {
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
- return std::make_tuple(nullptr, nullptr);
- }
-
- auto cparams = llama_context_params_from_gpt_params(params);
-
- llama_context * lctx = llama_new_context_with_model(model, cparams);
- if (lctx == NULL) {
- fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
- llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
- }
-
- if (!params.control_vectors.empty()) {
- if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
- if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
-
- const auto cvec = llama_control_vector_load(params.control_vectors);
- if (cvec.n_embd == -1) {
- llama_free(lctx);
- llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
- }
-
- int err = llama_control_vector_apply(lctx,
- cvec.data.data(),
- cvec.data.size(),
- cvec.n_embd,
- params.control_vector_layer_start,
- params.control_vector_layer_end);
- if (err) {
- llama_free(lctx);
- llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
- }
- }
-
- for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
- const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
- float lora_scale = std::get<1>(params.lora_adapter[i]);
- int err = llama_model_apply_lora_from_file(model,
- lora_adapter.c_str(),
- lora_scale,
- ((i > 0) || params.lora_base.empty())
- ? NULL
- : params.lora_base.c_str(),
- params.n_threads);
- if (err != 0) {
- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
- llama_free(lctx);
- llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
- }
- }
-
- if (params.ignore_eos) {
- params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
- }
+ //
+ // Batch utils
+ //
 
- if (params.warmup) {
- LOG("warming up the model with an empty run\n");
+ void llama_batch_clear(struct llama_batch & batch) {
+ batch.n_tokens = 0;
+ }
 
- std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
- llama_kv_cache_clear(lctx);
- llama_synchronize(lctx);
- llama_reset_timings(lctx);
+ void llama_batch_add(
+ struct llama_batch & batch,
+ llama_token id,
+ llama_pos pos,
+ const std::vector<llama_seq_id> & seq_ids,
+ bool logits) {
+ batch.token [batch.n_tokens] = id;
+ batch.pos [batch.n_tokens] = pos;
+ batch.n_seq_id[batch.n_tokens] = seq_ids.size();
+ for (size_t i = 0; i < seq_ids.size(); ++i) {
+ batch.seq_id[batch.n_tokens][i] = seq_ids[i];
  }
+ batch.logits [batch.n_tokens] = logits;
 
- return std::make_tuple(model, lctx);
+ batch.n_tokens++;
  }
 
  //
@@ -2392,355 +2468,46 @@ std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_to
 
  std::string piece;
  std::string result;
-
- for (size_t i = 0; i < tokens.size(); ++i) {
- piece = llama_token_to_piece(ctx, tokens[i]);
-
- // remove the leading space of the first non-BOS token
- if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
- piece = piece.substr(1);
- }
-
- result += piece;
- }
-
- return result;
- }
-
- std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
- std::string piece;
- std::string result;
-
- for (size_t i = 0; i < tokens.size(); ++i) {
- piece = llama_token_to_piece(ctx, tokens[i]);
-
- result += piece;
- }
-
- // NOTE: the original tokenizer decodes bytes after collecting the pieces.
- return result;
- }
-
- bool llama_should_add_bos_token(const llama_model * model) {
- const int add_bos = llama_add_bos_token(model);
-
- return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
- }
-
- //
- // YAML utils
- //
-
- // returns true if successful, false otherwise
- bool create_directory_with_parents(const std::string & path) {
- #ifdef _WIN32
- std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
- std::wstring wpath = converter.from_bytes(path);
-
- // if the path already exists, check whether it's a directory
- const DWORD attributes = GetFileAttributesW(wpath.c_str());
- if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
- return true;
- }
-
- size_t pos_slash = 0;
-
- // process path from front to back, procedurally creating directories
- while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
- const std::wstring subpath = wpath.substr(0, pos_slash);
- const wchar_t * test = subpath.c_str();
-
- const bool success = CreateDirectoryW(test, NULL);
- if (!success) {
- const DWORD error = GetLastError();
-
- // if the path already exists, ensure that it's a directory
- if (error == ERROR_ALREADY_EXISTS) {
- const DWORD attributes = GetFileAttributesW(subpath.c_str());
- if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
- return false;
- }
- } else {
- return false;
- }
- }
-
- pos_slash += 1;
- }
-
- return true;
- #else
- // if the path already exists, check whether it's a directory
- struct stat info;
- if (stat(path.c_str(), &info) == 0) {
- return S_ISDIR(info.st_mode);
- }
-
- size_t pos_slash = 1; // skip leading slashes for directory creation
-
- // process path from front to back, procedurally creating directories
- while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
- const std::string subpath = path.substr(0, pos_slash);
- struct stat info;
-
- // if the path already exists, ensure that it's a directory
- if (stat(subpath.c_str(), &info) == 0) {
- if (!S_ISDIR(info.st_mode)) {
- return false;
- }
- } else {
- // create parent directories
- const int ret = mkdir(subpath.c_str(), 0755);
- if (ret != 0) {
- return false;
- }
- }
-
- pos_slash += 1;
- }
-
- return true;
- #endif // _WIN32
- }
-
- void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
- if (data.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- fprintf(stream, "%s: [", prop_name);
- for (size_t i = 0; i < data.size() - 1; ++i) {
- fprintf(stream, "%e, ", data[i]);
- }
- fprintf(stream, "%e]\n", data.back());
- }
-
- void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
- if (data.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- fprintf(stream, "%s: [", prop_name);
- for (size_t i = 0; i < data.size() - 1; ++i) {
- fprintf(stream, "%d, ", data[i]);
- }
- fprintf(stream, "%d]\n", data.back());
- }
-
- void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
- std::string data_str(data == NULL ? "" : data);
-
- if (data_str.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- size_t pos_start = 0;
- size_t pos_found = 0;
-
- if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
- data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
- data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
- data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
- data_str = "\"" + data_str + "\"";
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
- return;
- }
-
- if (data_str.find('\n') == std::string::npos) {
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
- return;
- }
-
- fprintf(stream, "%s: |\n", prop_name);
- while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
- fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
- pos_start = pos_found + 1;
- }
- }
-
- std::string get_sortable_timestamp() {
- using clock = std::chrono::system_clock;
-
- const clock::time_point current_time = clock::now();
- const time_t as_time_t = clock::to_time_t(current_time);
- char timestamp_no_ns[100];
- std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
-
- const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
- current_time.time_since_epoch() % 1000000000).count();
- char timestamp_ns[11];
- snprintf(timestamp_ns, 11, "%09" PRId64, ns);
-
- return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
- }
-
- void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
- const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
- const llama_sampling_params & sparams = params.sparams;
-
- fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
- fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
- fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
- fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
- fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
- fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
- fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
- fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
- fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
- fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
- fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
- fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
- fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
- fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
- fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
- fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
- fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
- fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
- #ifdef NDEBUG
- fprintf(stream, "debug: false\n");
- #else
- fprintf(stream, "debug: true\n");
- #endif // NDEBUG
-
- fprintf(stream, "model_desc: %s\n", model_desc);
- fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
- #ifdef __OPTIMIZE__
- fprintf(stream, "optimize: true\n");
- #else
- fprintf(stream, "optimize: false\n");
- #endif // __OPTIMIZE__
-
- fprintf(stream, "time: %s\n", timestamp.c_str());
-
- fprintf(stream, "\n");
- fprintf(stream, "###############\n");
- fprintf(stream, "# User Inputs #\n");
- fprintf(stream, "###############\n");
- fprintf(stream, "\n");
-
- fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
- fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
- dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
- fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
- fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
- fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
- fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
- fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
- fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
- fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
- dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
- fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
- fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
- fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-
- const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
- const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
- fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
-
- dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
- fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
- dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
- fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
- fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
- fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
- fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
- fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
- fprintf(stream, "logit_bias:\n");
- for (std::pair<llama_token, float> lb : sparams.logit_bias) {
- if (ignore_eos && lb.first == logit_bias_eos->first) {
- continue;
- }
- fprintf(stream, " %d: %f", lb.first, lb.second);
- }
-
- fprintf(stream, "lora:\n");
- for (std::tuple<std::string, float> la : params.lora_adapter) {
- if (std::get<1>(la) != 1.0f) {
- continue;
- }
- fprintf(stream, " - %s\n", std::get<0>(la).c_str());
- }
- fprintf(stream, "lora_scaled:\n");
2675
- for (std::tuple<std::string, float> la : params.lora_adapter) {
2676
- if (std::get<1>(la) == 1.0f) {
2677
- continue;
2678
- }
2679
- fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
2680
- }
2681
- fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
2682
- fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
2683
- fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
2684
- fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
2685
- fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
2686
- fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
2687
- fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
2688
- fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
2689
- fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
2690
- fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
2691
- fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
2692
- fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
2693
- fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
2694
- fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
2695
- fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
2696
- fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
2697
- fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
2698
- fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
2699
- dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
2700
- fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
2701
- fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
2702
- fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
2703
- dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
2704
- fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
2705
- fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
2706
-
2707
- fprintf(stream, "reverse_prompt:\n");
2708
- for (std::string ap : params.antiprompt) {
2709
- size_t pos = 0;
2710
- while ((pos = ap.find('\n', pos)) != std::string::npos) {
2711
- ap.replace(pos, 1, "\\n");
2712
- pos += 1;
2471
+
2472
+ for (size_t i = 0; i < tokens.size(); ++i) {
2473
+ piece = llama_token_to_piece(ctx, tokens[i]);
2474
+
2475
+ // remove the leading space of the first non-BOS token
2476
+ if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
2477
+ piece = piece.substr(1);
2713
2478
  }
2714
2479
 
2715
- fprintf(stream, " - %s\n", ap.c_str());
2480
+ result += piece;
2716
2481
  }
2717
2482
 
2718
- fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
2719
- fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
2720
- fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
2721
- fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
2722
- fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
2723
- fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
2724
- fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
2483
+ return result;
2484
+ }
2725
2485
 
2726
- const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
2727
- dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
2486
+ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
2487
+ std::string piece;
2488
+ std::string result;
2728
2489
 
2729
- fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
2730
- fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
2731
- fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
2732
- fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
2733
- fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
2734
- fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
2735
- fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
2736
- fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
2490
+ for (size_t i = 0; i < tokens.size(); ++i) {
2491
+ piece = llama_token_to_piece(ctx, tokens[i]);
2492
+
2493
+ result += piece;
2494
+ }
2495
+
2496
+ // NOTE: the original tokenizer decodes bytes after collecting the pieces.
2497
+ return result;
2498
+ }
2499
+
2500
+ bool llama_should_add_bos_token(const llama_model * model) {
2501
+ const int add_bos = llama_add_bos_token(model);
2502
+
2503
+ return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
2737
2504
  }
2738
2505
 
2739
2506
  //
2740
2507
  // KV cache utils
2741
2508
  //
2742
2509
 
2743
- void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
2510
+ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
2744
2511
  static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
2745
2512
 
2746
2513
  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
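The added lines above land the common tokenizer helpers in this part of common.cpp: the tail of the SPM detokenizer (presumably llama_detokenize_spm, which strips the leading space of the first non-BOS piece), `llama_detokenize_bpe`, which simply concatenates the pieces, and `llama_should_add_bos_token`, which falls back to the vocab type when the model does not state a BOS preference. A minimal round-trip sketch using these helpers; the `round_trip` wrapper and the pre-existing model/context are assumptions, not part of the diff:

```cpp
#include <string>
#include <vector>
#include "common.h"

// Tokenize and detokenize a string with the helpers shown above.
// Assumes `model` and `ctx` were created elsewhere (e.g. via
// llama_load_model_from_file / llama_new_context_with_model).
static std::string round_trip(llama_context * ctx, const llama_model * model, const std::string & text) {
    const bool add_bos = llama_should_add_bos_token(model); // honor the model's BOS setting

    std::vector<llama_token> tokens = llama_tokenize(ctx, text, add_bos);

    // pick the detokenizer that matches the vocabulary type
    return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM
        ? llama_detokenize_spm(ctx, tokens)
        : llama_detokenize_bpe(ctx, tokens);
}
```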
@@ -2763,7 +2530,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
  printf("\n=== Done dumping\n");
  }

- void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
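Both KV-cache dump helpers are renamed with the `llama_` prefix in these hunks (`dump_kv_cache_view` → `llama_kv_cache_dump_view`, `dump_kv_cache_view_seqs` → `llama_kv_cache_dump_view_seqs`); only the names change, the printing logic in the context lines stays the same. A hedged usage sketch built on the view API from llama.h; the `debug_dump_kv` name, the sequence cap of 4, and the row sizes are illustrative choices:

```cpp
#include "common.h"

// Print the current KV-cache occupancy using the renamed dump helpers.
// Assumes `ctx` is a live llama_context.
static void debug_dump_kv(llama_context * ctx) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4); // track up to 4 sequences per cell
    llama_kv_cache_view_update(ctx, &view);

    llama_kv_cache_dump_view(view, 80);       // one character per cell, 80 cells per row
    llama_kv_cache_dump_view_seqs(view, 40);  // per-sequence breakdown, 40 cells per row

    llama_kv_cache_view_free(&view);
}
```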
@@ -2811,6 +2578,10 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
  printf("\n=== Done dumping\n");
  }

+ //
+ // Embedding utils
+ //
+
  void llama_embd_normalize(const float * inp, float * out, int n) {
  double sum = 0.0;
  for (int i = 0; i < n; i++) {
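The only change in this hunk is the new `// Embedding utils` section header in front of `llama_embd_normalize`, whose signature is visible in the context lines. A small sketch of calling that helper; the `normalized_copy` wrapper is hypothetical:

```cpp
#include <vector>
#include "common.h"

// L2-normalize an embedding of length n_embd into a fresh buffer, e.g.
// before computing cosine similarity. `embd` would typically come from
// llama_get_embeddings(ctx) and n_embd from llama_n_embd(model).
static std::vector<float> normalized_copy(const float * embd, int n_embd) {
    std::vector<float> out(n_embd);
    llama_embd_normalize(embd, out.data(), n_embd);
    return out;
}
```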
@@ -2995,3 +2766,226 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
  return result;
  }
+
+ //
+ // YAML utils
+ //
+
+ void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
+ if (data.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ fprintf(stream, "%s: [", prop_name);
+ for (size_t i = 0; i < data.size() - 1; ++i) {
+ fprintf(stream, "%e, ", data[i]);
+ }
+ fprintf(stream, "%e]\n", data.back());
+ }
+
+ void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
+ if (data.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ fprintf(stream, "%s: [", prop_name);
+ for (size_t i = 0; i < data.size() - 1; ++i) {
+ fprintf(stream, "%d, ", data[i]);
+ }
+ fprintf(stream, "%d]\n", data.back());
+ }
+
+ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
+ std::string data_str(data == NULL ? "" : data);
+
+ if (data_str.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ size_t pos_start = 0;
+ size_t pos_found = 0;
+
+ if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
+ data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
+ data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+ data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
+ data_str = "\"" + data_str + "\"";
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+ return;
+ }
+
+ if (data_str.find('\n') == std::string::npos) {
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+ return;
+ }
+
+ fprintf(stream, "%s: |\n", prop_name);
+ while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
+ fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
+ pos_start = pos_found + 1;
+ }
+ }
+
+ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
+ const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+ const llama_sampling_params & sparams = params.sparams;
+
+ fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+ fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
+ fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
+ fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
+ fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
+ fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
+ fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
+ fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+ fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
+ fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+ fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
+ fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+ fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+ fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+ fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+ fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+ fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+ fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
+
+ #ifdef NDEBUG
+ fprintf(stream, "debug: false\n");
+ #else
+ fprintf(stream, "debug: true\n");
+ #endif // NDEBUG
+
+ fprintf(stream, "model_desc: %s\n", model_desc);
+ fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
+
+ #ifdef __OPTIMIZE__
+ fprintf(stream, "optimize: true\n");
+ #else
+ fprintf(stream, "optimize: false\n");
+ #endif // __OPTIMIZE__
+
+ fprintf(stream, "time: %s\n", timestamp.c_str());
+
+ fprintf(stream, "\n");
+ fprintf(stream, "###############\n");
+ fprintf(stream, "# User Inputs #\n");
+ fprintf(stream, "###############\n");
+ fprintf(stream, "\n");
+
+ fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
+ fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
+ yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
+ fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
+ fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
+ fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
+ fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+ fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
+ fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
+ fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
+ yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
+ fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
+ fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
+ fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
+
+ const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
+ const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+ fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
+
+ yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
+ fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
+ yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
+ fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
+ fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+ fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
+ fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
+ fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+ fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
+
+ fprintf(stream, "logit_bias:\n");
+ for (std::pair<llama_token, float> lb : sparams.logit_bias) {
+ if (ignore_eos && lb.first == logit_bias_eos->first) {
+ continue;
+ }
+ fprintf(stream, " %d: %f", lb.first, lb.second);
+ }
+
+ fprintf(stream, "lora:\n");
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
+ if (std::get<1>(la) != 1.0f) {
+ continue;
+ }
+ fprintf(stream, " - %s\n", std::get<0>(la).c_str());
+ }
+ fprintf(stream, "lora_scaled:\n");
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
+ if (std::get<1>(la) == 1.0f) {
+ continue;
+ }
+ fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
+ }
+ fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
+ fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+ fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
+ fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
+ fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
+ fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+ fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
+ fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
+ fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
+ fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
+ fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
+ fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
+ fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
+ fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+ fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
+ fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
+ fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
+ fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
+ yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
+ fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
+ fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
+ fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
+ yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
+ fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
+ fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
+
+ fprintf(stream, "reverse_prompt:\n");
+ for (std::string ap : params.antiprompt) {
+ size_t pos = 0;
+ while ((pos = ap.find('\n', pos)) != std::string::npos) {
+ ap.replace(pos, 1, "\\n");
+ pos += 1;
+ }
+
+ fprintf(stream, " - %s\n", ap.c_str());
+ }
+
+ fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
+ fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
+ fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
+ fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
+ fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+ fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
+ fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+
+ const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
+ yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
+
+ fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
+ fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+ fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
+ fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+ fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+ fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
+ fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+ fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+ }
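This last hunk re-adds the YAML logging helpers under their new `yaml_`-prefixed names (`yaml_dump_vector_float`, `yaml_dump_vector_int`, `yaml_dump_string_multiline`, `yaml_dump_non_result_info`), matching the removals earlier in the diff; `yaml_dump_non_result_info` also gains the `interactive_specials` and `cpu_has_sve` fields. A short sketch of the string and vector helpers in isolation; the `dump_prompt_yaml` wrapper and the `run.yml` path are illustrative, and in the examples the stream is typically a file opened under params.logdir:

```cpp
#include <cstdio>
#include <string>
#include <vector>
#include "common.h"

// Write a prompt and its token ids as YAML using the renamed helpers.
static void dump_prompt_yaml(const std::string & prompt, const std::vector<int> & prompt_tokens) {
    FILE * f = std::fopen("run.yml", "w"); // hypothetical output path
    if (f == NULL) {
        return;
    }

    yaml_dump_string_multiline(f, "prompt", prompt.c_str()); // block scalar (|) when the prompt spans lines
    yaml_dump_vector_int(f, "prompt_tokens", prompt_tokens); // flow-style [...] list, bare key when empty

    std::fclose(f);
}
```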