@fugood/llama.node 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/bin/darwin/arm64/default.metallib +0 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/default.metallib +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/LlamaContext.cpp +2 -2
  19. package/src/LoadSessionWorker.cpp +1 -0
  20. package/src/llama.cpp/CMakeLists.txt +72 -46
  21. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  22. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  23. package/src/llama.cpp/common/common.cpp +732 -752
  24. package/src/llama.cpp/common/common.h +47 -41
  25. package/src/llama.cpp/common/grammar-parser.cpp +1 -1
  26. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  27. package/src/llama.cpp/common/log.h +5 -5
  28. package/src/llama.cpp/common/sampling.cpp +89 -7
  29. package/src/llama.cpp/common/sampling.h +5 -0
  30. package/src/llama.cpp/common/train.cpp +2 -2
  31. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  32. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  33. package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  36. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  37. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  39. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
  40. package/src/llama.cpp/examples/llava/clip.h +1 -1
  41. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  42. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  43. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  44. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  45. package/src/llama.cpp/examples/main/main.cpp +24 -16
  46. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  47. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  48. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  49. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  50. package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
  51. package/src/llama.cpp/examples/server/server.cpp +21 -9
  52. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  53. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  54. package/src/llama.cpp/ggml-backend.c +0 -1
  55. package/src/llama.cpp/ggml-common.h +0 -54
  56. package/src/llama.cpp/ggml-cuda.h +1 -0
  57. package/src/llama.cpp/ggml-impl.h +51 -0
  58. package/src/llama.cpp/ggml-kompute.cpp +4 -0
  59. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  60. package/src/llama.cpp/ggml-quants.c +3700 -2041
  61. package/src/llama.cpp/ggml-rpc.cpp +188 -56
  62. package/src/llama.cpp/ggml-sycl.cpp +99 -530
  63. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  64. package/src/llama.cpp/ggml-vulkan.cpp +202 -225
  65. package/src/llama.cpp/ggml.c +1034 -1154
  66. package/src/llama.cpp/ggml.h +59 -31
  67. package/src/llama.cpp/llama.cpp +859 -609
  68. package/src/llama.cpp/llama.h +19 -6
  69. package/src/llama.cpp/requirements.txt +0 -1
  70. package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
  71. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  72. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  73. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  74. package/src/llama.cpp/unicode-data.h +15 -12
  75. package/src/llama.cpp/unicode.cpp +89 -111
  76. package/src/llama.cpp/unicode.h +44 -12
  77. package/src/llama.cpp/build.zig +0 -172
  78. package/src/llama.cpp/ggml-mpi.c +0 -216
  79. package/src/llama.cpp/ggml-mpi.h +0 -39
  80. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
@@ -73,7 +73,11 @@
73
73
 
74
74
  using json = nlohmann::ordered_json;
75
75
 
76
- int32_t get_num_physical_cores() {
76
+ //
77
+ // CPU utils
78
+ //
79
+
80
+ int32_t cpu_get_num_physical_cores() {
77
81
  #ifdef __linux__
78
82
  // enumerate the set of thread siblings, num entries is num cores
79
83
  std::unordered_set<std::string> siblings;
@@ -142,9 +146,9 @@ static bool is_running_on_efficiency_core(void) {
142
146
  return core_type == intel_atom;
143
147
  }
144
148
 
145
- static int count_math_cpus(int cpu_count) {
149
+ static int cpu_count_math_cpus(int n_cpu) {
146
150
  int result = 0;
147
- for (int cpu = 0; cpu < cpu_count; ++cpu) {
151
+ for (int cpu = 0; cpu < n_cpu; ++cpu) {
148
152
  if (pin_cpu(cpu)) {
149
153
  return -1;
150
154
  }
@@ -162,16 +166,16 @@ static int count_math_cpus(int cpu_count) {
162
166
  /**
163
167
  * Returns number of CPUs on system that are useful for math.
164
168
  */
165
- int get_math_cpu_count() {
169
+ int32_t cpu_get_num_math() {
166
170
  #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
167
- int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
168
- if (cpu_count < 1) {
169
- return get_num_physical_cores();
171
+ int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
172
+ if (n_cpu < 1) {
173
+ return cpu_get_num_physical_cores();
170
174
  }
171
175
  if (is_hybrid_cpu()) {
172
176
  cpu_set_t affinity;
173
177
  if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
174
- int result = count_math_cpus(cpu_count);
178
+ int result = cpu_count_math_cpus(n_cpu);
175
179
  pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
176
180
  if (result > 0) {
177
181
  return result;
@@ -179,108 +183,103 @@ int get_math_cpu_count() {
179
183
  }
180
184
  }
181
185
  #endif
182
- return get_num_physical_cores();
186
+ return cpu_get_num_physical_cores();
183
187
  }
184
188
 
185
- void process_escapes(std::string & input) {
186
- std::size_t input_len = input.length();
187
- std::size_t output_idx = 0;
189
+ //
190
+ // CLI argument parsing
191
+ //
188
192
 
189
- for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
190
- if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
191
- switch (input[++input_idx]) {
192
- case 'n': input[output_idx++] = '\n'; break;
193
- case 'r': input[output_idx++] = '\r'; break;
194
- case 't': input[output_idx++] = '\t'; break;
195
- case '\'': input[output_idx++] = '\''; break;
196
- case '\"': input[output_idx++] = '\"'; break;
197
- case '\\': input[output_idx++] = '\\'; break;
198
- case 'x':
199
- // Handle \x12, etc
200
- if (input_idx + 2 < input_len) {
201
- const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
202
- char *err_p = nullptr;
203
- const long val = std::strtol(x, &err_p, 16);
204
- if (err_p == x + 2) {
205
- input_idx += 2;
206
- input[output_idx++] = char(val);
207
- break;
208
- }
209
- }
210
- // fall through
211
- default: input[output_idx++] = '\\';
212
- input[output_idx++] = input[input_idx]; break;
193
+ void gpt_params_handle_model_default(gpt_params & params) {
194
+ if (!params.hf_repo.empty()) {
195
+ // short-hand to avoid specifying --hf-file -> default it to --model
196
+ if (params.hf_file.empty()) {
197
+ if (params.model.empty()) {
198
+ throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
213
199
  }
214
- } else {
215
- input[output_idx++] = input[input_idx];
200
+ params.hf_file = params.model;
201
+ } else if (params.model.empty()) {
202
+ std::string cache_directory = fs_get_cache_directory();
203
+ const bool success = fs_create_directory_with_parents(cache_directory);
204
+ if (!success) {
205
+ throw std::runtime_error("failed to create cache directory: " + cache_directory);
206
+ }
207
+ params.model = cache_directory + string_split(params.hf_file, '/').back();
208
+ }
209
+ } else if (!params.model_url.empty()) {
210
+ if (params.model.empty()) {
211
+ auto f = string_split(params.model_url, '#').front();
212
+ f = string_split(f, '?').front();
213
+ f = string_split(f, '/').back();
214
+ params.model = "models/" + f;
216
215
  }
216
+ } else if (params.model.empty()) {
217
+ params.model = DEFAULT_MODEL_PATH;
217
218
  }
219
+ }
218
220
 
219
- input.resize(output_idx);
221
+ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
222
+ bool invalid_param = false;
223
+ std::string arg;
224
+ const std::string arg_prefix = "--";
225
+ llama_sampling_params & sparams = params.sparams;
226
+
227
+ for (int i = 1; i < argc; i++) {
228
+ arg = argv[i];
229
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
230
+ std::replace(arg.begin(), arg.end(), '_', '-');
231
+ }
232
+ if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
233
+ throw std::invalid_argument("error: unknown argument: " + arg);
234
+ }
235
+ if (invalid_param) {
236
+ throw std::invalid_argument("error: invalid parameter for argument: " + arg);
237
+ }
238
+ }
239
+
240
+ if (params.prompt_cache_all &&
241
+ (params.interactive || params.interactive_first ||
242
+ params.instruct)) {
243
+
244
+ throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
245
+ }
246
+
247
+ gpt_params_handle_model_default(params);
248
+
249
+ if (params.escape) {
250
+ string_process_escapes(params.prompt);
251
+ string_process_escapes(params.input_prefix);
252
+ string_process_escapes(params.input_suffix);
253
+ string_process_escapes(sparams.cfg_negative_prompt);
254
+ for (auto & antiprompt : params.antiprompt) {
255
+ string_process_escapes(antiprompt);
256
+ }
257
+ }
258
+
259
+ if (!params.kv_overrides.empty()) {
260
+ params.kv_overrides.emplace_back();
261
+ params.kv_overrides.back().key[0] = 0;
262
+ }
263
+
264
+ return true;
220
265
  }
221
266
 
222
267
  bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
223
268
  bool result = true;
224
269
  try {
225
270
  if (!gpt_params_parse_ex(argc, argv, params)) {
226
- gpt_print_usage(argc, argv, gpt_params());
271
+ gpt_params_print_usage(argc, argv, gpt_params());
227
272
  exit(0);
228
273
  }
229
274
  }
230
275
  catch (const std::invalid_argument & ex) {
231
276
  fprintf(stderr, "%s\n", ex.what());
232
- gpt_print_usage(argc, argv, gpt_params());
277
+ gpt_params_print_usage(argc, argv, gpt_params());
233
278
  exit(1);
234
279
  }
235
280
  return result;
236
281
  }
237
282
 
238
- bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
239
- const char * sep = strchr(data, '=');
240
- if (sep == nullptr || sep - data >= 128) {
241
- fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
242
- return false;
243
- }
244
- llama_model_kv_override kvo;
245
- std::strncpy(kvo.key, data, sep - data);
246
- kvo.key[sep - data] = 0;
247
- sep++;
248
- if (strncmp(sep, "int:", 4) == 0) {
249
- sep += 4;
250
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
251
- kvo.val_i64 = std::atol(sep);
252
- } else if (strncmp(sep, "float:", 6) == 0) {
253
- sep += 6;
254
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
255
- kvo.val_f64 = std::atof(sep);
256
- } else if (strncmp(sep, "bool:", 5) == 0) {
257
- sep += 5;
258
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
259
- if (std::strcmp(sep, "true") == 0) {
260
- kvo.val_bool = true;
261
- } else if (std::strcmp(sep, "false") == 0) {
262
- kvo.val_bool = false;
263
- } else {
264
- fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
265
- return false;
266
- }
267
- } else if (strncmp(sep, "str:", 4) == 0) {
268
- sep += 4;
269
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
270
- if (strlen(sep) > 127) {
271
- fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
272
- return false;
273
- }
274
- strncpy(kvo.val_str, sep, 127);
275
- kvo.val_str[127] = '\0';
276
- } else {
277
- fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
278
- return false;
279
- }
280
- overrides.emplace_back(std::move(kvo));
281
- return true;
282
- }
283
-
284
283
  bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
285
284
  llama_sampling_params & sparams = params.sparams;
286
285
 
@@ -546,7 +545,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
546
545
  return true;
547
546
  }
548
547
  const auto sampler_names = string_split(argv[i], ';');
549
- sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
548
+ sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
550
549
  return true;
551
550
  }
552
551
  if (arg == "--sampling-seq") {
@@ -554,7 +553,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
554
553
  invalid_param = true;
555
554
  return true;
556
555
  }
557
- sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
556
+ sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
558
557
  return true;
559
558
  }
560
559
  if (arg == "--top-p") {
@@ -905,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
905
904
  params.interactive_specials = true;
906
905
  return true;
907
906
  }
907
+ if (arg == "--special") {
908
+ params.special = true;
909
+ return true;
910
+ }
908
911
  if (arg == "--embedding") {
909
912
  params.embedding = true;
910
913
  return true;
@@ -1240,7 +1243,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
1240
1243
  return true;
1241
1244
  }
1242
1245
  if (arg == "-h" || arg == "--help") {
1243
- gpt_print_usage(argc, argv, gpt_params());
1246
+ gpt_params_print_usage(argc, argv, gpt_params());
1244
1247
  exit(0);
1245
1248
  }
1246
1249
  if (arg == "--version") {
@@ -1311,7 +1314,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
1311
1314
  invalid_param = true;
1312
1315
  return true;
1313
1316
  }
1314
- if (!parse_kv_override(argv[i], params.kv_overrides)) {
1317
+ if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
1315
1318
  fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
1316
1319
  invalid_param = true;
1317
1320
  return true;
@@ -1345,83 +1348,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
1345
1348
  return false;
1346
1349
  }
1347
1350
 
1348
- void gpt_params_handle_model_default(gpt_params & params) {
1349
- if (!params.hf_repo.empty()) {
1350
- // short-hand to avoid specifying --hf-file -> default it to --model
1351
- if (params.hf_file.empty()) {
1352
- if (params.model.empty()) {
1353
- throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
1354
- }
1355
- params.hf_file = params.model;
1356
- } else if (params.model.empty()) {
1357
- params.model = "models/" + string_split(params.hf_file, '/').back();
1358
- }
1359
- } else if (!params.model_url.empty()) {
1360
- if (params.model.empty()) {
1361
- auto f = string_split(params.model_url, '#').front();
1362
- f = string_split(f, '?').front();
1363
- f = string_split(f, '/').back();
1364
- params.model = "models/" + f;
1365
- }
1366
- } else if (params.model.empty()) {
1367
- params.model = DEFAULT_MODEL_PATH;
1368
- }
1369
- }
1370
-
1371
- bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
1372
- bool invalid_param = false;
1373
- std::string arg;
1374
- const std::string arg_prefix = "--";
1375
- llama_sampling_params & sparams = params.sparams;
1376
-
1377
- for (int i = 1; i < argc; i++) {
1378
- arg = argv[i];
1379
- if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
1380
- std::replace(arg.begin(), arg.end(), '_', '-');
1381
- }
1382
- if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
1383
- throw std::invalid_argument("error: unknown argument: " + arg);
1384
- }
1385
- if (invalid_param) {
1386
- throw std::invalid_argument("error: invalid parameter for argument: " + arg);
1387
- }
1388
- }
1389
-
1390
- if (params.prompt_cache_all &&
1391
- (params.interactive || params.interactive_first ||
1392
- params.instruct)) {
1393
-
1394
- throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
1395
- }
1396
-
1397
- gpt_params_handle_model_default(params);
1398
-
1399
- if (params.escape) {
1400
- process_escapes(params.prompt);
1401
- process_escapes(params.input_prefix);
1402
- process_escapes(params.input_suffix);
1403
- process_escapes(sparams.cfg_negative_prompt);
1404
- for (auto & antiprompt : params.antiprompt) {
1405
- process_escapes(antiprompt);
1406
- }
1407
- }
1408
-
1409
- if (!params.kv_overrides.empty()) {
1410
- params.kv_overrides.emplace_back();
1411
- params.kv_overrides.back().key[0] = 0;
1412
- }
1413
-
1414
- return true;
1415
- }
1416
-
1417
- void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
1351
+ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
1418
1352
  const llama_sampling_params & sparams = params.sparams;
1419
1353
 
1420
1354
  std::string sampler_type_chars;
1421
1355
  std::string sampler_type_names;
1422
1356
  for (const auto sampler_type : sparams.samplers_sequence) {
1423
1357
  sampler_type_chars += static_cast<char>(sampler_type);
1424
- sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
1358
+ sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
1425
1359
  }
1426
1360
  sampler_type_names.pop_back();
1427
1361
 
@@ -1432,6 +1366,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
1432
1366
  printf(" -h, --help show this help message and exit\n");
1433
1367
  printf(" --version show version and build info\n");
1434
1368
  printf(" -i, --interactive run in interactive mode\n");
1369
+ printf(" --special special tokens output enabled\n");
1435
1370
  printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
1436
1371
  printf(" --interactive-first run in interactive mode and wait for input right away\n");
1437
1372
  printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
@@ -1618,7 +1553,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
1618
1553
  #endif // LOG_DISABLE_LOGS
1619
1554
  }
1620
1555
 
1621
- std::string get_system_info(const gpt_params & params) {
1556
+ std::string gpt_params_get_system_info(const gpt_params & params) {
1622
1557
  std::ostringstream os;
1623
1558
 
1624
1559
  os << "system_info: n_threads = " << params.n_threads;
@@ -1630,7 +1565,52 @@ std::string get_system_info(const gpt_params & params) {
1630
1565
  return os.str();
1631
1566
  }
1632
1567
 
1633
- std::string gpt_random_prompt(std::mt19937 & rng) {
1568
+ //
1569
+ // String utils
1570
+ //
1571
+
1572
+ std::vector<std::string> string_split(std::string input, char separator) {
1573
+ std::vector<std::string> parts;
1574
+ size_t separator_pos = input.find(separator);
1575
+ while (separator_pos != std::string::npos) {
1576
+ std::string part = input.substr(0, separator_pos);
1577
+ parts.emplace_back(part);
1578
+ input = input.substr(separator_pos + 1);
1579
+ separator_pos = input.find(separator);
1580
+ }
1581
+ parts.emplace_back(input);
1582
+ return parts;
1583
+ }
1584
+
1585
+ std::string string_strip(const std::string & str) {
1586
+ size_t start = 0;
1587
+ size_t end = str.size();
1588
+ while (start < end && std::isspace(str[start])) {
1589
+ start++;
1590
+ }
1591
+ while (end > start && std::isspace(str[end - 1])) {
1592
+ end--;
1593
+ }
1594
+ return str.substr(start, end - start);
1595
+ }
1596
+
1597
+ std::string string_get_sortable_timestamp() {
1598
+ using clock = std::chrono::system_clock;
1599
+
1600
+ const clock::time_point current_time = clock::now();
1601
+ const time_t as_time_t = clock::to_time_t(current_time);
1602
+ char timestamp_no_ns[100];
1603
+ std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
1604
+
1605
+ const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
1606
+ current_time.time_since_epoch() % 1000000000).count();
1607
+ char timestamp_ns[11];
1608
+ snprintf(timestamp_ns, 11, "%09" PRId64, ns);
1609
+
1610
+ return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
1611
+ }
1612
+
1613
+ std::string string_random_prompt(std::mt19937 & rng) {
1634
1614
  const int r = rng() % 10;
1635
1615
  switch (r) {
1636
1616
  case 0: return "So";
@@ -1648,17 +1628,104 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
1648
1628
  GGML_UNREACHABLE();
1649
1629
  }
1650
1630
 
1651
- // Validate if a filename is safe to use
1652
- // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
1653
- bool validate_file_name(const std::string & filename) {
1654
- if (!filename.length()) {
1655
- // Empty filename invalid
1656
- return false;
1657
- }
1658
- if (filename.length() > 255) {
1659
- // Limit at common largest possible filename on Linux filesystems
1660
- // to avoid unnecessary further validation
1661
- // (On systems with smaller limits it will be caught by the OS)
1631
+ void string_process_escapes(std::string & input) {
1632
+ std::size_t input_len = input.length();
1633
+ std::size_t output_idx = 0;
1634
+
1635
+ for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
1636
+ if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
1637
+ switch (input[++input_idx]) {
1638
+ case 'n': input[output_idx++] = '\n'; break;
1639
+ case 'r': input[output_idx++] = '\r'; break;
1640
+ case 't': input[output_idx++] = '\t'; break;
1641
+ case '\'': input[output_idx++] = '\''; break;
1642
+ case '\"': input[output_idx++] = '\"'; break;
1643
+ case '\\': input[output_idx++] = '\\'; break;
1644
+ case 'x':
1645
+ // Handle \x12, etc
1646
+ if (input_idx + 2 < input_len) {
1647
+ const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
1648
+ char *err_p = nullptr;
1649
+ const long val = std::strtol(x, &err_p, 16);
1650
+ if (err_p == x + 2) {
1651
+ input_idx += 2;
1652
+ input[output_idx++] = char(val);
1653
+ break;
1654
+ }
1655
+ }
1656
+ // fall through
1657
+ default: input[output_idx++] = '\\';
1658
+ input[output_idx++] = input[input_idx]; break;
1659
+ }
1660
+ } else {
1661
+ input[output_idx++] = input[input_idx];
1662
+ }
1663
+ }
1664
+
1665
+ input.resize(output_idx);
1666
+ }
1667
+
1668
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
1669
+ const char * sep = strchr(data, '=');
1670
+ if (sep == nullptr || sep - data >= 128) {
1671
+ fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
1672
+ return false;
1673
+ }
1674
+ llama_model_kv_override kvo;
1675
+ std::strncpy(kvo.key, data, sep - data);
1676
+ kvo.key[sep - data] = 0;
1677
+ sep++;
1678
+ if (strncmp(sep, "int:", 4) == 0) {
1679
+ sep += 4;
1680
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
1681
+ kvo.val_i64 = std::atol(sep);
1682
+ } else if (strncmp(sep, "float:", 6) == 0) {
1683
+ sep += 6;
1684
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
1685
+ kvo.val_f64 = std::atof(sep);
1686
+ } else if (strncmp(sep, "bool:", 5) == 0) {
1687
+ sep += 5;
1688
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
1689
+ if (std::strcmp(sep, "true") == 0) {
1690
+ kvo.val_bool = true;
1691
+ } else if (std::strcmp(sep, "false") == 0) {
1692
+ kvo.val_bool = false;
1693
+ } else {
1694
+ fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
1695
+ return false;
1696
+ }
1697
+ } else if (strncmp(sep, "str:", 4) == 0) {
1698
+ sep += 4;
1699
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
1700
+ if (strlen(sep) > 127) {
1701
+ fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
1702
+ return false;
1703
+ }
1704
+ strncpy(kvo.val_str, sep, 127);
1705
+ kvo.val_str[127] = '\0';
1706
+ } else {
1707
+ fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
1708
+ return false;
1709
+ }
1710
+ overrides.emplace_back(std::move(kvo));
1711
+ return true;
1712
+ }
1713
+
1714
+ //
1715
+ // Filesystem utils
1716
+ //
1717
+
1718
+ // Validate if a filename is safe to use
1719
+ // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
1720
+ bool fs_validate_filename(const std::string & filename) {
1721
+ if (!filename.length()) {
1722
+ // Empty filename invalid
1723
+ return false;
1724
+ }
1725
+ if (filename.length() > 255) {
1726
+ // Limit at common largest possible filename on Linux filesystems
1727
+ // to avoid unnecessary further validation
1728
+ // (On systems with smaller limits it will be caught by the OS)
1662
1729
  return false;
1663
1730
  }
1664
1731
 
@@ -1719,174 +1786,252 @@ bool validate_file_name(const std::string & filename) {
1719
1786
  return true;
1720
1787
  }
1721
1788
 
1722
- //
1723
- // String utils
1724
- //
1789
+ // returns true if successful, false otherwise
1790
+ bool fs_create_directory_with_parents(const std::string & path) {
1791
+ #ifdef _WIN32
1792
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
1793
+ std::wstring wpath = converter.from_bytes(path);
1725
1794
 
1726
- std::vector<std::string> string_split(std::string input, char separator) {
1727
- std::vector<std::string> parts;
1728
- size_t separator_pos = input.find(separator);
1729
- while (separator_pos != std::string::npos) {
1730
- std::string part = input.substr(0, separator_pos);
1731
- parts.emplace_back(part);
1732
- input = input.substr(separator_pos + 1);
1733
- separator_pos = input.find(separator);
1795
+ // if the path already exists, check whether it's a directory
1796
+ const DWORD attributes = GetFileAttributesW(wpath.c_str());
1797
+ if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
1798
+ return true;
1734
1799
  }
1735
- parts.emplace_back(input);
1736
- return parts;
1737
- }
1738
1800
 
1739
- std::string string_strip(const std::string & str) {
1740
- size_t start = 0;
1741
- size_t end = str.size();
1742
- while (start < end && std::isspace(str[start])) {
1743
- start++;
1744
- }
1745
- while (end > start && std::isspace(str[end - 1])) {
1746
- end--;
1747
- }
1748
- return str.substr(start, end - start);
1749
- }
1801
+ size_t pos_slash = 0;
1750
1802
 
1751
- std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
1752
- std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
1753
- {"top_k", llama_sampler_type::TOP_K},
1754
- {"top_p", llama_sampler_type::TOP_P},
1755
- {"typical_p", llama_sampler_type::TYPICAL_P},
1756
- {"min_p", llama_sampler_type::MIN_P},
1757
- {"tfs_z", llama_sampler_type::TFS_Z},
1758
- {"temperature", llama_sampler_type::TEMPERATURE}
1759
- };
1803
+ // process path from front to back, procedurally creating directories
1804
+ while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
1805
+ const std::wstring subpath = wpath.substr(0, pos_slash);
1806
+ const wchar_t * test = subpath.c_str();
1760
1807
 
1761
- // since samplers names are written multiple ways
1762
- // make it ready for both system names and input names
1763
- std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
1764
- {"top-k", llama_sampler_type::TOP_K},
1765
- {"top-p", llama_sampler_type::TOP_P},
1766
- {"nucleus", llama_sampler_type::TOP_P},
1767
- {"typical-p", llama_sampler_type::TYPICAL_P},
1768
- {"typical", llama_sampler_type::TYPICAL_P},
1769
- {"min-p", llama_sampler_type::MIN_P},
1770
- {"tfs-z", llama_sampler_type::TFS_Z},
1771
- {"tfs", llama_sampler_type::TFS_Z},
1772
- {"temp", llama_sampler_type::TEMPERATURE}
1773
- };
1808
+ const bool success = CreateDirectoryW(test, NULL);
1809
+ if (!success) {
1810
+ const DWORD error = GetLastError();
1774
1811
 
1775
- std::vector<llama_sampler_type> sampler_types;
1776
- sampler_types.reserve(names.size());
1777
- for (const auto & name : names)
1778
- {
1779
- auto sampler_item = sampler_canonical_name_map.find(name);
1780
- if (sampler_item != sampler_canonical_name_map.end())
1781
- {
1782
- sampler_types.push_back(sampler_item->second);
1783
- }
1784
- else
1785
- {
1786
- if (allow_alt_names)
1787
- {
1788
- sampler_item = sampler_alt_name_map.find(name);
1789
- if (sampler_item != sampler_alt_name_map.end())
1790
- {
1791
- sampler_types.push_back(sampler_item->second);
1812
+ // if the path already exists, ensure that it's a directory
1813
+ if (error == ERROR_ALREADY_EXISTS) {
1814
+ const DWORD attributes = GetFileAttributesW(subpath.c_str());
1815
+ if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
1816
+ return false;
1792
1817
  }
1818
+ } else {
1819
+ return false;
1793
1820
  }
1794
1821
  }
1822
+
1823
+ pos_slash += 1;
1795
1824
  }
1796
- return sampler_types;
1797
- }
1798
1825
 
1799
- std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
1800
- std::unordered_map<char, llama_sampler_type> sampler_name_map {
1801
- {'k', llama_sampler_type::TOP_K},
1802
- {'p', llama_sampler_type::TOP_P},
1803
- {'y', llama_sampler_type::TYPICAL_P},
1804
- {'m', llama_sampler_type::MIN_P},
1805
- {'f', llama_sampler_type::TFS_Z},
1806
- {'t', llama_sampler_type::TEMPERATURE}
1807
- };
1826
+ return true;
1827
+ #else
1828
+ // if the path already exists, check whether it's a directory
1829
+ struct stat info;
1830
+ if (stat(path.c_str(), &info) == 0) {
1831
+ return S_ISDIR(info.st_mode);
1832
+ }
1833
+
1834
+ size_t pos_slash = 1; // skip leading slashes for directory creation
1835
+
1836
+ // process path from front to back, procedurally creating directories
1837
+ while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
1838
+ const std::string subpath = path.substr(0, pos_slash);
1839
+ struct stat info;
1808
1840
 
1809
- std::vector<llama_sampler_type> sampler_types;
1810
- sampler_types.reserve(names_string.size());
1811
- for (const auto & c : names_string) {
1812
- const auto sampler_item = sampler_name_map.find(c);
1813
- if (sampler_item != sampler_name_map.end()) {
1814
- sampler_types.push_back(sampler_item->second);
1841
+ // if the path already exists, ensure that it's a directory
1842
+ if (stat(subpath.c_str(), &info) == 0) {
1843
+ if (!S_ISDIR(info.st_mode)) {
1844
+ return false;
1845
+ }
1846
+ } else {
1847
+ // create parent directories
1848
+ const int ret = mkdir(subpath.c_str(), 0755);
1849
+ if (ret != 0) {
1850
+ return false;
1851
+ }
1815
1852
  }
1853
+
1854
+ pos_slash += 1;
1816
1855
  }
1817
- return sampler_types;
1856
+
1857
+ return true;
1858
+ #endif // _WIN32
1818
1859
  }
1819
1860
 
1820
- std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
1821
- switch (sampler_type) {
1822
- case llama_sampler_type::TOP_K: return "top_k";
1823
- case llama_sampler_type::TFS_Z: return "tfs_z";
1824
- case llama_sampler_type::TYPICAL_P: return "typical_p";
1825
- case llama_sampler_type::TOP_P: return "top_p";
1826
- case llama_sampler_type::MIN_P: return "min_p";
1827
- case llama_sampler_type::TEMPERATURE: return "temperature";
1828
- default : return "";
1861
+ std::string fs_get_cache_directory() {
1862
+ std::string cache_directory = "";
1863
+ auto ensure_trailing_slash = [](std::string p) {
1864
+ // Make sure to add trailing slash
1865
+ if (p.back() != DIRECTORY_SEPARATOR) {
1866
+ p += DIRECTORY_SEPARATOR;
1867
+ }
1868
+ return p;
1869
+ };
1870
+ if (getenv("LLAMA_CACHE")) {
1871
+ cache_directory = std::getenv("LLAMA_CACHE");
1872
+ } else {
1873
+ #ifdef __linux__
1874
+ if (std::getenv("XDG_CACHE_HOME")) {
1875
+ cache_directory = std::getenv("XDG_CACHE_HOME");
1876
+ } else {
1877
+ cache_directory = std::getenv("HOME") + std::string("/.cache/");
1878
+ }
1879
+ #elif defined(__APPLE__)
1880
+ cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
1881
+ #elif defined(_WIN32)
1882
+ cache_directory = std::getenv("LOCALAPPDATA");
1883
+ #endif // __linux__
1884
+ cache_directory = ensure_trailing_slash(cache_directory);
1885
+ cache_directory += "llama.cpp";
1829
1886
  }
1887
+ return ensure_trailing_slash(cache_directory);
1830
1888
  }
1831
1889
 
1890
+
1832
1891
  //
1833
1892
  // Model utils
1834
1893
  //
1835
1894
 
1836
- struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
1837
- auto mparams = llama_model_default_params();
1895
+ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
1896
+ auto mparams = llama_model_params_from_gpt_params(params);
1838
1897
 
1839
- if (params.n_gpu_layers != -1) {
1840
- mparams.n_gpu_layers = params.n_gpu_layers;
1841
- }
1842
- mparams.rpc_servers = params.rpc_servers.c_str();
1843
- mparams.main_gpu = params.main_gpu;
1844
- mparams.split_mode = params.split_mode;
1845
- mparams.tensor_split = params.tensor_split;
1846
- mparams.use_mmap = params.use_mmap;
1847
- mparams.use_mlock = params.use_mlock;
1848
- mparams.check_tensors = params.check_tensors;
1849
- if (params.kv_overrides.empty()) {
1850
- mparams.kv_overrides = NULL;
1898
+ llama_model * model = nullptr;
1899
+
1900
+ if (!params.hf_repo.empty() && !params.hf_file.empty()) {
1901
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
1902
+ } else if (!params.model_url.empty()) {
1903
+ model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
1851
1904
  } else {
1852
- GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
1853
- mparams.kv_overrides = params.kv_overrides.data();
1905
+ model = llama_load_model_from_file(params.model.c_str(), mparams);
1854
1906
  }
1855
1907
 
1856
- return mparams;
1857
- }
1858
-
1859
- static ggml_type kv_cache_type_from_str(const std::string & s) {
1860
- if (s == "f32") {
1861
- return GGML_TYPE_F32;
1862
- }
1863
- if (s == "f16") {
1864
- return GGML_TYPE_F16;
1865
- }
1866
- if (s == "q8_0") {
1867
- return GGML_TYPE_Q8_0;
1868
- }
1869
- if (s == "q4_0") {
1870
- return GGML_TYPE_Q4_0;
1871
- }
1872
- if (s == "q4_1") {
1873
- return GGML_TYPE_Q4_1;
1874
- }
1875
- if (s == "iq4_nl") {
1876
- return GGML_TYPE_IQ4_NL;
1877
- }
1878
- if (s == "q5_0") {
1879
- return GGML_TYPE_Q5_0;
1880
- }
1881
- if (s == "q5_1") {
1882
- return GGML_TYPE_Q5_1;
1908
+ if (model == NULL) {
1909
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
1910
+ return std::make_tuple(nullptr, nullptr);
1883
1911
  }
1884
1912
 
1885
- throw std::runtime_error("Invalid cache type: " + s);
1886
- }
1913
+ auto cparams = llama_context_params_from_gpt_params(params);
1887
1914
 
1888
- struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
1889
- auto cparams = llama_context_default_params();
1915
+ llama_context * lctx = llama_new_context_with_model(model, cparams);
1916
+ if (lctx == NULL) {
1917
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
1918
+ llama_free_model(model);
1919
+ return std::make_tuple(nullptr, nullptr);
1920
+ }
1921
+
1922
+ if (!params.control_vectors.empty()) {
1923
+ if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
1924
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
1925
+
1926
+ const auto cvec = llama_control_vector_load(params.control_vectors);
1927
+ if (cvec.n_embd == -1) {
1928
+ llama_free(lctx);
1929
+ llama_free_model(model);
1930
+ return std::make_tuple(nullptr, nullptr);
1931
+ }
1932
+
1933
+ int err = llama_control_vector_apply(lctx,
1934
+ cvec.data.data(),
1935
+ cvec.data.size(),
1936
+ cvec.n_embd,
1937
+ params.control_vector_layer_start,
1938
+ params.control_vector_layer_end);
1939
+ if (err) {
1940
+ llama_free(lctx);
1941
+ llama_free_model(model);
1942
+ return std::make_tuple(nullptr, nullptr);
1943
+ }
1944
+ }
1945
+
1946
+ for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
1947
+ const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
1948
+ float lora_scale = std::get<1>(params.lora_adapter[i]);
1949
+ int err = llama_model_apply_lora_from_file(model,
1950
+ lora_adapter.c_str(),
1951
+ lora_scale,
1952
+ ((i > 0) || params.lora_base.empty())
1953
+ ? NULL
1954
+ : params.lora_base.c_str(),
1955
+ params.n_threads);
1956
+ if (err != 0) {
1957
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
1958
+ llama_free(lctx);
1959
+ llama_free_model(model);
1960
+ return std::make_tuple(nullptr, nullptr);
1961
+ }
1962
+ }
1963
+
1964
+ if (params.ignore_eos) {
1965
+ params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
1966
+ }
1967
+
1968
+ if (params.warmup) {
1969
+ LOG("warming up the model with an empty run\n");
1970
+
1971
+ std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
1972
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
1973
+ llama_kv_cache_clear(lctx);
1974
+ llama_synchronize(lctx);
1975
+ llama_reset_timings(lctx);
1976
+ }
1977
+
1978
+ return std::make_tuple(model, lctx);
1979
+ }
1980
+
1981
+ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
1982
+ auto mparams = llama_model_default_params();
1983
+
1984
+ if (params.n_gpu_layers != -1) {
1985
+ mparams.n_gpu_layers = params.n_gpu_layers;
1986
+ }
1987
+ mparams.rpc_servers = params.rpc_servers.c_str();
1988
+ mparams.main_gpu = params.main_gpu;
1989
+ mparams.split_mode = params.split_mode;
1990
+ mparams.tensor_split = params.tensor_split;
1991
+ mparams.use_mmap = params.use_mmap;
1992
+ mparams.use_mlock = params.use_mlock;
1993
+ mparams.check_tensors = params.check_tensors;
1994
+ if (params.kv_overrides.empty()) {
1995
+ mparams.kv_overrides = NULL;
1996
+ } else {
1997
+ GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
1998
+ mparams.kv_overrides = params.kv_overrides.data();
1999
+ }
2000
+
2001
+ return mparams;
2002
+ }
2003
+
2004
+ static ggml_type kv_cache_type_from_str(const std::string & s) {
2005
+ if (s == "f32") {
2006
+ return GGML_TYPE_F32;
2007
+ }
2008
+ if (s == "f16") {
2009
+ return GGML_TYPE_F16;
2010
+ }
2011
+ if (s == "q8_0") {
2012
+ return GGML_TYPE_Q8_0;
2013
+ }
2014
+ if (s == "q4_0") {
2015
+ return GGML_TYPE_Q4_0;
2016
+ }
2017
+ if (s == "q4_1") {
2018
+ return GGML_TYPE_Q4_1;
2019
+ }
2020
+ if (s == "iq4_nl") {
2021
+ return GGML_TYPE_IQ4_NL;
2022
+ }
2023
+ if (s == "q5_0") {
2024
+ return GGML_TYPE_Q5_0;
2025
+ }
2026
+ if (s == "q5_1") {
2027
+ return GGML_TYPE_Q5_1;
2028
+ }
2029
+
2030
+ throw std::runtime_error("Invalid cache type: " + s);
2031
+ }
2032
+
2033
+ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
2034
+ auto cparams = llama_context_default_params();
1890
2035
 
1891
2036
  cparams.n_ctx = params.n_ctx;
1892
2037
  cparams.n_seq_max = params.n_parallel;
@@ -1918,27 +2063,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
1918
2063
  return cparams;
1919
2064
  }
1920
2065
 
1921
- void llama_batch_clear(struct llama_batch & batch) {
1922
- batch.n_tokens = 0;
1923
- }
1924
-
1925
- void llama_batch_add(
1926
- struct llama_batch & batch,
1927
- llama_token id,
1928
- llama_pos pos,
1929
- const std::vector<llama_seq_id> & seq_ids,
1930
- bool logits) {
1931
- batch.token [batch.n_tokens] = id;
1932
- batch.pos [batch.n_tokens] = pos;
1933
- batch.n_seq_id[batch.n_tokens] = seq_ids.size();
1934
- for (size_t i = 0; i < seq_ids.size(); ++i) {
1935
- batch.seq_id[batch.n_tokens][i] = seq_ids[i];
1936
- }
1937
- batch.logits [batch.n_tokens] = logits;
1938
-
1939
- batch.n_tokens++;
1940
- }
1941
-
1942
2066
  #ifdef LLAMA_USE_CURL
1943
2067
 
1944
2068
  static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -2269,90 +2393,29 @@ struct llama_model * llama_load_model_from_hf(
2269
2393
 
2270
2394
  #endif // LLAMA_USE_CURL
2271
2395
 
2272
- std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
2273
- auto mparams = llama_model_params_from_gpt_params(params);
2274
-
2275
- llama_model * model = nullptr;
2276
-
2277
- if (!params.hf_repo.empty() && !params.hf_file.empty()) {
2278
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
2279
- } else if (!params.model_url.empty()) {
2280
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
2281
- } else {
2282
- model = llama_load_model_from_file(params.model.c_str(), mparams);
2283
- }
2284
-
2285
- if (model == NULL) {
2286
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
2287
- return std::make_tuple(nullptr, nullptr);
2288
- }
2289
-
2290
- auto cparams = llama_context_params_from_gpt_params(params);
2291
-
2292
- llama_context * lctx = llama_new_context_with_model(model, cparams);
2293
- if (lctx == NULL) {
2294
- fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
2295
- llama_free_model(model);
2296
- return std::make_tuple(nullptr, nullptr);
2297
- }
2298
-
2299
- if (!params.control_vectors.empty()) {
2300
- if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
2301
- if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
2302
-
2303
- const auto cvec = llama_control_vector_load(params.control_vectors);
2304
- if (cvec.n_embd == -1) {
2305
- llama_free(lctx);
2306
- llama_free_model(model);
2307
- return std::make_tuple(nullptr, nullptr);
2308
- }
2309
-
2310
- int err = llama_control_vector_apply(lctx,
2311
- cvec.data.data(),
2312
- cvec.data.size(),
2313
- cvec.n_embd,
2314
- params.control_vector_layer_start,
2315
- params.control_vector_layer_end);
2316
- if (err) {
2317
- llama_free(lctx);
2318
- llama_free_model(model);
2319
- return std::make_tuple(nullptr, nullptr);
2320
- }
2321
- }
2322
-
2323
- for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
2324
- const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
2325
- float lora_scale = std::get<1>(params.lora_adapter[i]);
2326
- int err = llama_model_apply_lora_from_file(model,
2327
- lora_adapter.c_str(),
2328
- lora_scale,
2329
- ((i > 0) || params.lora_base.empty())
2330
- ? NULL
2331
- : params.lora_base.c_str(),
2332
- params.n_threads);
2333
- if (err != 0) {
2334
- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
2335
- llama_free(lctx);
2336
- llama_free_model(model);
2337
- return std::make_tuple(nullptr, nullptr);
2338
- }
2339
- }
2340
-
2341
- if (params.ignore_eos) {
2342
- params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
2343
- }
2396
+ //
2397
+ // Batch utils
2398
+ //
2344
2399
 
2345
- if (params.warmup) {
2346
- LOG("warming up the model with an empty run\n");
2400
+ void llama_batch_clear(struct llama_batch & batch) {
2401
+ batch.n_tokens = 0;
2402
+ }
2347
2403
 
2348
- std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
2349
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
2350
- llama_kv_cache_clear(lctx);
2351
- llama_synchronize(lctx);
2352
- llama_reset_timings(lctx);
2404
+ void llama_batch_add(
2405
+ struct llama_batch & batch,
2406
+ llama_token id,
2407
+ llama_pos pos,
2408
+ const std::vector<llama_seq_id> & seq_ids,
2409
+ bool logits) {
2410
+ batch.token [batch.n_tokens] = id;
2411
+ batch.pos [batch.n_tokens] = pos;
2412
+ batch.n_seq_id[batch.n_tokens] = seq_ids.size();
2413
+ for (size_t i = 0; i < seq_ids.size(); ++i) {
2414
+ batch.seq_id[batch.n_tokens][i] = seq_ids[i];
2353
2415
  }
2416
+ batch.logits [batch.n_tokens] = logits;
2354
2417
 
2355
- return std::make_tuple(model, lctx);
2418
+ batch.n_tokens++;
2356
2419
  }
2357
2420
 
2358
2421
  //
@@ -2406,355 +2469,45 @@ std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_to
2406
2469
  std::string piece;
2407
2470
  std::string result;
2408
2471
 
2409
- for (size_t i = 0; i < tokens.size(); ++i) {
2410
- piece = llama_token_to_piece(ctx, tokens[i]);
2411
-
2412
- // remove the leading space of the first non-BOS token
2413
- if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
2414
- piece = piece.substr(1);
2415
- }
2416
-
2417
- result += piece;
2418
- }
2419
-
2420
- return result;
2421
- }
2422
-
2423
- std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
2424
- std::string piece;
2425
- std::string result;
2426
-
2427
- for (size_t i = 0; i < tokens.size(); ++i) {
2428
- piece = llama_token_to_piece(ctx, tokens[i]);
2429
-
2430
- result += piece;
2431
- }
2432
-
2433
- // NOTE: the original tokenizer decodes bytes after collecting the pieces.
2434
- return result;
2435
- }
2436
-
2437
- bool llama_should_add_bos_token(const llama_model * model) {
2438
- const int add_bos = llama_add_bos_token(model);
2439
-
2440
- return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
2441
- }
2442
-
2443
- //
2444
- // YAML utils
2445
- //
2446
-
2447
- // returns true if successful, false otherwise
2448
- bool create_directory_with_parents(const std::string & path) {
2449
- #ifdef _WIN32
2450
- std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
2451
- std::wstring wpath = converter.from_bytes(path);
2452
-
2453
- // if the path already exists, check whether it's a directory
2454
- const DWORD attributes = GetFileAttributesW(wpath.c_str());
2455
- if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
2456
- return true;
2457
- }
2458
-
2459
- size_t pos_slash = 0;
2460
-
2461
- // process path from front to back, procedurally creating directories
2462
- while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
2463
- const std::wstring subpath = wpath.substr(0, pos_slash);
2464
- const wchar_t * test = subpath.c_str();
2465
-
2466
- const bool success = CreateDirectoryW(test, NULL);
2467
- if (!success) {
2468
- const DWORD error = GetLastError();
2469
-
2470
- // if the path already exists, ensure that it's a directory
2471
- if (error == ERROR_ALREADY_EXISTS) {
2472
- const DWORD attributes = GetFileAttributesW(subpath.c_str());
2473
- if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
2474
- return false;
2475
- }
2476
- } else {
2477
- return false;
2478
- }
2479
- }
2480
-
2481
- pos_slash += 1;
2482
- }
2483
-
2484
- return true;
2485
- #else
2486
- // if the path already exists, check whether it's a directory
2487
- struct stat info;
2488
- if (stat(path.c_str(), &info) == 0) {
2489
- return S_ISDIR(info.st_mode);
2490
- }
2491
-
2492
- size_t pos_slash = 1; // skip leading slashes for directory creation
2493
-
2494
- // process path from front to back, procedurally creating directories
2495
- while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
2496
- const std::string subpath = path.substr(0, pos_slash);
2497
- struct stat info;
2498
-
2499
- // if the path already exists, ensure that it's a directory
2500
- if (stat(subpath.c_str(), &info) == 0) {
2501
- if (!S_ISDIR(info.st_mode)) {
2502
- return false;
2503
- }
2504
- } else {
2505
- // create parent directories
2506
- const int ret = mkdir(subpath.c_str(), 0755);
2507
- if (ret != 0) {
2508
- return false;
2509
- }
2510
- }
2511
-
2512
- pos_slash += 1;
2513
- }
2514
-
2515
- return true;
2516
- #endif // _WIN32
2517
- }
2518
-
2519
- void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
2520
- if (data.empty()) {
2521
- fprintf(stream, "%s:\n", prop_name);
2522
- return;
2523
- }
2524
-
2525
- fprintf(stream, "%s: [", prop_name);
2526
- for (size_t i = 0; i < data.size() - 1; ++i) {
2527
- fprintf(stream, "%e, ", data[i]);
2528
- }
2529
- fprintf(stream, "%e]\n", data.back());
2530
- }
2531
-
2532
- void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
2533
- if (data.empty()) {
2534
- fprintf(stream, "%s:\n", prop_name);
2535
- return;
2536
- }
2537
-
2538
- fprintf(stream, "%s: [", prop_name);
2539
- for (size_t i = 0; i < data.size() - 1; ++i) {
2540
- fprintf(stream, "%d, ", data[i]);
2541
- }
2542
- fprintf(stream, "%d]\n", data.back());
2543
- }
2544
-
2545
- void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
2546
- std::string data_str(data == NULL ? "" : data);
2547
-
2548
- if (data_str.empty()) {
2549
- fprintf(stream, "%s:\n", prop_name);
2550
- return;
2551
- }
2552
-
2553
- size_t pos_start = 0;
2554
- size_t pos_found = 0;
2555
-
2556
- if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
2557
- data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
2558
- data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
2559
- data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
2560
- data_str = "\"" + data_str + "\"";
2561
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
2562
- return;
2563
- }
2564
-
2565
- if (data_str.find('\n') == std::string::npos) {
2566
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
2567
- return;
2568
- }
2569
-
2570
- fprintf(stream, "%s: |\n", prop_name);
2571
- while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
2572
- fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
2573
- pos_start = pos_found + 1;
2574
- }
2575
- }
2576
-
2577
- std::string get_sortable_timestamp() {
2578
- using clock = std::chrono::system_clock;
2579
-
2580
- const clock::time_point current_time = clock::now();
2581
- const time_t as_time_t = clock::to_time_t(current_time);
2582
- char timestamp_no_ns[100];
2583
- std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
2584
-
2585
- const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
2586
- current_time.time_since_epoch() % 1000000000).count();
2587
- char timestamp_ns[11];
2588
- snprintf(timestamp_ns, 11, "%09" PRId64, ns);
2589
-
2590
- return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
2591
- }
2592
-
2593
- void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
2594
- const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
2595
- const llama_sampling_params & sparams = params.sparams;
2596
-
2597
- fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
2598
- fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
2599
- fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
2600
- fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
2601
- fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
2602
- fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
2603
- fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
2604
- fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
2605
- fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
2606
- fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
2607
- fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
2608
- fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
2609
- fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
2610
- fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
2611
- fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
2612
- fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
2613
- fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
2614
- fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
2615
- fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
2616
- fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
2617
- fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
2618
- fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
2619
- fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
2620
-
2621
- #ifdef NDEBUG
2622
- fprintf(stream, "debug: false\n");
2623
- #else
2624
- fprintf(stream, "debug: true\n");
2625
- #endif // NDEBUG
2626
-
2627
- fprintf(stream, "model_desc: %s\n", model_desc);
2628
- fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
2629
-
2630
- #ifdef __OPTIMIZE__
2631
- fprintf(stream, "optimize: true\n");
2632
- #else
2633
- fprintf(stream, "optimize: false\n");
2634
- #endif // __OPTIMIZE__
2635
-
2636
- fprintf(stream, "time: %s\n", timestamp.c_str());
2637
-
2638
- fprintf(stream, "\n");
2639
- fprintf(stream, "###############\n");
2640
- fprintf(stream, "# User Inputs #\n");
2641
- fprintf(stream, "###############\n");
2642
- fprintf(stream, "\n");
2643
-
2644
- fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
2645
- fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
2646
- dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
2647
- fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
2648
- fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
2649
- fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
2650
- fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
2651
- fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
2652
- fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
2653
- fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
2654
- dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
2655
- fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
2656
- fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
2657
- fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
2658
-
2659
- const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
2660
- const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
2661
- fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
2662
-
2663
- dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
2664
- fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
2665
- dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
2666
- fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
2667
- fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
2668
- fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
2669
- fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
2670
- fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
2671
- fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
2672
-
2673
- fprintf(stream, "logit_bias:\n");
2674
- for (std::pair<llama_token, float> lb : sparams.logit_bias) {
2675
- if (ignore_eos && lb.first == logit_bias_eos->first) {
2676
- continue;
2677
- }
2678
- fprintf(stream, " %d: %f", lb.first, lb.second);
2679
- }
2680
-
2681
- fprintf(stream, "lora:\n");
2682
- for (std::tuple<std::string, float> la : params.lora_adapter) {
2683
- if (std::get<1>(la) != 1.0f) {
2684
- continue;
2685
- }
2686
- fprintf(stream, " - %s\n", std::get<0>(la).c_str());
2687
- }
2688
- fprintf(stream, "lora_scaled:\n");
2689
- for (std::tuple<std::string, float> la : params.lora_adapter) {
2690
- if (std::get<1>(la) == 1.0f) {
2691
- continue;
2692
- }
2693
- fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
2694
- }
2695
- fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
2696
- fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
2697
- fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
2698
- fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
2699
- fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
2700
- fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
2701
- fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
2702
- fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
2703
- fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
2704
- fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
2705
- fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
2706
- fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
2707
- fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
2708
- fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
2709
- fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
2710
- fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
2711
- fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
2712
- fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
2713
- dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
2714
- fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
2715
- fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
2716
- fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
2717
- dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
2718
- fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
2719
- fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
2720
-
2721
- fprintf(stream, "reverse_prompt:\n");
2722
- for (std::string ap : params.antiprompt) {
2723
- size_t pos = 0;
2724
- while ((pos = ap.find('\n', pos)) != std::string::npos) {
2725
- ap.replace(pos, 1, "\\n");
2726
- pos += 1;
2472
+ for (size_t i = 0; i < tokens.size(); ++i) {
2473
+ piece = llama_token_to_piece(ctx, tokens[i]);
2474
+
2475
+ // remove the leading space of the first non-BOS token
2476
+ if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
2477
+ piece = piece.substr(1);
2727
2478
  }
2728
2479
 
2729
- fprintf(stream, " - %s\n", ap.c_str());
2480
+ result += piece;
2730
2481
  }
2731
2482
 
2732
- fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
2733
- fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
2734
- fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
2735
- fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
2736
- fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
2737
- fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
2738
- fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
2483
+ return result;
2484
+ }
2739
2485
 
2740
- const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
2741
- dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
2486
+ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
2487
+ std::string piece;
2488
+ std::string result;
2742
2489
 
2743
- fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
2744
- fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
2745
- fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
2746
- fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
2747
- fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
2748
- fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
2749
- fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
2750
- fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
2490
+ for (size_t i = 0; i < tokens.size(); ++i) {
2491
+ piece = llama_token_to_piece(ctx, tokens[i]);
2492
+
2493
+ result += piece;
2494
+ }
2495
+
2496
+ // NOTE: the original tokenizer decodes bytes after collecting the pieces.
2497
+ return result;
2498
+ }
2499
+
2500
+ bool llama_should_add_bos_token(const llama_model * model) {
2501
+ const int add_bos = llama_add_bos_token(model);
2502
+
2503
+ return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
2751
2504
  }
2752
2505
 
2753
2506
  //
2754
2507
  // KV cache utils
2755
2508
  //
2756
2509
 
2757
- void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
2510
+ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
2758
2511
  static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
2759
2512
 
2760
2513
  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -2777,7 +2530,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
  printf("\n=== Done dumping\n");
  }

- void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -2825,6 +2578,10 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
  printf("\n=== Done dumping\n");
  }

+ //
+ // Embedding utils
+ //
+
  void llama_embd_normalize(const float * inp, float * out, int n) {
  double sum = 0.0;
  for (int i = 0; i < n; i++) {
@@ -3009,3 +2766,226 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 
  return result;
  }
+
+ //
+ // YAML utils
+ //
+
+ void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
+ if (data.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ fprintf(stream, "%s: [", prop_name);
+ for (size_t i = 0; i < data.size() - 1; ++i) {
+ fprintf(stream, "%e, ", data[i]);
+ }
+ fprintf(stream, "%e]\n", data.back());
+ }
+
+ void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
+ if (data.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ fprintf(stream, "%s: [", prop_name);
+ for (size_t i = 0; i < data.size() - 1; ++i) {
+ fprintf(stream, "%d, ", data[i]);
+ }
+ fprintf(stream, "%d]\n", data.back());
+ }
+
+ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
+ std::string data_str(data == NULL ? "" : data);
+
+ if (data_str.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ size_t pos_start = 0;
+ size_t pos_found = 0;
+
+ if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
+ data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
+ data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+ data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
+ data_str = "\"" + data_str + "\"";
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+ return;
+ }
+
+ if (data_str.find('\n') == std::string::npos) {
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+ return;
+ }
+
+ fprintf(stream, "%s: |\n", prop_name);
+ while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
+ fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
+ pos_start = pos_found + 1;
+ }
+ }
+
+ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
+ const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+ const llama_sampling_params & sparams = params.sparams;
+
+ fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+ fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
+ fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
+ fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
+ fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
+ fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
+ fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
+ fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+ fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
+ fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+ fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
+ fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+ fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+ fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+ fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+ fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+ fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+ fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
+
+ #ifdef NDEBUG
+ fprintf(stream, "debug: false\n");
+ #else
+ fprintf(stream, "debug: true\n");
+ #endif // NDEBUG
+
+ fprintf(stream, "model_desc: %s\n", model_desc);
+ fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
+
+ #ifdef __OPTIMIZE__
+ fprintf(stream, "optimize: true\n");
+ #else
+ fprintf(stream, "optimize: false\n");
+ #endif // __OPTIMIZE__
+
+ fprintf(stream, "time: %s\n", timestamp.c_str());
+
+ fprintf(stream, "\n");
+ fprintf(stream, "###############\n");
+ fprintf(stream, "# User Inputs #\n");
+ fprintf(stream, "###############\n");
+ fprintf(stream, "\n");
+
+ fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
+ fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
+ yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
+ fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
+ fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
+ fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
+ fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+ fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
+ fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
+ fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
+ yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
+ fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
+ fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
+ fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
+
+ const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
+ const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+ fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
+
+ yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
+ fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
+ yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
+ fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
+ fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+ fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
+ fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
+ fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+ fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
+
+ fprintf(stream, "logit_bias:\n");
+ for (std::pair<llama_token, float> lb : sparams.logit_bias) {
+ if (ignore_eos && lb.first == logit_bias_eos->first) {
+ continue;
+ }
+ fprintf(stream, " %d: %f", lb.first, lb.second);
+ }
+
+ fprintf(stream, "lora:\n");
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
+ if (std::get<1>(la) != 1.0f) {
+ continue;
+ }
+ fprintf(stream, " - %s\n", std::get<0>(la).c_str());
+ }
+ fprintf(stream, "lora_scaled:\n");
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
+ if (std::get<1>(la) == 1.0f) {
+ continue;
+ }
+ fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
+ }
+ fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
+ fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+ fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
+ fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
+ fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
+ fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+ fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
+ fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
+ fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
+ fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
+ fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
+ fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
+ fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
+ fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+ fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
+ fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
+ fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
+ fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
+ yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
+ fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
+ fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
+ fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
+ yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
+ fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
+ fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
+
+ fprintf(stream, "reverse_prompt:\n");
+ for (std::string ap : params.antiprompt) {
+ size_t pos = 0;
+ while ((pos = ap.find('\n', pos)) != std::string::npos) {
+ ap.replace(pos, 1, "\\n");
+ pos += 1;
+ }
+
+ fprintf(stream, " - %s\n", ap.c_str());
+ }
+
+ fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
+ fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
+ fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
+ fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
+ fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+ fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
+ fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+
+ const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
+ yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
+
+ fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
+ fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+ fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
+ fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+ fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+ fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
+ fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+ fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+ }
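
Editorial note (not part of the published diff): this hunk renames the YAML logging helpers from dump_*_yaml / dump_string_yaml_multiline to the yaml_dump_* prefix, so downstream code that writes run logs through common.cpp has to follow the rename when upgrading. Below is a minimal caller-side sketch under that assumption; the include path, output file name, and sample values are hypothetical and only illustrate the signatures shown above.

    // Hypothetical example: write a small YAML fragment with the renamed helpers.
    // Assumes llama.cpp's common/ directory is on the include path.
    #include <cstdio>
    #include <vector>
    #include "common.h"  // assumed include path for the updated common helpers

    int main() {
        std::FILE * stream = std::fopen("run-info.yml", "w");  // hypothetical output file
        if (stream == NULL) {
            return 1;
        }

        // Formerly dump_vector_float_yaml / dump_vector_int_yaml / dump_string_yaml_multiline.
        yaml_dump_vector_float(stream, "tensor_split", std::vector<float>{0.5f, 0.5f});
        yaml_dump_vector_int(stream, "prompt_tokens", std::vector<int>{1, 15043, 3186});  // arbitrary sample ids
        yaml_dump_string_multiline(stream, "prompt", "Hello\nworld");

        std::fclose(stream);
        return 0;
    }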