cui-llama.rn 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76):
  1. package/LICENSE +20 -0
  2. package/README.md +330 -0
  3. package/android/build.gradle +107 -0
  4. package/android/gradle.properties +5 -0
  5. package/android/src/main/AndroidManifest.xml +4 -0
  6. package/android/src/main/CMakeLists.txt +69 -0
  7. package/android/src/main/java/com/rnllama/LlamaContext.java +353 -0
  8. package/android/src/main/java/com/rnllama/RNLlama.java +446 -0
  9. package/android/src/main/java/com/rnllama/RNLlamaPackage.java +48 -0
  10. package/android/src/main/jni.cpp +635 -0
  11. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +94 -0
  12. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +95 -0
  13. package/cpp/README.md +4 -0
  14. package/cpp/common.cpp +3237 -0
  15. package/cpp/common.h +467 -0
  16. package/cpp/ggml-aarch64.c +2193 -0
  17. package/cpp/ggml-aarch64.h +39 -0
  18. package/cpp/ggml-alloc.c +1041 -0
  19. package/cpp/ggml-alloc.h +76 -0
  20. package/cpp/ggml-backend-impl.h +153 -0
  21. package/cpp/ggml-backend.c +2225 -0
  22. package/cpp/ggml-backend.h +236 -0
  23. package/cpp/ggml-common.h +1829 -0
  24. package/cpp/ggml-impl.h +655 -0
  25. package/cpp/ggml-metal.h +65 -0
  26. package/cpp/ggml-metal.m +3273 -0
  27. package/cpp/ggml-quants.c +15022 -0
  28. package/cpp/ggml-quants.h +132 -0
  29. package/cpp/ggml.c +22034 -0
  30. package/cpp/ggml.h +2444 -0
  31. package/cpp/grammar-parser.cpp +536 -0
  32. package/cpp/grammar-parser.h +29 -0
  33. package/cpp/json-schema-to-grammar.cpp +1045 -0
  34. package/cpp/json-schema-to-grammar.h +8 -0
  35. package/cpp/json.hpp +24766 -0
  36. package/cpp/llama.cpp +21789 -0
  37. package/cpp/llama.h +1201 -0
  38. package/cpp/log.h +737 -0
  39. package/cpp/rn-llama.hpp +630 -0
  40. package/cpp/sampling.cpp +460 -0
  41. package/cpp/sampling.h +160 -0
  42. package/cpp/sgemm.cpp +1027 -0
  43. package/cpp/sgemm.h +14 -0
  44. package/cpp/unicode-data.cpp +7032 -0
  45. package/cpp/unicode-data.h +20 -0
  46. package/cpp/unicode.cpp +812 -0
  47. package/cpp/unicode.h +64 -0
  48. package/ios/RNLlama.h +11 -0
  49. package/ios/RNLlama.mm +302 -0
  50. package/ios/RNLlama.xcodeproj/project.pbxproj +278 -0
  51. package/ios/RNLlamaContext.h +39 -0
  52. package/ios/RNLlamaContext.mm +426 -0
  53. package/jest/mock.js +169 -0
  54. package/lib/commonjs/NativeRNLlama.js +10 -0
  55. package/lib/commonjs/NativeRNLlama.js.map +1 -0
  56. package/lib/commonjs/grammar.js +574 -0
  57. package/lib/commonjs/grammar.js.map +1 -0
  58. package/lib/commonjs/index.js +151 -0
  59. package/lib/commonjs/index.js.map +1 -0
  60. package/lib/module/NativeRNLlama.js +3 -0
  61. package/lib/module/NativeRNLlama.js.map +1 -0
  62. package/lib/module/grammar.js +566 -0
  63. package/lib/module/grammar.js.map +1 -0
  64. package/lib/module/index.js +129 -0
  65. package/lib/module/index.js.map +1 -0
  66. package/lib/typescript/NativeRNLlama.d.ts +107 -0
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -0
  68. package/lib/typescript/grammar.d.ts +38 -0
  69. package/lib/typescript/grammar.d.ts.map +1 -0
  70. package/lib/typescript/index.d.ts +46 -0
  71. package/lib/typescript/index.d.ts.map +1 -0
  72. package/llama-rn.podspec +56 -0
  73. package/package.json +230 -0
  74. package/src/NativeRNLlama.ts +132 -0
  75. package/src/grammar.ts +849 -0
  76. package/src/index.ts +182 -0
package/cpp/common.cpp ADDED
@@ -0,0 +1,3237 @@
1
+ #if defined(_MSC_VER)
2
+ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
3
+ #endif
4
+
5
+ #include "common.h"
6
+ // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
7
+ #define JSON_ASSERT LM_GGML_ASSERT
8
+ #include "json.hpp"
9
+ #include "json-schema-to-grammar.h"
10
+ #include "llama.h"
11
+
12
+ #include <algorithm>
13
+ #include <cinttypes>
14
+ #include <cmath>
15
+ #include <codecvt>
16
+ #include <cstdarg>
17
+ #include <cstring>
18
+ #include <ctime>
19
+ #include <fstream>
20
+ #include <iostream>
21
+ #include <iterator>
22
+ #include <regex>
23
+ #include <sstream>
24
+ #include <string>
25
+ #include <unordered_map>
26
+ #include <unordered_set>
27
+ #include <vector>
28
+
29
+ #if defined(__APPLE__) && defined(__MACH__)
30
+ #include <sys/types.h>
31
+ #include <sys/sysctl.h>
32
+ #endif
33
+
34
+ #if defined(_WIN32)
35
+ #define WIN32_LEAN_AND_MEAN
36
+ #ifndef NOMINMAX
37
+ # define NOMINMAX
38
+ #endif
39
+ #include <locale>
40
+ #include <windows.h>
41
+ #include <fcntl.h>
42
+ #include <io.h>
43
+ #else
44
+ #include <sys/ioctl.h>
45
+ #include <sys/stat.h>
46
+ #include <unistd.h>
47
+ #endif
48
+ #if defined(LLAMA_USE_CURL)
49
+ #include <curl/curl.h>
50
+ #include <curl/easy.h>
51
+ #include <thread>
52
+ #include <future>
53
+ #endif
54
+
55
// Build metadata exposed as plain globals.
// They are initialised here to neutral defaults (0 / "unknown").
int          LLAMA_BUILD_NUMBER = 0;
const char * LLAMA_COMMIT       = "unknown";
const char * LLAMA_COMPILER     = "unknown";
const char * LLAMA_BUILD_TARGET = "unknown";
60
+
61
+ #if defined(_MSC_VER)
62
+ #pragma warning(disable: 4244 4267) // possible loss of data
63
+ #endif
64
+
65
// Convenience feature macros derived from the backend selection macros:
//   LM_GGML_USE_CUDA_SYCL        - defined when CUDA or SYCL is enabled
//   LM_GGML_USE_CUDA_SYCL_VULKAN - defined when CUDA, SYCL or Vulkan is enabled
#if defined(LM_GGML_USE_CUDA) || defined(LM_GGML_USE_SYCL)
#    define LM_GGML_USE_CUDA_SYCL
#endif

#if defined(LM_GGML_USE_CUDA) || defined(LM_GGML_USE_SYCL) || defined(LM_GGML_USE_VULKAN)
#    define LM_GGML_USE_CUDA_SYCL_VULKAN
#endif
72
+
73
+ #if defined(LLAMA_USE_CURL)
74
+ #ifdef __linux__
75
+ #include <linux/limits.h>
76
+ #elif defined(_WIN32)
77
+ #define PATH_MAX MAX_PATH
78
+ #else
79
+ #include <sys/syslimits.h>
80
+ #endif
81
+ #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
82
+ #endif // LLAMA_USE_CURL
83
+
84
+ using json = nlohmann::ordered_json;
85
+
86
+ //
87
+ // CPU utils
88
+ //
89
+
90
/**
 * Estimates the number of physical CPU cores.
 *
 * - Linux: counts the distinct contents of
 *   /sys/devices/system/cpu/cpuN/topology/thread_siblings, scanning N upwards
 *   until a file cannot be opened.
 * - macOS: asks sysctl for "hw.perflevel0.physicalcpu", then "hw.physicalcpu".
 * - Otherwise, or when the above yields nothing: derives a value from
 *   std::thread::hardware_concurrency() (all threads when <= 4, half of them
 *   when more, and 4 when the count is unknown).
 */
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
    // every distinct sibling mask is counted as one core
    std::unordered_set<std::string> sibling_masks;
    uint32_t cpu_idx = 0;
    while (cpu_idx < UINT32_MAX) {
        const std::string topo_path = "/sys/devices/system/cpu/cpu"
            + std::to_string(cpu_idx) + "/topology/thread_siblings";
        std::ifstream topo_file(topo_path);
        if (!topo_file.is_open()) {
            break; // no more cpus
        }
        std::string mask;
        if (std::getline(topo_file, mask)) {
            sibling_masks.insert(mask);
        }
        ++cpu_idx;
    }
    if (!sibling_masks.empty()) {
        return static_cast<int32_t>(sibling_masks.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t n_cores;
    size_t  n_bytes = sizeof(n_cores);
    // first key that sysctl answers successfully wins
    const char * const sysctl_keys[] = { "hw.perflevel0.physicalcpu", "hw.physicalcpu" };
    for (const char * key : sysctl_keys) {
        if (sysctlbyname(key, &n_cores, &n_bytes, NULL, 0) == 0) {
            return n_cores;
        }
    }
#elif defined(_WIN32)
    //TODO: Implement
#endif
    const unsigned int hw_threads = std::thread::hardware_concurrency();
    if (hw_threads == 0) {
        return 4; // concurrency unknown
    }
    if (hw_threads <= 4) {
        return hw_threads;
    }
    return hw_threads / 2;
}
125
+
126
+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
127
+ #include <pthread.h>
128
+
129
/**
 * Executes the x86-64 CPUID instruction.
 *
 * @param leaf     value loaded into EAX before the instruction
 * @param subleaf  value loaded into ECX before the instruction
 * @param eax,ebx,ecx,edx  receive the four result registers
 *
 * RBX is parked in RSI around the instruction, and the EBX result is handed
 * back through RSI (the "=S" output), so RBX holds its original value again
 * when the statement finishes.
 *
 * The statement is `volatile` because CPUID results are not a pure function of
 * (leaf, subleaf): callers in this file re-run the same leaf after moving the
 * thread to another CPU. Without `volatile` the compiler may merge or hoist
 * identical invocations and reuse a stale result.
 */
static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__ volatile("movq\t%%rbx,%%rsi\n\t"
                     "cpuid\n\t"
                     "xchgq\t%%rbx,%%rsi"
                     : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
                     : "0"(leaf), "2"(subleaf));
}
137
+
138
/**
 * Restricts the calling thread's CPU affinity to the single CPU `cpu`.
 *
 * @return the result of pthread_setaffinity_np() (0 on success, an error
 *         number otherwise)
 */
static int pin_cpu(int cpu) {
    cpu_set_t only_this_cpu;
    CPU_ZERO(&only_this_cpu);
    CPU_SET(cpu, &only_this_cpu);

    const pthread_t self = pthread_self();
    return pthread_setaffinity_np(self, sizeof(only_this_cpu), &only_this_cpu);
}
144
+
145
+ static bool is_hybrid_cpu(void) {
146
+ unsigned eax, ebx, ecx, edx;
147
+ cpuid(7, 0, &eax, &ebx, &ecx, &edx);
148
+ return !!(edx & (1u << 15));
149
+ }
150
+
151
+ static bool is_running_on_efficiency_core(void) {
152
+ unsigned eax, ebx, ecx, edx;
153
+ cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
154
+ int intel_atom = 0x20;
155
+ int core_type = (eax & 0xff000000u) >> 24;
156
+ return core_type == intel_atom;
157
+ }
158
+
159
+ static int cpu_count_math_cpus(int n_cpu) {
160
+ int result = 0;
161
+ for (int cpu = 0; cpu < n_cpu; ++cpu) {
162
+ if (pin_cpu(cpu)) {
163
+ return -1;
164
+ }
165
+ if (is_running_on_efficiency_core()) {
166
+ continue; // efficiency cores harm lockstep threading
167
+ }
168
+ ++cpu; // hyperthreading isn't useful for linear algebra
169
+ ++result;
170
+ }
171
+ return result;
172
+ }
173
+
174
+ #endif // __x86_64__ && __linux__
175
+
176
+ /**
177
+ * Returns number of CPUs on system that are useful for math.
178
+ */
179
+ int32_t cpu_get_num_math() {
180
+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
181
+ int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
182
+ if (n_cpu < 1) {
183
+ return cpu_get_num_physical_cores();
184
+ }
185
+ if (is_hybrid_cpu()) {
186
+ cpu_set_t affinity;
187
+ if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
188
+ int result = cpu_count_math_cpus(n_cpu);
189
+ pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
190
+ if (result > 0) {
191
+ return result;
192
+ }
193
+ }
194
+ }
195
+ #endif
196
+ return cpu_get_num_physical_cores();
197
+ }
198
+
199
+ //
200
+ // CLI argument parsing
201
+ //
202
+
203
+ void gpt_params_handle_hf_token(gpt_params & params) {
204
+ if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
205
+ params.hf_token = std::getenv("HF_TOKEN");
206
+ }
207
+ }
208
+
209
+ void gpt_params_handle_model_default(gpt_params & params) {
210
+ if (!params.hf_repo.empty()) {
211
+ // short-hand to avoid specifying --hf-file -> default it to --model
212
+ if (params.hf_file.empty()) {
213
+ if (params.model.empty()) {
214
+ throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
215
+ }
216
+ params.hf_file = params.model;
217
+ } else if (params.model.empty()) {
218
+ params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
219
+ }
220
+ } else if (!params.model_url.empty()) {
221
+ if (params.model.empty()) {
222
+ auto f = string_split(params.model_url, '#').front();
223
+ f = string_split(f, '?').front();
224
+ params.model = fs_get_cache_file(string_split(f, '/').back());
225
+ }
226
+ } else if (params.model.empty()) {
227
+ params.model = DEFAULT_MODEL_PATH;
228
+ }
229
+ }
230
+
231
+ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
232
+ bool invalid_param = false;
233
+ std::string arg;
234
+ const std::string arg_prefix = "--";
235
+ llama_sampling_params & sparams = params.sparams;
236
+
237
+ for (int i = 1; i < argc; i++) {
238
+ arg = argv[i];
239
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
240
+ std::replace(arg.begin(), arg.end(), '_', '-');
241
+ }
242
+ if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
243
+ throw std::invalid_argument("error: unknown argument: " + arg);
244
+ }
245
+ if (invalid_param) {
246
+ throw std::invalid_argument("error: invalid parameter for argument: " + arg);
247
+ }
248
+ }
249
+
250
+ if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
251
+ throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
252
+ }
253
+
254
+ gpt_params_handle_model_default(params);
255
+
256
+ gpt_params_handle_hf_token(params);
257
+
258
+ if (params.escape) {
259
+ string_process_escapes(params.prompt);
260
+ string_process_escapes(params.input_prefix);
261
+ string_process_escapes(params.input_suffix);
262
+ string_process_escapes(sparams.cfg_negative_prompt);
263
+ for (auto & antiprompt : params.antiprompt) {
264
+ string_process_escapes(antiprompt);
265
+ }
266
+ }
267
+
268
+ if (!params.kv_overrides.empty()) {
269
+ params.kv_overrides.emplace_back();
270
+ params.kv_overrides.back().key[0] = 0;
271
+ }
272
+
273
+ return true;
274
+ }
275
+
276
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
277
+ const auto params_org = params; // the example can modify the default params
278
+
279
+ try {
280
+ if (!gpt_params_parse_ex(argc, argv, params) || params.usage) {
281
+ params = params_org;
282
+ params.usage = true;
283
+ return false;
284
+ }
285
+ } catch (const std::invalid_argument & ex) {
286
+ fprintf(stderr, "%s\n", ex.what());
287
+ params = params_org;
288
+ return false;
289
+ }
290
+
291
+ return true;
292
+ }
293
+
294
// Used at the top of an option handler that needs a value: advances `i` to the
// value's index; if argv has no such element, flags the option as invalid and
// returns true from the enclosing function. Expects `i`, `argc` and
// `invalid_param` to be in scope. Deliberately written to be used without a
// trailing semicolon.
#define CHECK_ARG                 \
    if (++i >= argc) {            \
        invalid_param = true;     \
        return true;              \
    }
295
+
296
+ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
297
+ const char split_delim = ',';
298
+
299
+ llama_sampling_params & sparams = params.sparams;
300
+
301
+ if (arg == "-s" || arg == "--seed") {
302
+ CHECK_ARG
303
+ // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
304
+ params.seed = std::stoul(argv[i]);
305
+ sparams.seed = std::stoul(argv[i]);
306
+ return true;
307
+ }
308
+ if (arg == "-t" || arg == "--threads") {
309
+ CHECK_ARG
310
+ params.n_threads = std::stoi(argv[i]);
311
+ if (params.n_threads <= 0) {
312
+ params.n_threads = std::thread::hardware_concurrency();
313
+ }
314
+ return true;
315
+ }
316
+ if (arg == "-tb" || arg == "--threads-batch") {
317
+ CHECK_ARG
318
+ params.n_threads_batch = std::stoi(argv[i]);
319
+ if (params.n_threads_batch <= 0) {
320
+ params.n_threads_batch = std::thread::hardware_concurrency();
321
+ }
322
+ return true;
323
+ }
324
+ if (arg == "-td" || arg == "--threads-draft") {
325
+ CHECK_ARG
326
+ params.n_threads_draft = std::stoi(argv[i]);
327
+ if (params.n_threads_draft <= 0) {
328
+ params.n_threads_draft = std::thread::hardware_concurrency();
329
+ }
330
+ return true;
331
+ }
332
+ if (arg == "-tbd" || arg == "--threads-batch-draft") {
333
+ CHECK_ARG
334
+ params.n_threads_batch_draft = std::stoi(argv[i]);
335
+ if (params.n_threads_batch_draft <= 0) {
336
+ params.n_threads_batch_draft = std::thread::hardware_concurrency();
337
+ }
338
+ return true;
339
+ }
340
+ if (arg == "-p" || arg == "--prompt") {
341
+ CHECK_ARG
342
+ params.prompt = argv[i];
343
+ return true;
344
+ }
345
+ if (arg == "-e" || arg == "--escape") {
346
+ params.escape = true;
347
+ return true;
348
+ }
349
+ if (arg == "--no-escape") {
350
+ params.escape = false;
351
+ return true;
352
+ }
353
+ if (arg == "--prompt-cache") {
354
+ CHECK_ARG
355
+ params.path_prompt_cache = argv[i];
356
+ return true;
357
+ }
358
+ if (arg == "--prompt-cache-all") {
359
+ params.prompt_cache_all = true;
360
+ return true;
361
+ }
362
+ if (arg == "--prompt-cache-ro") {
363
+ params.prompt_cache_ro = true;
364
+ return true;
365
+ }
366
+ if (arg == "-bf" || arg == "--binary-file") {
367
+ CHECK_ARG
368
+ std::ifstream file(argv[i], std::ios::binary);
369
+ if (!file) {
370
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
371
+ invalid_param = true;
372
+ return true;
373
+ }
374
+ // store the external file name in params
375
+ params.prompt_file = argv[i];
376
+ std::ostringstream ss;
377
+ ss << file.rdbuf();
378
+ params.prompt = ss.str();
379
+ fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
380
+ return true;
381
+ }
382
+ if (arg == "-f" || arg == "--file") {
383
+ CHECK_ARG
384
+ std::ifstream file(argv[i]);
385
+ if (!file) {
386
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
387
+ invalid_param = true;
388
+ return true;
389
+ }
390
+ // store the external file name in params
391
+ params.prompt_file = argv[i];
392
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
393
+ if (!params.prompt.empty() && params.prompt.back() == '\n') {
394
+ params.prompt.pop_back();
395
+ }
396
+ return true;
397
+ }
398
+ if (arg == "--in-file") {
399
+ CHECK_ARG
400
+ std::ifstream file(argv[i]);
401
+ if (!file) {
402
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
403
+ invalid_param = true;
404
+ return true;
405
+ }
406
+ params.in_files.push_back(argv[i]);
407
+ return true;
408
+ }
409
+ if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
410
+ CHECK_ARG
411
+ params.n_predict = std::stoi(argv[i]);
412
+ return true;
413
+ }
414
+ if (arg == "--top-k") {
415
+ CHECK_ARG
416
+ sparams.top_k = std::stoi(argv[i]);
417
+ return true;
418
+ }
419
+ if (arg == "-c" || arg == "--ctx-size") {
420
+ CHECK_ARG
421
+ params.n_ctx = std::stoi(argv[i]);
422
+ return true;
423
+ }
424
+ if (arg == "--grp-attn-n" || arg == "-gan") {
425
+ CHECK_ARG
426
+ params.grp_attn_n = std::stoi(argv[i]);
427
+ return true;
428
+ }
429
+ if (arg == "--grp-attn-w" || arg == "-gaw") {
430
+ CHECK_ARG
431
+ params.grp_attn_w = std::stoi(argv[i]);
432
+ return true;
433
+ }
434
+ if (arg == "--rope-freq-base") {
435
+ CHECK_ARG
436
+ params.rope_freq_base = std::stof(argv[i]);
437
+ return true;
438
+ }
439
+ if (arg == "--rope-freq-scale") {
440
+ CHECK_ARG
441
+ params.rope_freq_scale = std::stof(argv[i]);
442
+ return true;
443
+ }
444
+ if (arg == "--rope-scaling") {
445
+ CHECK_ARG
446
+ std::string value(argv[i]);
447
+ /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
448
+ else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
449
+ else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
450
+ else { invalid_param = true; }
451
+ return true;
452
+ }
453
+ if (arg == "--rope-scale") {
454
+ CHECK_ARG
455
+ params.rope_freq_scale = 1.0f / std::stof(argv[i]);
456
+ return true;
457
+ }
458
+ if (arg == "--yarn-orig-ctx") {
459
+ CHECK_ARG
460
+ params.yarn_orig_ctx = std::stoi(argv[i]);
461
+ return true;
462
+ }
463
+ if (arg == "--yarn-ext-factor") {
464
+ CHECK_ARG
465
+ params.yarn_ext_factor = std::stof(argv[i]);
466
+ return true;
467
+ }
468
+ if (arg == "--yarn-attn-factor") {
469
+ CHECK_ARG
470
+ params.yarn_attn_factor = std::stof(argv[i]);
471
+ return true;
472
+ }
473
+ if (arg == "--yarn-beta-fast") {
474
+ CHECK_ARG
475
+ params.yarn_beta_fast = std::stof(argv[i]);
476
+ return true;
477
+ }
478
+ if (arg == "--yarn-beta-slow") {
479
+ CHECK_ARG
480
+ params.yarn_beta_slow = std::stof(argv[i]);
481
+ return true;
482
+ }
483
+ if (arg == "--pooling") {
484
+ CHECK_ARG
485
+ std::string value(argv[i]);
486
+ /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
487
+ else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
488
+ else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
489
+ else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
490
+ else { invalid_param = true; }
491
+ return true;
492
+ }
493
+ if (arg == "--attention") {
494
+ CHECK_ARG
495
+ std::string value(argv[i]);
496
+ /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
497
+ else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
498
+ else { invalid_param = true; }
499
+ return true;
500
+ }
501
+ if (arg == "--defrag-thold" || arg == "-dt") {
502
+ CHECK_ARG
503
+ params.defrag_thold = std::stof(argv[i]);
504
+ return true;
505
+ }
506
+ if (arg == "--samplers") {
507
+ CHECK_ARG
508
+ const auto sampler_names = string_split(argv[i], ';');
509
+ sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
510
+ return true;
511
+ }
512
+ if (arg == "--sampling-seq") {
513
+ CHECK_ARG
514
+ sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
515
+ return true;
516
+ }
517
+ if (arg == "--top-p") {
518
+ CHECK_ARG
519
+ sparams.top_p = std::stof(argv[i]);
520
+ return true;
521
+ }
522
+ if (arg == "--min-p") {
523
+ CHECK_ARG
524
+ sparams.min_p = std::stof(argv[i]);
525
+ return true;
526
+ }
527
+ if (arg == "--temp") {
528
+ CHECK_ARG
529
+ sparams.temp = std::stof(argv[i]);
530
+ sparams.temp = std::max(sparams.temp, 0.0f);
531
+ return true;
532
+ }
533
+ if (arg == "--tfs") {
534
+ CHECK_ARG
535
+ sparams.tfs_z = std::stof(argv[i]);
536
+ return true;
537
+ }
538
+ if (arg == "--typical") {
539
+ CHECK_ARG
540
+ sparams.typical_p = std::stof(argv[i]);
541
+ return true;
542
+ }
543
+ if (arg == "--repeat-last-n") {
544
+ CHECK_ARG
545
+ sparams.penalty_last_n = std::stoi(argv[i]);
546
+ sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
547
+ return true;
548
+ }
549
+ if (arg == "--repeat-penalty") {
550
+ CHECK_ARG
551
+ sparams.penalty_repeat = std::stof(argv[i]);
552
+ return true;
553
+ }
554
+ if (arg == "--frequency-penalty") {
555
+ CHECK_ARG
556
+ sparams.penalty_freq = std::stof(argv[i]);
557
+ return true;
558
+ }
559
+ if (arg == "--presence-penalty") {
560
+ CHECK_ARG
561
+ sparams.penalty_present = std::stof(argv[i]);
562
+ return true;
563
+ }
564
+ if (arg == "--dynatemp-range") {
565
+ CHECK_ARG
566
+ sparams.dynatemp_range = std::stof(argv[i]);
567
+ return true;
568
+ }
569
+ if (arg == "--dynatemp-exp") {
570
+ CHECK_ARG
571
+ sparams.dynatemp_exponent = std::stof(argv[i]);
572
+ return true;
573
+ }
574
+ if (arg == "--mirostat") {
575
+ CHECK_ARG
576
+ sparams.mirostat = std::stoi(argv[i]);
577
+ return true;
578
+ }
579
+ if (arg == "--mirostat-lr") {
580
+ CHECK_ARG
581
+ sparams.mirostat_eta = std::stof(argv[i]);
582
+ return true;
583
+ }
584
+ if (arg == "--mirostat-ent") {
585
+ CHECK_ARG
586
+ sparams.mirostat_tau = std::stof(argv[i]);
587
+ return true;
588
+ }
589
+ if (arg == "--cfg-negative-prompt") {
590
+ CHECK_ARG
591
+ sparams.cfg_negative_prompt = argv[i];
592
+ return true;
593
+ }
594
+ if (arg == "--cfg-negative-prompt-file") {
595
+ CHECK_ARG
596
+ std::ifstream file(argv[i]);
597
+ if (!file) {
598
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
599
+ invalid_param = true;
600
+ return true;
601
+ }
602
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
603
+ if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
604
+ sparams.cfg_negative_prompt.pop_back();
605
+ }
606
+ return true;
607
+ }
608
+ if (arg == "--cfg-scale") {
609
+ CHECK_ARG
610
+ sparams.cfg_scale = std::stof(argv[i]);
611
+ return true;
612
+ }
613
+ if (arg == "-b" || arg == "--batch-size") {
614
+ CHECK_ARG
615
+ params.n_batch = std::stoi(argv[i]);
616
+ return true;
617
+ }
618
+ if (arg == "-ub" || arg == "--ubatch-size") {
619
+ CHECK_ARG
620
+ params.n_ubatch = std::stoi(argv[i]);
621
+ return true;
622
+ }
623
+ if (arg == "--keep") {
624
+ CHECK_ARG
625
+ params.n_keep = std::stoi(argv[i]);
626
+ return true;
627
+ }
628
+ if (arg == "--draft") {
629
+ CHECK_ARG
630
+ params.n_draft = std::stoi(argv[i]);
631
+ return true;
632
+ }
633
+ if (arg == "--chunks") {
634
+ CHECK_ARG
635
+ params.n_chunks = std::stoi(argv[i]);
636
+ return true;
637
+ }
638
+ if (arg == "-np" || arg == "--parallel") {
639
+ CHECK_ARG
640
+ params.n_parallel = std::stoi(argv[i]);
641
+ return true;
642
+ }
643
+ if (arg == "-ns" || arg == "--sequences") {
644
+ CHECK_ARG
645
+ params.n_sequences = std::stoi(argv[i]);
646
+ return true;
647
+ }
648
+ if (arg == "--p-split" || arg == "-ps") {
649
+ CHECK_ARG
650
+ params.p_split = std::stof(argv[i]);
651
+ return true;
652
+ }
653
+ if (arg == "-m" || arg == "--model") {
654
+ CHECK_ARG
655
+ params.model = argv[i];
656
+ return true;
657
+ }
658
+ if (arg == "-md" || arg == "--model-draft") {
659
+ CHECK_ARG
660
+ params.model_draft = argv[i];
661
+ return true;
662
+ }
663
+ if (arg == "-a" || arg == "--alias") {
664
+ CHECK_ARG
665
+ params.model_alias = argv[i];
666
+ return true;
667
+ }
668
+ if (arg == "-mu" || arg == "--model-url") {
669
+ CHECK_ARG
670
+ params.model_url = argv[i];
671
+ return true;
672
+ }
673
+ if (arg == "-hft" || arg == "--hf-token") {
674
+ if (++i >= argc) {
675
+ invalid_param = true;
676
+ return true;
677
+ }
678
+ params.hf_token = argv[i];
679
+ return true;
680
+ }
681
+ if (arg == "-hfr" || arg == "--hf-repo") {
682
+ CHECK_ARG
683
+ params.hf_repo = argv[i];
684
+ return true;
685
+ }
686
+ if (arg == "-hff" || arg == "--hf-file") {
687
+ CHECK_ARG
688
+ params.hf_file = argv[i];
689
+ return true;
690
+ }
691
+ if (arg == "--lora") {
692
+ CHECK_ARG
693
+ params.lora_adapter.emplace_back(argv[i], 1.0f);
694
+ params.use_mmap = false;
695
+ return true;
696
+ }
697
+ if (arg == "--lora-scaled") {
698
+ CHECK_ARG
699
+ const char* lora_adapter = argv[i];
700
+ CHECK_ARG
701
+ params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
702
+ params.use_mmap = false;
703
+ return true;
704
+ }
705
+ if (arg == "--lora-base") {
706
+ CHECK_ARG
707
+ params.lora_base = argv[i];
708
+ return true;
709
+ }
710
+ if (arg == "--control-vector") {
711
+ CHECK_ARG
712
+ params.control_vectors.push_back({ 1.0f, argv[i], });
713
+ return true;
714
+ }
715
+ if (arg == "--control-vector-scaled") {
716
+ CHECK_ARG
717
+ const char* fname = argv[i];
718
+ CHECK_ARG
719
+ params.control_vectors.push_back({ std::stof(argv[i]), fname, });
720
+ return true;
721
+ }
722
+ if (arg == "--control-vector-layer-range") {
723
+ CHECK_ARG
724
+ params.control_vector_layer_start = std::stoi(argv[i]);
725
+ CHECK_ARG
726
+ params.control_vector_layer_end = std::stoi(argv[i]);
727
+ return true;
728
+ }
729
+ if (arg == "--mmproj") {
730
+ CHECK_ARG
731
+ params.mmproj = argv[i];
732
+ return true;
733
+ }
734
+ if (arg == "--image") {
735
+ CHECK_ARG
736
+ params.image.emplace_back(argv[i]);
737
+ return true;
738
+ }
739
+ if (arg == "-i" || arg == "--interactive") {
740
+ params.interactive = true;
741
+ return true;
742
+ }
743
+ if (arg == "-sp" || arg == "--special") {
744
+ params.special = true;
745
+ return true;
746
+ }
747
+ if (arg == "--embedding" || arg == "--embeddings") {
748
+ params.embedding = true;
749
+ return true;
750
+ }
751
+ if (arg == "--embd-normalize") {
752
+ CHECK_ARG
753
+ params.embd_normalize = std::stoi(argv[i]);
754
+ return true;
755
+ }
756
+ if (arg == "--embd-output-format") {
757
+ CHECK_ARG
758
+ params.embd_out = argv[i];
759
+ return true;
760
+ }
761
+ if (arg == "--embd-separator") {
762
+ CHECK_ARG
763
+ params.embd_sep = argv[i];
764
+ return true;
765
+ }
766
+ if (arg == "-if" || arg == "--interactive-first") {
767
+ params.interactive_first = true;
768
+ return true;
769
+ }
770
+ if (arg == "-cnv" || arg == "--conversation") {
771
+ params.conversation = true;
772
+ return true;
773
+ }
774
+ if (arg == "--infill") {
775
+ params.infill = true;
776
+ return true;
777
+ }
778
+ if (arg == "-dkvc" || arg == "--dump-kv-cache") {
779
+ params.dump_kv_cache = true;
780
+ return true;
781
+ }
782
+ if (arg == "-nkvo" || arg == "--no-kv-offload") {
783
+ params.no_kv_offload = true;
784
+ return true;
785
+ }
786
+ if (arg == "-ctk" || arg == "--cache-type-k") {
787
+ params.cache_type_k = argv[++i];
788
+ return true;
789
+ }
790
+ if (arg == "-ctv" || arg == "--cache-type-v") {
791
+ params.cache_type_v = argv[++i];
792
+ return true;
793
+ }
794
+ if (arg == "-mli" || arg == "--multiline-input") {
795
+ params.multiline_input = true;
796
+ return true;
797
+ }
798
+ if (arg == "--simple-io") {
799
+ params.simple_io = true;
800
+ return true;
801
+ }
802
+ if (arg == "-cb" || arg == "--cont-batching") {
803
+ params.cont_batching = true;
804
+ return true;
805
+ }
806
+ if (arg == "-nocb" || arg == "--no-cont-batching") {
807
+ params.cont_batching = false;
808
+ return true;
809
+ }
810
+ if (arg == "-fa" || arg == "--flash-attn") {
811
+ params.flash_attn = true;
812
+ return true;
813
+ }
814
+ if (arg == "-co" || arg == "--color") {
815
+ params.use_color = true;
816
+ return true;
817
+ }
818
+ if (arg == "--mlock") {
819
+ params.use_mlock = true;
820
+ return true;
821
+ }
822
+ if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
823
+ CHECK_ARG
824
+ params.n_gpu_layers = std::stoi(argv[i]);
825
+ if (!llama_supports_gpu_offload()) {
826
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
827
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
828
+ }
829
+ return true;
830
+ }
831
+ if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
832
+ CHECK_ARG
833
+ params.n_gpu_layers_draft = std::stoi(argv[i]);
834
+ if (!llama_supports_gpu_offload()) {
835
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
836
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
837
+ }
838
+ return true;
839
+ }
840
+ if (arg == "--main-gpu" || arg == "-mg") {
841
+ CHECK_ARG
842
+ params.main_gpu = std::stoi(argv[i]);
843
+ #ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
844
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
845
+ #endif // LM_GGML_USE_CUDA_SYCL_VULKAN
846
+ return true;
847
+ }
848
+ if (arg == "--split-mode" || arg == "-sm") {
849
+ CHECK_ARG
850
+ std::string arg_next = argv[i];
851
+ if (arg_next == "none") {
852
+ params.split_mode = LLAMA_SPLIT_MODE_NONE;
853
+ }
854
+ else if (arg_next == "layer") {
855
+ params.split_mode = LLAMA_SPLIT_MODE_LAYER;
856
+ }
857
+ else if (arg_next == "row") {
858
+ #ifdef LM_GGML_USE_SYCL
859
+ fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
860
+ exit(1);
861
+ #endif // LM_GGML_USE_SYCL
862
+ params.split_mode = LLAMA_SPLIT_MODE_ROW;
863
+ }
864
+ else {
865
+ invalid_param = true;
866
+ return true;
867
+ }
868
+ #ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
869
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
870
+ #endif // LM_GGML_USE_CUDA_SYCL_VULKAN
871
+ return true;
872
+ }
873
+ if (arg == "--tensor-split" || arg == "-ts") {
874
+ CHECK_ARG
875
+ std::string arg_next = argv[i];
876
+
877
+ // split string by , and /
878
+ const std::regex regex{ R"([,/]+)" };
879
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
880
+ std::vector<std::string> split_arg{ it, {} };
881
+ if (split_arg.size() >= llama_max_devices()) {
882
+ invalid_param = true;
883
+ return true;
884
+ }
885
+ for (size_t i = 0; i < llama_max_devices(); ++i) {
886
+ if (i < split_arg.size()) {
887
+ params.tensor_split[i] = std::stof(split_arg[i]);
888
+ }
889
+ else {
890
+ params.tensor_split[i] = 0.0f;
891
+ }
892
+ }
893
+ #ifndef LM_GGML_USE_CUDA_SYCL_VULKAN
894
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
895
+ #endif // LM_GGML_USE_CUDA_SYCL_VULKAN
896
+ return true;
897
+ }
898
+ if (arg == "--rpc") {
899
+ CHECK_ARG
900
+ params.rpc_servers = argv[i];
901
+ return true;
902
+ }
903
+ if (arg == "--no-mmap") {
904
+ params.use_mmap = false;
905
+ return true;
906
+ }
907
+ if (arg == "--numa") {
908
+ CHECK_ARG
909
+ std::string value(argv[i]);
910
+ /**/ if (value == "distribute" || value == "") { params.numa = LM_GGML_NUMA_STRATEGY_DISTRIBUTE; }
911
+ else if (value == "isolate") { params.numa = LM_GGML_NUMA_STRATEGY_ISOLATE; }
912
+ else if (value == "numactl") { params.numa = LM_GGML_NUMA_STRATEGY_NUMACTL; }
913
+ else { invalid_param = true; }
914
+ return true;
915
+ }
916
+ if (arg == "-v" || arg == "--verbose") {
917
+ params.verbosity = 1;
918
+ return true;
919
+ }
920
+ if (arg == "--verbosity") {
921
+ CHECK_ARG
922
+ params.verbosity = std::stoi(argv[i]);
923
+ return true;
924
+ }
925
+ if (arg == "--verbose-prompt") {
926
+ params.verbose_prompt = true;
927
+ return true;
928
+ }
929
+ if (arg == "--no-display-prompt") {
930
+ params.display_prompt = false;
931
+ return true;
932
+ }
933
+ if (arg == "-r" || arg == "--reverse-prompt") {
934
+ CHECK_ARG
935
+ params.antiprompt.emplace_back(argv[i]);
936
+ return true;
937
+ }
938
+ if (arg == "-ld" || arg == "--logdir") {
939
+ CHECK_ARG
940
+ params.logdir = argv[i];
941
+
942
+ if (params.logdir.back() != DIRECTORY_SEPARATOR) {
943
+ params.logdir += DIRECTORY_SEPARATOR;
944
+ }
945
+ return true;
946
+ }
947
+ if (arg == "-lcs" || arg == "--lookup-cache-static") {
948
+ CHECK_ARG
949
+ params.lookup_cache_static = argv[i];
950
+ return true;
951
+ }
952
+ if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
953
+ CHECK_ARG
954
+ params.lookup_cache_dynamic = argv[i];
955
+ return true;
956
+ }
957
+ if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
958
+ CHECK_ARG
959
+ params.logits_file = argv[i];
960
+ return true;
961
+ }
962
+ if (arg == "--perplexity" || arg == "--all-logits") {
963
+ params.logits_all = true;
964
+ return true;
965
+ }
966
+ if (arg == "--ppl-stride") {
967
+ CHECK_ARG
968
+ params.ppl_stride = std::stoi(argv[i]);
969
+ return true;
970
+ }
971
+ if (arg == "--ppl-output-type") {
972
+ CHECK_ARG
973
+ params.ppl_output_type = std::stoi(argv[i]);
974
+ return true;
975
+ }
976
+ if (arg == "-ptc" || arg == "--print-token-count") {
977
+ CHECK_ARG
978
+ params.n_print = std::stoi(argv[i]);
979
+ return true;
980
+ }
981
+ if (arg == "--check-tensors") {
982
+ params.check_tensors = true;
983
+ return true;
984
+ }
985
+ if (arg == "--hellaswag") {
986
+ params.hellaswag = true;
987
+ return true;
988
+ }
989
+ if (arg == "--hellaswag-tasks") {
990
+ CHECK_ARG
991
+ params.hellaswag_tasks = std::stoi(argv[i]);
992
+ return true;
993
+ }
994
+ if (arg == "--winogrande") {
995
+ params.winogrande = true;
996
+ return true;
997
+ }
998
+ if (arg == "--winogrande-tasks") {
999
+ CHECK_ARG
1000
+ params.winogrande_tasks = std::stoi(argv[i]);
1001
+ return true;
1002
+ }
1003
+ if (arg == "--multiple-choice") {
1004
+ params.multiple_choice = true;
1005
+ return true;
1006
+ }
1007
+ if (arg == "--multiple-choice-tasks") {
1008
+ CHECK_ARG
1009
+ params.multiple_choice_tasks = std::stoi(argv[i]);
1010
+ return true;
1011
+ }
1012
+ if (arg == "--kl-divergence") {
1013
+ params.kl_divergence = true;
1014
+ return true;
1015
+ }
1016
+ if (arg == "--ignore-eos") {
1017
+ params.ignore_eos = true;
1018
+ return true;
1019
+ }
1020
+ if (arg == "--penalize-nl") {
1021
+ sparams.penalize_nl = true;
1022
+ return true;
1023
+ }
1024
+ if (arg == "-l" || arg == "--logit-bias") {
1025
+ CHECK_ARG
1026
+ std::stringstream ss(argv[i]);
1027
+ llama_token key;
1028
+ char sign;
1029
+ std::string value_str;
1030
+ try {
1031
+ if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
1032
+ sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
1033
+ }
1034
+ else {
1035
+ throw std::exception();
1036
+ }
1037
+ }
1038
+ catch (const std::exception&) {
1039
+ invalid_param = true;
1040
+ return true;
1041
+ }
1042
+ return true;
1043
+ }
1044
+ if (arg == "-h" || arg == "--help" || arg == "--usage" ) {
1045
+ params.usage = true;
1046
+ return true;
1047
+ }
1048
+ if (arg == "--version") {
1049
+ fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
1050
+ fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
1051
+ exit(0);
1052
+ }
1053
+ if (arg == "--in-prefix-bos") {
1054
+ params.input_prefix_bos = true;
1055
+ params.enable_chat_template = false;
1056
+ return true;
1057
+ }
1058
+ if (arg == "--in-prefix") {
1059
+ CHECK_ARG
1060
+ params.input_prefix = argv[i];
1061
+ params.enable_chat_template = false;
1062
+ return true;
1063
+ }
1064
+ if (arg == "--in-suffix") {
1065
+ CHECK_ARG
1066
+ params.input_suffix = argv[i];
1067
+ params.enable_chat_template = false;
1068
+ return true;
1069
+ }
1070
+ if (arg == "--spm-infill") {
1071
+ params.spm_infill = true;
1072
+ return true;
1073
+ }
1074
+ if (arg == "--grammar") {
1075
+ CHECK_ARG
1076
+ sparams.grammar = argv[i];
1077
+ return true;
1078
+ }
1079
+ if (arg == "--grammar-file") {
1080
+ CHECK_ARG
1081
+ std::ifstream file(argv[i]);
1082
+ if (!file) {
1083
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1084
+ invalid_param = true;
1085
+ return true;
1086
+ }
1087
+ std::copy(
1088
+ std::istreambuf_iterator<char>(file),
1089
+ std::istreambuf_iterator<char>(),
1090
+ std::back_inserter(sparams.grammar)
1091
+ );
1092
+ return true;
1093
+ }
1094
+ if (arg == "-j" || arg == "--json-schema") {
1095
+ CHECK_ARG
1096
+ sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
1097
+ return true;
1098
+ }
1099
+ if (arg == "--override-kv") {
1100
+ CHECK_ARG
1101
+ if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
1102
+ fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
1103
+ invalid_param = true;
1104
+ return true;
1105
+ }
1106
+ return true;
1107
+ }
1108
+ if (arg == "--host") {
1109
+ CHECK_ARG
1110
+ params.hostname = argv[i];
1111
+ return true;
1112
+ }
1113
+ if (arg == "--port") {
1114
+ CHECK_ARG
1115
+ params.port = std::stoi(argv[i]);
1116
+ return true;
1117
+ }
1118
+ if (arg == "--path") {
1119
+ CHECK_ARG
1120
+ params.public_path = argv[i];
1121
+ return true;
1122
+ }
1123
+ if (arg == "--api-key") {
1124
+ CHECK_ARG
1125
+ params.api_keys.push_back(argv[i]);
1126
+ return true;
1127
+ }
1128
+ if (arg == "--api-key-file") {
1129
+ CHECK_ARG
1130
+ std::ifstream key_file(argv[i]);
1131
+ if (!key_file) {
1132
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1133
+ invalid_param = true;
1134
+ return true;
1135
+ }
1136
+ std::string key;
1137
+ while (std::getline(key_file, key)) {
1138
+ if (!key.empty()) {
1139
+ params.api_keys.push_back(key);
1140
+ }
1141
+ }
1142
+ key_file.close();
1143
+ return true;
1144
+ }
1145
+ if (arg == "--ssl-key-file") {
1146
+ CHECK_ARG
1147
+ params.ssl_file_key = argv[i];
1148
+ return true;
1149
+ }
1150
+ if (arg == "--ssl-cert-file") {
1151
+ CHECK_ARG
1152
+ params.ssl_file_cert = argv[i];
1153
+ return true;
1154
+ }
1155
+ if (arg == "--timeout" || arg == "-to") {
1156
+ CHECK_ARG
1157
+ params.timeout_read = std::stoi(argv[i]);
1158
+ params.timeout_write = std::stoi(argv[i]);
1159
+ return true;
1160
+ }
1161
+ if (arg == "--threads-http") {
1162
+ CHECK_ARG
1163
+ params.n_threads_http = std::stoi(argv[i]);
1164
+ return true;
1165
+ }
1166
+ if (arg == "-spf" || arg == "--system-prompt-file") {
1167
+ CHECK_ARG
1168
+ std::ifstream file(argv[i]);
1169
+ if (!file) {
1170
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1171
+ invalid_param = true;
1172
+ return true;
1173
+ }
1174
+ std::string system_prompt;
1175
+ std::copy(
1176
+ std::istreambuf_iterator<char>(file),
1177
+ std::istreambuf_iterator<char>(),
1178
+ std::back_inserter(system_prompt)
1179
+ );
1180
+ params.system_prompt = system_prompt;
1181
+ return true;
1182
+ }
1183
+ if (arg == "--log-format") {
1184
+ CHECK_ARG
1185
+ if (std::strcmp(argv[i], "json") == 0) {
1186
+ params.log_json = true;
1187
+ } else if (std::strcmp(argv[i], "text") == 0) {
1188
+ params.log_json = false;
1189
+ } else {
1190
+ invalid_param = true;
1191
+ return true;
1192
+ }
1193
+ return true;
1194
+ }
1195
+ if (arg == "--no-slots") {
1196
+ params.endpoint_slots = false;
1197
+ return true;
1198
+ }
1199
+ if (arg == "--metrics") {
1200
+ params.endpoint_metrics = true;
1201
+ return true;
1202
+ }
1203
+ if (arg == "--slot-save-path") {
1204
+ CHECK_ARG
1205
+ params.slot_save_path = argv[i];
1206
+ // if doesn't end with DIRECTORY_SEPARATOR, add it
1207
+ if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
1208
+ params.slot_save_path += DIRECTORY_SEPARATOR;
1209
+ }
1210
+ return true;
1211
+ }
1212
+ if (arg == "--chat-template") {
1213
+ CHECK_ARG
1214
+ if (!llama_chat_verify_template(argv[i])) {
1215
+ fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
1216
+ fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
1217
+ invalid_param = true;
1218
+ return true;
1219
+ }
1220
+ params.chat_template = argv[i];
1221
+ return true;
1222
+ }
1223
+ if (arg == "--slot-prompt-similarity" || arg == "-sps") {
1224
+ CHECK_ARG
1225
+ params.slot_prompt_similarity = std::stof(argv[i]);
1226
+ return true;
1227
+ }
1228
+ if (arg == "-pps") {
1229
+ params.is_pp_shared = true;
1230
+ return true;
1231
+ }
1232
+ if (arg == "-npp") {
1233
+ CHECK_ARG
1234
+ auto p = string_split<int>(argv[i], split_delim);
1235
+ params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
1236
+ return true;
1237
+ }
1238
+ if (arg == "-ntg") {
1239
+ CHECK_ARG
1240
+ auto p = string_split<int>(argv[i], split_delim);
1241
+ params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
1242
+ return true;
1243
+ }
1244
+ if (arg == "-npl") {
1245
+ CHECK_ARG
1246
+ auto p = string_split<int>(argv[i], split_delim);
1247
+ params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
1248
+ return true;
1249
+ }
1250
+ if (arg == "--context-file") {
1251
+ CHECK_ARG
1252
+ std::ifstream file(argv[i], std::ios::binary);
1253
+ if (!file) {
1254
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1255
+ invalid_param = true;
1256
+ return true;
1257
+ }
1258
+ params.context_files.push_back(argv[i]);
1259
+ return true;
1260
+ }
1261
+ if (arg == "--chunk-size") {
1262
+ CHECK_ARG
1263
+ params.chunk_size = std::stoi(argv[i]);
1264
+ return true;
1265
+ }
1266
+ if (arg == "--chunk-separator") {
1267
+ CHECK_ARG
1268
+ params.chunk_separator = argv[i];
1269
+ return true;
1270
+ }
1271
+ if (arg == "--junk") {
1272
+ CHECK_ARG
1273
+ params.n_junk = std::stoi(argv[i]);
1274
+ return true;
1275
+ }
1276
+ if (arg == "--pos") {
1277
+ CHECK_ARG
1278
+ params.i_pos = std::stoi(argv[i]);
1279
+ return true;
1280
+ }
1281
+ if (arg == "-o" || arg == "--output" || arg == "--output-file") {
1282
+ CHECK_ARG
1283
+ params.out_file = argv[i];
1284
+ params.cvector_outfile = argv[i];
1285
+ return true;
1286
+ }
1287
+ if (arg == "-ofreq" || arg == "--output-frequency") {
1288
+ CHECK_ARG
1289
+ params.n_out_freq = std::stoi(argv[i]);
1290
+ return true;
1291
+ }
1292
+ if (arg == "--save-frequency") {
1293
+ CHECK_ARG
1294
+ params.n_save_freq = std::stoi(argv[i]);
1295
+ return true;
1296
+ }
1297
+ if (arg == "--process-output") {
1298
+ params.process_output = true;
1299
+ return true;
1300
+ }
1301
+ if (arg == "--no-ppl") {
1302
+ params.compute_ppl = false;
1303
+ return true;
1304
+ }
1305
+ if (arg == "--chunk" || arg == "--from-chunk") {
1306
+ CHECK_ARG
1307
+ params.i_chunk = std::stoi(argv[i]);
1308
+ return true;
1309
+ }
1310
+ // cvector params
1311
+ if (arg == "--positive-file") {
1312
+ CHECK_ARG
1313
+ params.cvector_positive_file = argv[i];
1314
+ return true;
1315
+ }
1316
+ if (arg == "--negative-file") {
1317
+ CHECK_ARG
1318
+ params.cvector_negative_file = argv[i];
1319
+ return true;
1320
+ }
1321
+ if (arg == "--pca-batch") {
1322
+ CHECK_ARG
1323
+ params.n_pca_batch = std::stoi(argv[i]);
1324
+ return true;
1325
+ }
1326
+ if (arg == "--pca-iter") {
1327
+ CHECK_ARG
1328
+ params.n_pca_iterations = std::stoi(argv[i]);
1329
+ return true;
1330
+ }
1331
+ if (arg == "--method") {
1332
+ CHECK_ARG
1333
+ std::string value(argv[i]);
1334
+ /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
1335
+ else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
1336
+ else { invalid_param = true; }
1337
+ return true;
1338
+ }
1339
+ #ifndef LOG_DISABLE_LOGS
1340
+ // Parse args for logging parameters
1341
+ if (log_param_single_parse(argv[i])) {
1342
+ // Do nothing, log_param_single_parse automatically does its thing
1343
+ // and returns if a match was found and parsed.
1344
+ return true;
1345
+ }
1346
+ if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) {
1347
+ // We have a matching known parameter requiring an argument,
1348
+ // now we need to check if there is anything after this argv
1349
+ // and flag invalid_param or parse it.
1350
+ CHECK_ARG
1351
+ if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) {
1352
+ invalid_param = true;
1353
+ return true;
1354
+ }
1355
+ return true;
1356
+ }
1357
+ // End of Parse args for logging parameters
1358
+ #endif // LOG_DISABLE_LOGS
1359
+
1360
+ return false;
1361
+ }
1362
+
1363
+ #ifdef __GNUC__ // LLAMA_COMMON_ATTRIBUTE_FORMAT(...): forwards its arguments to the GNU `format` function attribute
1364
+ #ifdef __MINGW32__ // MinGW builds select the gnu_printf archetype
1365
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
1366
+ #else // !__MINGW32__: every other __GNUC__ compiler uses the plain printf archetype
1367
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
1368
+ #endif // __MINGW32__
1369
+ #else // !__GNUC__: the macro expands to nothing, so annotated declarations are unchanged
1370
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
1371
+ #endif // __GNUC__
1372
+
1373
+ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
1374
+ const llama_sampling_params & sparams = params.sparams;
1375
+
1376
+ std::string sampler_type_chars;
1377
+ std::string sampler_type_names;
1378
+ for (const auto sampler_type : sparams.samplers_sequence) {
1379
+ sampler_type_chars += static_cast<char>(sampler_type);
1380
+ sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
1381
+ }
1382
+ sampler_type_names.pop_back();
1383
+
1384
+ struct option_info { // one entry of the local `options` list: either an option (tags/args/desc) or a group header (grp)
1385
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) // indices count the implicit `this` as 1, so `desc` is parameter 4 and the variadic arguments start at 5
1386
+ option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) { // option entry; `desc` is a printf-style format string
1387
+ va_list args_list;
1388
+ va_start(args_list, desc);
1389
+ char buffer[1024]; // fixed-size scratch buffer for the formatted description
1390
+ vsnprintf(buffer, sizeof(buffer), desc, args_list); // bounded by sizeof(buffer): longer descriptions are truncated
1391
+ va_end(args_list);
1392
+ this->desc = buffer; // replaces the raw format string stored by the initializer list with the formatted text
1393
+ }
1394
+
1395
+ option_info(const std::string & grp) : grp(grp) {} // group-header entry: only `grp` is set; deliberately not `explicit` so `{ "general" }` can construct one
1396
+
1397
+ std::string tags; // first constructor argument of an option entry, e.g. "*" or "main infill"; the visible code only stores it (see the "TODO: filter by tags" note)
1398
+ std::string args; // option spelling(s) and value placeholder, e.g. "-t, --threads N"
1399
+ std::string desc; // formatted description text
1400
+ std::string grp; // group heading; left empty for option entries
1401
+ };
1402
+
1403
+ std::vector<option_info> options;
1404
+
1405
+ // TODO: filter by tags
1406
+
1407
+ options.push_back({ "general" });
1408
+ options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
1409
+ options.push_back({ "*", " --version", "show version and build info" });
1410
+ options.push_back({ "*", "-v, --verbose", "print verbose information" });
1411
+ options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
1412
+ options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
1413
+ options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
1414
+ options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
1415
+ options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
1416
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
1417
+ options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
1418
+ options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
1419
+ options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
1420
+ "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
1421
+ options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
1422
+ options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
1423
+ options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
1424
+ "path to static lookup cache to use for lookup decoding (not updated by generation)" });
1425
+ options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
1426
+ "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
1427
+
1428
+ options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
1429
+ options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
1430
+ options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
1431
+ options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
1432
+ options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
1433
+ options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
1434
+ options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
1435
+ options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
1436
+ "in conversation mode, this will be used as system prompt\n"
1437
+ "(default: '%s')", params.prompt.c_str() });
1438
+ options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
1439
+ options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
1440
+ options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
1441
+ options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
1442
+ options.push_back({ "*", " --no-escape", "do not process escape sequences" });
1443
+ options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
1444
+ options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
1445
+ options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
1446
+ "not supported with --interactive or other interactive options" });
1447
+ options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
1448
+ options.push_back({ "main", "-r, --reverse-prompt PROMPT",
1449
+ "halt generation at PROMPT, return control in interactive mode\n"
1450
+ "can be specified more than once for multiple prompts" });
1451
+ options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
1452
+ options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
1453
+ "if suffix/prefix are not specified, default chat template will be used\n"
1454
+ "(default: %s)", params.conversation ? "true" : "false" });
1455
+ options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
1456
+ options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
1457
+ options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
1458
+ options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
1459
+ options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
1460
+ options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
1461
+ options.push_back({ "server infill",
1462
+ " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
1463
+
1464
+ options.push_back({ "sampling" });
1465
+ options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
1466
+ "(default: %s)", sampler_type_names.c_str() });
1467
+ options.push_back({ "*", " --sampling-seq SEQUENCE",
1468
+ "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
1469
+ options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
1470
+ options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
1471
+ options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
1472
+ options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
1473
+ options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
1474
+ options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
1475
+ options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
1476
+ options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
1477
+ options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
1478
+ options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
1479
+ options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
1480
+ options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
1481
+ options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
1482
+ options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
1483
+ options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
1484
+ "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
1485
+ "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
1486
+ options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
1487
+ options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
1488
+ options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
1489
+ "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
1490
+ "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
1491
+ options.push_back({ "main", " --cfg-negative-prompt PROMPT",
1492
+ "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
1493
+ options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
1494
+ "negative prompt file to use for guidance" });
1495
+ options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
1496
+ options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
1497
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
1498
+ "if suffix/prefix are specified, template will be disabled\n"
1499
+ "only commonly used templates are accepted:\n"
1500
+ "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
1501
+ options.push_back({ "grammar" });
1502
+ options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
1503
+ options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
1504
+ options.push_back({ "*", "-j, --json-schema SCHEMA",
1505
+ "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
1506
+ "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
1507
+
1508
+ options.push_back({ "embedding" });
1509
+ options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
1510
+ "pooling type for embeddings, use model default if unspecified" });
1511
+ options.push_back({ "embedding", " --attention {causal,non-causal}",
1512
+ "attention type for embeddings, use model default if unspecified" });
1513
+
1514
+ options.push_back({ "context hacking" });
1515
+ options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
1516
+ "RoPE frequency scaling method, defaults to linear unless specified by the model" });
1517
+ options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
1518
+ options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
1519
+ options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
1520
+ options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
1521
+ options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
1522
+ options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
1523
+ options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
1524
+ options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
1525
+ options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
1526
+ options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
1527
+ options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
1528
+ options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
1529
+ options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
1530
+ options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
1531
+
1532
+ options.push_back({ "perplexity" });
1533
+ options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
1534
+ options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
1535
+ options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
1536
+ options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
1537
+ options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
1538
+ options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
1539
+ options.push_back({ "perplexity", " --multiple-choice-tasks N",
1540
+ "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
1541
+ options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
1542
+ options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
1543
+ options.push_back({ "perplexity", " --ppl-output-type {0,1}",
1544
+ "output type for perplexity calculation (default: %d)", params.ppl_output_type });
1545
+
1546
+ options.push_back({ "parallel" });
1547
+ options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
1548
+ options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
1549
+ options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
1550
+ options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
1551
+ options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
1552
+
1553
+ options.push_back({ "multi-modality" });
1554
+ options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
1555
+ options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
1556
+
1557
+ options.push_back({ "backend" });
1558
+ options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
1559
+
1560
+ if (llama_supports_mlock()) {
1561
+ options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
1562
+ }
1563
+ if (llama_supports_mmap()) {
1564
+ options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
1565
+ }
1566
+ options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
1567
+ " - distribute: spread execution evenly over all nodes\n"
1568
+ " - isolate: only spawn threads on CPUs on the node that execution started on\n"
1569
+ " - numactl: use the CPU map provided by numactl\n"
1570
+ "if run without this previously, it is recommended to drop the system page cache before using this\n"
1571
+ "see https://github.com/ggerganov/llama.cpp/issues/1437" });
1572
+
1573
+ if (llama_supports_gpu_offload()) {
1574
+ options.push_back({ "*", "-ngl, --gpu-layers N",
1575
+ "number of layers to store in VRAM" });
1576
+ options.push_back({ "*", "-ngld, --gpu-layers-draft N",
1577
+ "number of layers to store in VRAM for the draft model" });
1578
+ options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
1579
+ "how to split the model across multiple GPUs, one of:\n"
1580
+ " - none: use one GPU only\n"
1581
+ " - layer (default): split layers and KV across GPUs\n"
1582
+ " - row: split rows across GPUs" });
1583
+ options.push_back({ "*", "-ts, --tensor-split SPLIT",
1584
+ "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
1585
+ options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
1586
+ "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
1587
+ }
1588
+
1589
+ options.push_back({ "model" });
1590
+ options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
1591
+ options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
1592
+ "advanced option to override model metadata by key. may be specified multiple times.\n"
1593
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
1594
+ options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
1595
+ options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
1596
+ options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
1597
+ options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
1598
+ "note: this argument can be repeated to add multiple control vectors" });
1599
+ options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
1600
+ "add a control vector with user defined scaling SCALE\n"
1601
+ "note: this argument can be repeated to add multiple scaled control vectors" });
1602
+ options.push_back({ "*", " --control-vector-layer-range START END",
1603
+ "layer range to apply the control vector(s) to, start and end inclusive" });
1604
+ options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
1605
+ "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
1606
+ options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
1607
+ options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
1608
+ options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
1609
+ options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
1610
+ options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
1611
+
1612
+ options.push_back({ "retrieval" });
1613
+ options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
1614
+ options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
1615
+ options.push_back({ "retrieval", " --chunk-separator STRING",
1616
+ "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
1617
+
1618
+ options.push_back({ "passkey" });
1619
+ options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
1620
+ options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
1621
+
1622
+ options.push_back({ "imatrix" });
1623
+ options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
1624
+ options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
1625
+ options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
1626
+ options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
1627
+ options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
1628
+ options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
1629
+
1630
+ options.push_back({ "bench" });
1631
+ options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
1632
+ options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
1633
+ options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
1634
+ options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
1635
+
1636
+ options.push_back({ "embedding" });
1637
+ options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
1638
+ options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
1639
+ options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" });
1640
+
1641
+ options.push_back({ "server" });
1642
+ options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
1643
+ options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
1644
+ options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
1645
+ options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
1646
+ options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
1647
+ options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
1648
+ options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
1649
+ options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
1650
+ options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
1651
+ options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
1652
+ options.push_back({ "server", " --system-prompt-file FNAME",
1653
+ "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
1654
+ options.push_back({ "server", " --log-format {text,json}",
1655
+ "log output format: json or text (default: json)" });
1656
+ options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
1657
+ options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
1658
+ options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
1659
+ options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
1660
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
1661
+ "only commonly used templates are accepted:\n"
1662
+ "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
1663
+ options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
1664
+ "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
1665
+
1666
+ #ifndef LOG_DISABLE_LOGS
1667
+ options.push_back({ "logging" });
1668
+ options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
1669
+ options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
1670
+ options.push_back({ "logging", " --log-test", "Run simple logging test" });
1671
+ options.push_back({ "logging", " --log-disable", "Disable trace logs" });
1672
+ options.push_back({ "logging", " --log-enable", "Enable trace logs" });
1673
+ options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
1674
+ options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
1675
+ "Each log file will have unique name: \"<name>.<ID>.log\"" });
1676
+ options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
1677
+ #endif // LOG_DISABLE_LOGS
1678
+
1679
+ options.push_back({ "cvector" });
1680
+ options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
1681
+ options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
1682
+ options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
1683
+ options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
1684
+ options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
1685
+ options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
1686
+
1687
+ printf("usage: %s [options]\n", argv[0]);
1688
+
1689
+ for (const auto & o : options) {
1690
+ if (!o.grp.empty()) {
1691
+ printf("\n%s:\n\n", o.grp.c_str());
1692
+ continue;
1693
+ }
1694
+ printf(" %-32s", o.args.c_str());
1695
+ if (o.args.length() > 30) {
1696
+ printf("\n%34s", "");
1697
+ }
1698
+
1699
+ const auto desc = o.desc;
1700
+ size_t start = 0;
1701
+ size_t end = desc.find('\n');
1702
+ while (end != std::string::npos) {
1703
+ printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
1704
+ start = end + 1;
1705
+ end = desc.find('\n', start);
1706
+ }
1707
+
1708
+ printf("%s\n", desc.substr(start).c_str());
1709
+ }
1710
+ printf("\n");
1711
+ }
1712
+
1713
+ std::string gpt_params_get_system_info(const gpt_params & params) {
1714
+ std::ostringstream os;
1715
+
1716
+ os << "system_info: n_threads = " << params.n_threads;
1717
+ if (params.n_threads_batch != -1) {
1718
+ os << " (n_threads_batch = " << params.n_threads_batch << ")";
1719
+ }
1720
+ os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
1721
+
1722
+ return os.str();
1723
+ }
1724
+
1725
+ //
1726
+ // String utils
1727
+ //
1728
+
1729
// Splits `input` on every occurrence of `separator`.
// The result always has at least one element; consecutive, leading or trailing
// separators produce empty strings (e.g. ",a," -> {"", "a", ""}).
std::vector<std::string> string_split(std::string input, char separator) {
    std::vector<std::string> parts;

    // walk the string with an offset instead of re-copying the remainder after
    // every separator, which made the original quadratic in the input length
    size_t part_begin = 0;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        parts.emplace_back(input, part_begin, separator_pos - part_begin);
        part_begin = separator_pos + 1;
        separator_pos = input.find(separator, part_begin);
    }

    // whatever follows the last separator (possibly nothing) is the final part
    parts.emplace_back(input, part_begin, std::string::npos);
    return parts;
}
1741
+
1742
// Returns a copy of `str` with leading and trailing whitespace (as classified
// by std::isspace) removed. Interior whitespace is kept.
std::string string_strip(const std::string & str) {
    // std::isspace requires a value representable as unsigned char (or EOF);
    // passing a negative plain char (any non-ASCII byte) is undefined behavior
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    size_t start = 0;
    size_t end = str.size();
    while (start < end && is_space(str[start])) {
        start++;
    }
    while (end > start && is_space(str[end - 1])) {
        end--;
    }
    return str.substr(start, end - start);
}
1753
+
1754
// Returns the current local time formatted as "YYYY_MM_DD-HH_MM_SS.nnnnnnnnn",
// where the suffix is the sub-second part expressed in nanoseconds (9 digits).
std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

    const clock::time_point current_time = clock::now();
    const time_t as_time_t = clock::to_time_t(current_time);
    char timestamp_no_ns[100];
    std::strftime(timestamp_no_ns, sizeof(timestamp_no_ns), "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));

    // Take the remainder against one second, not against a raw tick count:
    // system_clock's tick is not nanoseconds on every standard library, and a
    // plain "% 1000000000" would then yield more than 9 digits of "nanoseconds".
    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
        current_time.time_since_epoch() % std::chrono::seconds(1)).count();
    char timestamp_ns[11];
    snprintf(timestamp_ns, sizeof(timestamp_ns), "%09" PRId64, ns);

    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
1769
+
1770
// Rewrites `input` in place, turning backslash escape sequences into the
// characters they denote: \n \r \t \' \" \\ and two-digit hex escapes (\xHH).
// Unrecognized sequences (and a lone trailing backslash) are kept verbatim.
void string_process_escapes(std::string & input) {
    const std::size_t len = input.length();
    std::size_t write = 0; // next position to store a decoded character
    std::size_t read  = 0; // next position to examine

    while (read < len) {
        const char cur = input[read];

        // ordinary character, or a backslash with nothing after it
        if (cur != '\\' || read + 1 >= len) {
            input[write++] = cur;
            ++read;
            continue;
        }

        const char selector = input[++read];
        char decoded    = '\0';
        bool recognized = true;

        switch (selector) {
            case 'n':  decoded = '\n'; break;
            case 'r':  decoded = '\r'; break;
            case 't':  decoded = '\t'; break;
            case '\'':
            case '\"':
            case '\\': decoded = selector; break;
            case 'x': {
                // Handle \x12, etc
                recognized = false;
                if (read + 2 < len) {
                    const char hex[3] = { input[read + 1], input[read + 2], 0 };
                    char * stop = nullptr;
                    const long value = std::strtol(hex, &stop, 16);
                    // accept only if both characters were consumed by the parse
                    if (stop == hex + 2) {
                        read += 2;
                        decoded = char(value);
                        recognized = true;
                    }
                }
                break;
            }
            default:
                recognized = false;
                break;
        }

        if (recognized) {
            input[write++] = decoded;
        } else {
            // keep the backslash and the character that followed it
            input[write++] = '\\';
            input[write++] = selector;
        }
        ++read;
    }

    input.resize(write);
}
1806
+
1807
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
1808
+ const char * sep = strchr(data, '=');
1809
+ if (sep == nullptr || sep - data >= 128) {
1810
+ fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
1811
+ return false;
1812
+ }
1813
+ llama_model_kv_override kvo;
1814
+ std::strncpy(kvo.key, data, sep - data);
1815
+ kvo.key[sep - data] = 0;
1816
+ sep++;
1817
+ if (strncmp(sep, "int:", 4) == 0) {
1818
+ sep += 4;
1819
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
1820
+ kvo.val_i64 = std::atol(sep);
1821
+ } else if (strncmp(sep, "float:", 6) == 0) {
1822
+ sep += 6;
1823
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
1824
+ kvo.val_f64 = std::atof(sep);
1825
+ } else if (strncmp(sep, "bool:", 5) == 0) {
1826
+ sep += 5;
1827
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
1828
+ if (std::strcmp(sep, "true") == 0) {
1829
+ kvo.val_bool = true;
1830
+ } else if (std::strcmp(sep, "false") == 0) {
1831
+ kvo.val_bool = false;
1832
+ } else {
1833
+ fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
1834
+ return false;
1835
+ }
1836
+ } else if (strncmp(sep, "str:", 4) == 0) {
1837
+ sep += 4;
1838
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
1839
+ if (strlen(sep) > 127) {
1840
+ fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
1841
+ return false;
1842
+ }
1843
+ strncpy(kvo.val_str, sep, 127);
1844
+ kvo.val_str[127] = '\0';
1845
+ } else {
1846
+ fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
1847
+ return false;
1848
+ }
1849
+ overrides.emplace_back(std::move(kvo));
1850
+ return true;
1851
+ }
1852
+
1853
+ //
1854
+ // Filesystem utils
1855
+ //
1856
+
1857
// Checks whether a single file name (not a path) is safe to use.
// To validate a full path, split it on the OS-specific separator and validate
// each component with this function.
bool fs_validate_filename(const std::string & filename) {
    // an empty name is never valid
    if (filename.empty()) {
        return false;
    }

    // 255 is the common upper bound for a file name on Linux filesystems;
    // rejecting early avoids the heavier checks below
    // (systems with a smaller limit will reject the name themselves)
    if (filename.length() > 255) {
        return false;
    }

    std::u32string codepoints;
    try {
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf8_codec;
        codepoints = utf8_codec.from_bytes(filename);

        // a round trip that does not reproduce the input means overlong or
        // otherwise invalid UTF-8 was supplied
        if (utf8_codec.to_bytes(codepoints) != filename) {
            return false;
        }
    } catch (const std::exception &) {
        return false;
    }

    // Forbidden code points: control characters, Unicode look-alikes of
    // illegal characters, UTF-16 surrogates, the replacement character,
    // the byte order mark and the characters / \ : * ? " < > |
    const auto is_forbidden = [](char32_t c) -> bool {
        if (c <= 0x1F || c == 0x7F) {
            return true; // C0 control characters and DEL
        }
        if (c >= 0x80 && c <= 0x9F) {
            return true; // C1 control characters
        }
        if (c >= 0xD800 && c <= 0xDFFF) {
            return true; // UTF-16 surrogate range
        }
        switch (c) {
            case 0xFF0E: // Fullwidth Full Stop (period equivalent)
            case 0x2215: // Division Slash (forward slash equivalent)
            case 0x2216: // Set Minus (backslash equivalent)
            case 0xFFFD: // Replacement Character
            case 0xFEFF: // Byte Order Mark
            case U'/':
            case U'\\':
            case U':':
            case U'*':
            case U'?':
            case U'"':
            case U'<':
            case U'>':
            case U'|':
                return true;
            default:
                return false;
        }
    };

    for (const char32_t c : codepoints) {
        if (is_forbidden(c)) {
            return false;
        }
    }

    // A leading/trailing 0x20 space or a trailing '.' is stripped on Windows and
    // would silently yield a different file name. Other whitespace is unaffected.
    const char first = filename.front();
    const char last  = filename.back();
    if (first == ' ' || last == ' ' || last == '.') {
        return false;
    }

    // Any ".." is rejected (stricter than needed; comparing against ".." would do)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // "." on its own is rejected as well
    if (filename == ".") {
        return false;
    }

    return true;
}
1927
+
1928
// Creates every directory prefix of `path` that is followed by a path separator
// ('\\' on Windows, '/' elsewhere). A final component without a trailing
// separator is therefore not created by the loop.
// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    std::wstring wpath = converter.from_bytes(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    // Separators are located in the wide string itself: offsets found in the
    // UTF-8 `path` do not line up with `wpath` once non-ASCII characters appear.
    while ((pos_slash = wpath.find(L'\\', pos_slash)) != std::wstring::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
        if (!success) {
            const DWORD error = GetLastError();

            // if the path already exists, ensure that it's a directory
            if (error == ERROR_ALREADY_EXISTS) {
                const DWORD subpath_attributes = GetFileAttributesW(subpath.c_str());
                if (subpath_attributes == INVALID_FILE_ATTRIBUTES || !(subpath_attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                    return false;
                }
            } else {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat info;
    if (stat(path.c_str(), &info) == 0) {
        return S_ISDIR(info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        struct stat subpath_info;

        // if the path already exists, ensure that it's a directory
        if (stat(subpath.c_str(), &subpath_info) == 0) {
            if (!S_ISDIR(subpath_info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(subpath.c_str(), 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}
1999
+
2000
+ std::string fs_get_cache_directory() {
2001
+ std::string cache_directory = "";
2002
+ auto ensure_trailing_slash = [](std::string p) {
2003
+ // Make sure to add trailing slash
2004
+ if (p.back() != DIRECTORY_SEPARATOR) {
2005
+ p += DIRECTORY_SEPARATOR;
2006
+ }
2007
+ return p;
2008
+ };
2009
+ if (getenv("LLAMA_CACHE")) {
2010
+ cache_directory = std::getenv("LLAMA_CACHE");
2011
+ } else {
2012
+ #ifdef __linux__
2013
+ if (std::getenv("XDG_CACHE_HOME")) {
2014
+ cache_directory = std::getenv("XDG_CACHE_HOME");
2015
+ } else {
2016
+ cache_directory = std::getenv("HOME") + std::string("/.cache/");
2017
+ }
2018
+ #elif defined(__APPLE__)
2019
+ cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
2020
+ #elif defined(_WIN32)
2021
+ cache_directory = std::getenv("LOCALAPPDATA");
2022
+ #endif // __linux__
2023
+ cache_directory = ensure_trailing_slash(cache_directory);
2024
+ cache_directory += "llama.cpp";
2025
+ }
2026
+ return ensure_trailing_slash(cache_directory);
2027
+ }
2028
+
2029
+ std::string fs_get_cache_file(const std::string & filename) {
2030
+ LM_GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
2031
+ std::string cache_directory = fs_get_cache_directory();
2032
+ const bool success = fs_create_directory_with_parents(cache_directory);
2033
+ if (!success) {
2034
+ throw std::runtime_error("failed to create cache directory: " + cache_directory);
2035
+ }
2036
+ return cache_directory + filename;
2037
+ }
2038
+
2039
+
2040
+ //
2041
+ // Model utils
2042
+ //
2043
+
2044
+ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
2045
+ auto mparams = llama_model_params_from_gpt_params(params);
2046
+
2047
+ llama_model * model = nullptr;
2048
+
2049
+ if (!params.hf_repo.empty() && !params.hf_file.empty()) {
2050
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
2051
+ } else if (!params.model_url.empty()) {
2052
+ model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
2053
+ } else {
2054
+ model = llama_load_model_from_file(params.model.c_str(), mparams);
2055
+ }
2056
+
2057
+ if (model == NULL) {
2058
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
2059
+ return std::make_tuple(nullptr, nullptr);
2060
+ }
2061
+
2062
+ auto cparams = llama_context_params_from_gpt_params(params);
2063
+
2064
+ llama_context * lctx = llama_new_context_with_model(model, cparams);
2065
+ if (lctx == NULL) {
2066
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
2067
+ llama_free_model(model);
2068
+ return std::make_tuple(nullptr, nullptr);
2069
+ }
2070
+
2071
+ if (!params.control_vectors.empty()) {
2072
+ if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
2073
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
2074
+
2075
+ const auto cvec = llama_control_vector_load(params.control_vectors);
2076
+ if (cvec.n_embd == -1) {
2077
+ llama_free(lctx);
2078
+ llama_free_model(model);
2079
+ return std::make_tuple(nullptr, nullptr);
2080
+ }
2081
+
2082
+ int err = llama_control_vector_apply(lctx,
2083
+ cvec.data.data(),
2084
+ cvec.data.size(),
2085
+ cvec.n_embd,
2086
+ params.control_vector_layer_start,
2087
+ params.control_vector_layer_end);
2088
+ if (err) {
2089
+ llama_free(lctx);
2090
+ llama_free_model(model);
2091
+ return std::make_tuple(nullptr, nullptr);
2092
+ }
2093
+ }
2094
+
2095
+ for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
2096
+ const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
2097
+ float lora_scale = std::get<1>(params.lora_adapter[i]);
2098
+ int err = llama_model_apply_lora_from_file(model,
2099
+ lora_adapter.c_str(),
2100
+ lora_scale,
2101
+ ((i > 0) || params.lora_base.empty())
2102
+ ? NULL
2103
+ : params.lora_base.c_str(),
2104
+ params.n_threads);
2105
+ if (err != 0) {
2106
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
2107
+ llama_free(lctx);
2108
+ llama_free_model(model);
2109
+ return std::make_tuple(nullptr, nullptr);
2110
+ }
2111
+ }
2112
+
2113
+ if (params.ignore_eos) {
2114
+ params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
2115
+ }
2116
+
2117
+ if (params.warmup) {
2118
+ LOG("warming up the model with an empty run\n");
2119
+
2120
+ std::vector<llama_token> tmp;
2121
+ llama_token bos = llama_token_bos(model);
2122
+ llama_token eos = llama_token_eos(model);
2123
+ // some models (e.g. T5) don't have a BOS token
2124
+ if (bos != -1) {
2125
+ tmp.push_back(bos);
2126
+ }
2127
+ tmp.push_back(eos);
2128
+
2129
+ if (llama_model_has_encoder(model)) {
2130
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
2131
+ llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
2132
+ if (decoder_start_token_id == -1) {
2133
+ decoder_start_token_id = bos;
2134
+ }
2135
+ tmp.clear();
2136
+ tmp.push_back(decoder_start_token_id);
2137
+ }
2138
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
2139
+ llama_kv_cache_clear(lctx);
2140
+ llama_synchronize(lctx);
2141
+ llama_reset_timings(lctx);
2142
+ }
2143
+
2144
+ return std::make_tuple(model, lctx);
2145
+ }
2146
+
2147
+ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
2148
+ auto mparams = llama_model_default_params();
2149
+
2150
+ if (params.n_gpu_layers != -1) {
2151
+ mparams.n_gpu_layers = params.n_gpu_layers;
2152
+ }
2153
+ mparams.rpc_servers = params.rpc_servers.c_str();
2154
+ mparams.main_gpu = params.main_gpu;
2155
+ mparams.split_mode = params.split_mode;
2156
+ mparams.tensor_split = params.tensor_split;
2157
+ mparams.use_mmap = params.use_mmap;
2158
+ mparams.use_mlock = params.use_mlock;
2159
+ mparams.check_tensors = params.check_tensors;
2160
+ if (params.kv_overrides.empty()) {
2161
+ mparams.kv_overrides = NULL;
2162
+ } else {
2163
+ LM_GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
2164
+ mparams.kv_overrides = params.kv_overrides.data();
2165
+ }
2166
+
2167
+ return mparams;
2168
+ }
2169
+
2170
+ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
2171
+ if (s == "f32") {
2172
+ return LM_GGML_TYPE_F32;
2173
+ }
2174
+ if (s == "f16") {
2175
+ return LM_GGML_TYPE_F16;
2176
+ }
2177
+ if (s == "q8_0") {
2178
+ return LM_GGML_TYPE_Q8_0;
2179
+ }
2180
+ if (s == "q4_0") {
2181
+ return LM_GGML_TYPE_Q4_0;
2182
+ }
2183
+ if (s == "q4_1") {
2184
+ return LM_GGML_TYPE_Q4_1;
2185
+ }
2186
+ if (s == "iq4_nl") {
2187
+ return LM_GGML_TYPE_IQ4_NL;
2188
+ }
2189
+ if (s == "q5_0") {
2190
+ return LM_GGML_TYPE_Q5_0;
2191
+ }
2192
+ if (s == "q5_1") {
2193
+ return LM_GGML_TYPE_Q5_1;
2194
+ }
2195
+
2196
+ throw std::runtime_error("Invalid cache type: " + s);
2197
+ }
2198
+
2199
+ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
2200
+ auto cparams = llama_context_default_params();
2201
+
2202
+ cparams.n_ctx = params.n_ctx;
2203
+ cparams.n_seq_max = params.n_parallel;
2204
+ cparams.n_batch = params.n_batch;
2205
+ cparams.n_ubatch = params.n_ubatch;
2206
+ cparams.n_threads = params.n_threads;
2207
+ cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
2208
+ cparams.seed = params.seed;
2209
+ cparams.logits_all = params.logits_all;
2210
+ cparams.embeddings = params.embedding;
2211
+ cparams.rope_scaling_type = params.rope_scaling_type;
2212
+ cparams.rope_freq_base = params.rope_freq_base;
2213
+ cparams.rope_freq_scale = params.rope_freq_scale;
2214
+ cparams.yarn_ext_factor = params.yarn_ext_factor;
2215
+ cparams.yarn_attn_factor = params.yarn_attn_factor;
2216
+ cparams.yarn_beta_fast = params.yarn_beta_fast;
2217
+ cparams.yarn_beta_slow = params.yarn_beta_slow;
2218
+ cparams.yarn_orig_ctx = params.yarn_orig_ctx;
2219
+ cparams.pooling_type = params.pooling_type;
2220
+ cparams.attention_type = params.attention_type;
2221
+ cparams.defrag_thold = params.defrag_thold;
2222
+ cparams.cb_eval = params.cb_eval;
2223
+ cparams.cb_eval_user_data = params.cb_eval_user_data;
2224
+ cparams.offload_kqv = !params.no_kv_offload;
2225
+ cparams.flash_attn = params.flash_attn;
2226
+
2227
+ cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
2228
+ cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
2229
+
2230
+ return cparams;
2231
+ }
2232
+
2233
+ #ifdef LLAMA_USE_CURL
2234
+
2235
// Stand-in for C++20's std::string::starts_with: true when `str` begins with
// `prefix` (an empty prefix always matches).
static bool starts_with(const std::string & str, const std::string & prefix) {
    // comparing the first prefix.size() characters fails on its own when
    // `str` is shorter than `prefix`
    return str.compare(0, prefix.size(), prefix) == 0;
}
2239
+
2240
+ static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
2241
+
2242
+ // Initialize libcurl
2243
+ std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
2244
+ if (!curl) {
2245
+ fprintf(stderr, "%s: error initializing libcurl\n", __func__);
2246
+ return false;
2247
+ }
2248
+
2249
+ bool force_download = false;
2250
+
2251
+ // Set the URL, allow to follow http redirection
2252
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
2253
+ curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
2254
+
2255
+ // Check if hf-token or bearer-token was specified
2256
+ if (!hf_token.empty()) {
2257
+ std::string auth_header = "Authorization: Bearer ";
2258
+ auth_header += hf_token.c_str();
2259
+ struct curl_slist *http_headers = NULL;
2260
+ http_headers = curl_slist_append(http_headers, auth_header.c_str());
2261
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
2262
+ }
2263
+
2264
+ #if defined(_WIN32)
2265
+ // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
2266
+ // operating system. Currently implemented under MS-Windows.
2267
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
2268
+ #endif
2269
+
2270
+ // Check if the file already exists locally
2271
+ struct stat model_file_info;
2272
+ auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
2273
+
2274
+ // If the file exists, check its JSON metadata companion file.
2275
+ std::string metadata_path = path + ".json";
2276
+ nlohmann::json metadata;
2277
+ std::string etag;
2278
+ std::string last_modified;
2279
+
2280
+ if (file_exists) {
2281
+ // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
2282
+ std::ifstream metadata_in(metadata_path);
2283
+ if (metadata_in.good()) {
2284
+ try {
2285
+ metadata_in >> metadata;
2286
+ fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
2287
+ if (metadata.contains("url") && metadata.at("url").is_string()) {
2288
+ auto previous_url = metadata.at("url").get<std::string>();
2289
+ if (previous_url != url) {
2290
+ fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
2291
+ return false;
2292
+ }
2293
+ }
2294
+ if (metadata.contains("etag") && metadata.at("etag").is_string()) {
2295
+ etag = metadata.at("etag");
2296
+ }
2297
+ if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
2298
+ last_modified = metadata.at("lastModified");
2299
+ }
2300
+ } catch (const nlohmann::json::exception & e) {
2301
+ fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
2302
+ return false;
2303
+ }
2304
+ }
2305
+ } else {
2306
+ fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
2307
+ }
2308
+
2309
+ // Send a HEAD request to retrieve the etag and last-modified headers
2310
+ struct llama_load_model_from_url_headers {
2311
+ std::string etag;
2312
+ std::string last_modified;
2313
+ };
2314
+ llama_load_model_from_url_headers headers;
2315
+ {
2316
+ typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
2317
+ auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
2318
+ llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
2319
+
2320
+ static std::regex header_regex("([^:]+): (.*)\r\n");
2321
+ static std::regex etag_regex("ETag", std::regex_constants::icase);
2322
+ static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
2323
+
2324
+ std::string header(buffer, n_items);
2325
+ std::smatch match;
2326
+ if (std::regex_match(header, match, header_regex)) {
2327
+ const std::string & key = match[1];
2328
+ const std::string & value = match[2];
2329
+ if (std::regex_match(key, match, etag_regex)) {
2330
+ headers->etag = value;
2331
+ } else if (std::regex_match(key, match, last_modified_regex)) {
2332
+ headers->last_modified = value;
2333
+ }
2334
+ }
2335
+ return n_items;
2336
+ };
2337
+
2338
+ curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
2339
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
2340
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
2341
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
2342
+
2343
+ CURLcode res = curl_easy_perform(curl.get());
2344
+ if (res != CURLE_OK) {
2345
+ fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
2346
+ return false;
2347
+ }
2348
+
2349
+ long http_code = 0;
2350
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
2351
+ if (http_code != 200) {
2352
+ // HEAD not supported, we don't know if the file has changed
2353
+ // force trigger downloading
2354
+ force_download = true;
2355
+ fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
2356
+ }
2357
+ }
2358
+
2359
+ bool should_download = !file_exists || force_download;
2360
+ if (!should_download) {
2361
+ if (!etag.empty() && etag != headers.etag) {
2362
+ fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
2363
+ should_download = true;
2364
+ } else if (!last_modified.empty() && last_modified != headers.last_modified) {
2365
+ fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
2366
+ should_download = true;
2367
+ }
2368
+ }
2369
+ if (should_download) {
2370
+ std::string path_temporary = path + ".downloadInProgress";
2371
+ if (file_exists) {
2372
+ fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
2373
+ if (remove(path.c_str()) != 0) {
2374
+ fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
2375
+ return false;
2376
+ }
2377
+ }
2378
+
2379
+ // Set the output file
2380
+
2381
+ struct FILE_deleter {
2382
+ void operator()(FILE * f) const {
2383
+ fclose(f);
2384
+ }
2385
+ };
2386
+
2387
+ std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
2388
+ if (!outfile) {
2389
+ fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
2390
+ return false;
2391
+ }
2392
+
2393
+ typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
2394
+ auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
2395
+ return fwrite(data, size, nmemb, (FILE *)fd);
2396
+ };
2397
+ curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
2398
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
2399
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
2400
+
2401
+ // display download progress
2402
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
2403
+
2404
+ // helper function to hide password in URL
2405
+ auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
2406
+ std::size_t protocol_pos = url.find("://");
2407
+ if (protocol_pos == std::string::npos) {
2408
+ return url; // Malformed URL
2409
+ }
2410
+
2411
+ std::size_t at_pos = url.find('@', protocol_pos + 3);
2412
+ if (at_pos == std::string::npos) {
2413
+ return url; // No password in URL
2414
+ }
2415
+
2416
+ return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
2417
+ };
2418
+
2419
+ // start the download
2420
+ fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
2421
+ llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
2422
+ auto res = curl_easy_perform(curl.get());
2423
+ if (res != CURLE_OK) {
2424
+ fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
2425
+ return false;
2426
+ }
2427
+
2428
+ long http_code = 0;
2429
+ curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
2430
+ if (http_code < 200 || http_code >= 400) {
2431
+ fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
2432
+ return false;
2433
+ }
2434
+
2435
+ // Causes file to be closed explicitly here before we rename it.
2436
+ outfile.reset();
2437
+
2438
+ // Write the updated JSON metadata file.
2439
+ metadata.update({
2440
+ {"url", url},
2441
+ {"etag", headers.etag},
2442
+ {"lastModified", headers.last_modified}
2443
+ });
2444
+ std::ofstream(metadata_path) << metadata.dump(4);
2445
+ fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
2446
+
2447
+ if (rename(path_temporary.c_str(), path.c_str()) != 0) {
2448
+ fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
2449
+ return false;
2450
+ }
2451
+ }
2452
+
2453
+ return true;
2454
+ }
2455
+
2456
+ struct llama_model * llama_load_model_from_url(
2457
+ const char * model_url,
2458
+ const char * path_model,
2459
+ const char * hf_token,
2460
+ const struct llama_model_params & params) {
2461
+ // Basic validation of the model_url
2462
+ if (!model_url || strlen(model_url) == 0) {
2463
+ fprintf(stderr, "%s: invalid model_url\n", __func__);
2464
+ return NULL;
2465
+ }
2466
+
2467
+ if (!llama_download_file(model_url, path_model, hf_token)) {
2468
+ return NULL;
2469
+ }
2470
+
2471
+ // check for additional GGUFs split to download
2472
+ int n_split = 0;
2473
+ {
2474
+ struct lm_gguf_init_params lm_gguf_params = {
2475
+ /*.no_alloc = */ true,
2476
+ /*.ctx = */ NULL,
2477
+ };
2478
+ auto * ctx_gguf = lm_gguf_init_from_file(path_model, lm_gguf_params);
2479
+ if (!ctx_gguf) {
2480
+ fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
2481
+ return NULL;
2482
+ }
2483
+
2484
+ auto key_n_split = lm_gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
2485
+ if (key_n_split >= 0) {
2486
+ n_split = lm_gguf_get_val_u16(ctx_gguf, key_n_split);
2487
+ }
2488
+
2489
+ lm_gguf_free(ctx_gguf);
2490
+ }
2491
+
2492
+ if (n_split > 1) {
2493
+ char split_prefix[PATH_MAX] = {0};
2494
+ char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
2495
+
2496
+ // Verify the first split file format
2497
+ // and extract split URL and PATH prefixes
2498
+ {
2499
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
2500
+ fprintf(stderr, "\n%s: unexpected model file name: %s"
2501
+ " n_split=%d\n", __func__, path_model, n_split);
2502
+ return NULL;
2503
+ }
2504
+
2505
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
2506
+ fprintf(stderr, "\n%s: unexpected model url: %s"
2507
+ " n_split=%d\n", __func__, model_url, n_split);
2508
+ return NULL;
2509
+ }
2510
+ }
2511
+
2512
+ // Prepare download in parallel
2513
+ std::vector<std::future<bool>> futures_download;
2514
+ for (int idx = 1; idx < n_split; idx++) {
2515
+ futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
2516
+ char split_path[PATH_MAX] = {0};
2517
+ llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
2518
+
2519
+ char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
2520
+ llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
2521
+
2522
+ return llama_download_file(split_url, split_path, hf_token);
2523
+ }, idx));
2524
+ }
2525
+
2526
+ // Wait for all downloads to complete
2527
+ for (auto & f : futures_download) {
2528
+ if (!f.get()) {
2529
+ return NULL;
2530
+ }
2531
+ }
2532
+ }
2533
+
2534
+ return llama_load_model_from_file(path_model, params);
2535
+ }
2536
+
2537
+ struct llama_model * llama_load_model_from_hf(
2538
+ const char * repo,
2539
+ const char * model,
2540
+ const char * path_model,
2541
+ const char * hf_token,
2542
+ const struct llama_model_params & params) {
2543
+ // construct hugging face model url:
2544
+ //
2545
+ // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
2546
+ // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
2547
+ //
2548
+ // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
2549
+ // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
2550
+ //
2551
+
2552
+ std::string model_url = "https://huggingface.co/";
2553
+ model_url += repo;
2554
+ model_url += "/resolve/main/";
2555
+ model_url += model;
2556
+
2557
+ return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
2558
+ }
2559
+
2560
+ #else
2561
+
2562
// Fallback used when the library is built without libcurl: downloading is not
// possible, so this only reports the fact on stderr and yields no model.
struct llama_model * llama_load_model_from_url(
        const char * /*model_url*/,
        const char * /*path_model*/,
        const char * /*hf_token*/,
        const struct llama_model_params & /*params*/) {
    struct llama_model * const no_model = nullptr;

    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);

    return no_model;
}
2570
+
2571
// Fallback used when the library is built without libcurl: fetching from
// Hugging Face is not possible, so this only reports the fact on stderr and
// yields no model.
struct llama_model * llama_load_model_from_hf(
        const char * /*repo*/,
        const char * /*model*/,
        const char * /*path_model*/,
        const char * /*hf_token*/,
        const struct llama_model_params & /*params*/) {
    struct llama_model * const no_model = nullptr;

    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);

    return no_model;
}
2580
+
2581
+ #endif // LLAMA_USE_CURL
2582
+
2583
+ //
2584
+ // Batch utils
2585
+ //
2586
+
2587
+ void llama_batch_clear(struct llama_batch & batch) {
2588
+ batch.n_tokens = 0;
2589
+ }
2590
+
2591
+ void llama_batch_add(
2592
+ struct llama_batch & batch,
2593
+ llama_token id,
2594
+ llama_pos pos,
2595
+ const std::vector<llama_seq_id> & seq_ids,
2596
+ bool logits) {
2597
+ batch.token [batch.n_tokens] = id;
2598
+ batch.pos [batch.n_tokens] = pos;
2599
+ batch.n_seq_id[batch.n_tokens] = seq_ids.size();
2600
+ for (size_t i = 0; i < seq_ids.size(); ++i) {
2601
+ batch.seq_id[batch.n_tokens][i] = seq_ids[i];
2602
+ }
2603
+ batch.logits [batch.n_tokens] = logits;
2604
+
2605
+ batch.n_tokens++;
2606
+ }
2607
+
2608
+ //
2609
+ // Vocab utils
2610
+ //
2611
+
2612
+ std::vector<llama_token> llama_tokenize(
2613
+ const struct llama_context * ctx,
2614
+ const std::string & text,
2615
+ bool add_special,
2616
+ bool parse_special) {
2617
+ return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
2618
+ }
2619
+
2620
+ std::vector<llama_token> llama_tokenize(
2621
+ const struct llama_model * model,
2622
+ const std::string & text,
2623
+ bool add_special,
2624
+ bool parse_special) {
2625
+ // upper limit for the number of tokens
2626
+ int n_tokens = text.length() + 2 * add_special;
2627
+ std::vector<llama_token> result(n_tokens);
2628
+ n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
2629
+ if (n_tokens < 0) {
2630
+ result.resize(-n_tokens);
2631
+ int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
2632
+ LM_GGML_ASSERT(check == -n_tokens);
2633
+ } else {
2634
+ result.resize(n_tokens);
2635
+ }
2636
+ return result;
2637
+ }
2638
+
2639
+ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
2640
+ std::string piece;
2641
+ piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
2642
+ const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
2643
+ if (n_chars < 0) {
2644
+ piece.resize(-n_chars);
2645
+ int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
2646
+ LM_GGML_ASSERT(check == -n_chars);
2647
+ }
2648
+ else {
2649
+ piece.resize(n_chars);
2650
+ }
2651
+
2652
+ return piece;
2653
+ }
2654
+
2655
+ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
2656
+ std::string text;
2657
+ text.resize(std::max(text.capacity(), tokens.size()));
2658
+ int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
2659
+ if (n_chars < 0) {
2660
+ text.resize(-n_chars);
2661
+ n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
2662
+ LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
2663
+ }
2664
+
2665
+ text.resize(n_chars);
2666
+
2667
+ // NOTE: the original tokenizer decodes bytes after collecting the pieces.
2668
+ return text;
2669
+ }
2670
+
2671
+ bool llama_should_add_bos_token(const llama_model * model) {
2672
+ const int add_bos = llama_add_bos_token(model);
2673
+
2674
+ return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
2675
+ }
2676
+
2677
+ //
2678
+ // Chat template utils
2679
+ //
2680
+
2681
+ bool llama_chat_verify_template(const std::string & tmpl) {
2682
+ llama_chat_message chat[] = {{"user", "test"}};
2683
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
2684
+ return res >= 0;
2685
+ }
2686
+
2687
+ std::string llama_chat_apply_template(const struct llama_model * model,
2688
+ const std::string & tmpl,
2689
+ const std::vector<llama_chat_msg> & msgs,
2690
+ bool add_ass) {
2691
+ int alloc_size = 0;
2692
+ bool fallback = false; // indicate if we must fallback to default chatml
2693
+ std::vector<llama_chat_message> chat;
2694
+ for (auto & msg : msgs) {
2695
+ chat.push_back({msg.role.c_str(), msg.content.c_str()});
2696
+ alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
2697
+ }
2698
+
2699
+ const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
2700
+ std::vector<char> buf(alloc_size);
2701
+
2702
+ // run the first time to get the total output length
2703
+ int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
2704
+
2705
+ // error: chat template is not supported
2706
+ if (res < 0) {
2707
+ if (ptr_tmpl != nullptr) {
2708
+ // if the custom "tmpl" is not supported, we throw an error
2709
+ // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
2710
+ throw std::runtime_error("this custom template is not supported");
2711
+ } else {
2712
+ // If the built-in template is not supported, we default to chatml
2713
+ res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
2714
+ fallback = true;
2715
+ }
2716
+ }
2717
+
2718
+ // if it turns out that our buffer is too small, we resize it
2719
+ if ((size_t) res > buf.size()) {
2720
+ buf.resize(res);
2721
+ res = llama_chat_apply_template(
2722
+ fallback ? nullptr : model,
2723
+ fallback ? "chatml" : ptr_tmpl,
2724
+ chat.data(), chat.size(), add_ass, buf.data(), buf.size());
2725
+ }
2726
+
2727
+ std::string formatted_chat(buf.data(), res);
2728
+ return formatted_chat;
2729
+ }
2730
+
2731
+ std::string llama_chat_format_single(const struct llama_model * model,
2732
+ const std::string & tmpl,
2733
+ const std::vector<llama_chat_msg> & past_msg,
2734
+ const llama_chat_msg & new_msg,
2735
+ bool add_ass) {
2736
+ std::ostringstream ss;
2737
+ auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
2738
+ std::vector<llama_chat_msg> chat_new(past_msg);
2739
+ // if the past_msg ends with a newline, we must preserve it in the formatted version
2740
+ if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
2741
+ ss << "\n";
2742
+ };
2743
+ // format chat with new_msg
2744
+ chat_new.push_back(new_msg);
2745
+ auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
2746
+ // get the diff part
2747
+ ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
2748
+ return ss.str();
2749
+ }
2750
+
2751
+ std::string llama_chat_format_example(const struct llama_model * model,
2752
+ const std::string & tmpl) {
2753
+ std::vector<llama_chat_msg> msgs = {
2754
+ {"system", "You are a helpful assistant"},
2755
+ {"user", "Hello"},
2756
+ {"assistant", "Hi there"},
2757
+ {"user", "How are you?"},
2758
+ };
2759
+ return llama_chat_apply_template(model, tmpl, msgs, true);
2760
+ }
2761
+
2762
+ //
2763
+ // KV cache utils
2764
+ //
2765
+
2766
+ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
2767
+ static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
2768
+
2769
+ printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
2770
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
2771
+
2772
+ llama_kv_cache_view_cell * c_curr = view.cells;
2773
+ llama_seq_id * cs_curr = view.cells_sequences;
2774
+
2775
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
2776
+ if (i % row_size == 0) {
2777
+ printf("\n%5d: ", i);
2778
+ }
2779
+ int seq_count = 0;
2780
+ for (int j = 0; j < view.n_seq_max; j++) {
2781
+ if (cs_curr[j] >= 0) { seq_count++; }
2782
+ }
2783
+ putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
2784
+ }
2785
+
2786
+ printf("\n=== Done dumping\n");
2787
+ }
2788
+
2789
+ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
2790
+ static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
2791
+
2792
+ printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
2793
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
2794
+
2795
+ std::unordered_map<llama_seq_id, size_t> seqs;
2796
+ llama_kv_cache_view_cell * c_curr = view.cells;
2797
+ llama_seq_id * cs_curr = view.cells_sequences;
2798
+
2799
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
2800
+ for (int j = 0; j < view.n_seq_max; j++) {
2801
+ if (cs_curr[j] < 0) { continue; }
2802
+ if (seqs.find(cs_curr[j]) == seqs.end()) {
2803
+ if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
2804
+ const size_t sz = seqs.size();
2805
+ seqs[cs_curr[j]] = sz;
2806
+ }
2807
+ }
2808
+ if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
2809
+ }
2810
+
2811
+ printf("=== Sequence legend: ");
2812
+ for (const auto & it : seqs) {
2813
+ printf("%zu=%d, ", it.second, it.first);
2814
+ }
2815
+ printf("'+'=other sequence ids");
2816
+
2817
+ c_curr = view.cells;
2818
+ cs_curr = view.cells_sequences;
2819
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
2820
+ if (i % row_size == 0) {
2821
+ printf("\n%5d: ", i);
2822
+ }
2823
+ for (int j = 0; j < view.n_seq_max; j++) {
2824
+ if (cs_curr[j] >= 0) {
2825
+ const auto & it = seqs.find(cs_curr[j]);
2826
+ putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
2827
+ } else {
2828
+ putchar('.');
2829
+ }
2830
+ }
2831
+ putchar(' ');
2832
+ }
2833
+
2834
+ printf("\n=== Done dumping\n");
2835
+ }
2836
+
2837
+ //
2838
+ // Embedding utils
2839
+ //
2840
+
2841
// Normalize the `n`-element embedding `inp` into `out`.
//
//   embd_norm == -1 : no normalisation (out is a copy of inp)
//   embd_norm ==  0 : scale so the largest absolute value maps to 32760
//   embd_norm ==  2 : euclidean (L2) norm
//   otherwise       : p-norm with p = embd_norm
//
// When the computed norm is not positive (e.g. an all-zero input), every
// output element is set to 0.
void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double sum = 0.0;

    switch (embd_norm) {
        case -1: // no normalisation
            sum = 1.0;
            break;
        case 0: // max absolute
            for (int i = 0; i < n; i++) {
                const double magnitude = std::abs(static_cast<double>(inp[i]));
                if (sum < magnitude) {
                    sum = magnitude;
                }
            }
            sum /= 32760.0; // make an int16 range
            break;
        case 2: // euclidean
            for (int i = 0; i < n; i++) {
                // square in double: a float product overflows to inf for large
                // components and underflows to 0 for tiny ones
                const double v = inp[i];
                sum += v * v;
            }
            sum = std::sqrt(sum);
            break;
        default: // p-norm (euclidean is p-norm p=2)
            for (int i = 0; i < n; i++) {
                sum += std::pow(std::abs(static_cast<double>(inp[i])), embd_norm);
            }
            sum = std::pow(sum, 1.0 / embd_norm);
            break;
    }

    const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * norm;
    }
}
2874
+
2875
// Cosine similarity of two `n`-element embeddings.
//
// Returns 1.0 when both vectors are all-zero, 0.0 when exactly one of them is,
// and dot(embd1, embd2) / (|embd1| * |embd2|) otherwise.
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double sum  = 0.0;
    double sum1 = 0.0;
    double sum2 = 0.0;

    for (int i = 0; i < n; i++) {
        // multiply in double: float products overflow to inf (giving NaN below)
        // or underflow to 0 (making a non-zero vector look like a zero vector)
        const double a = embd1[i];
        const double b = embd2[i];
        sum  += a * b;
        sum1 += a * a;
        sum2 += b * b;
    }

    // Handle the case where one or both vectors are zero vectors
    if (sum1 == 0.0 || sum2 == 0.0) {
        if (sum1 == 0.0 && sum2 == 0.0) {
            return 1.0f; // two zero vectors are similar
        }
        return 0.0f;
    }

    return sum / (std::sqrt(sum1) * std::sqrt(sum2));
}
2896
+
2897
+ //
2898
+ // Control vector utils
2899
+ //
2900
+
2901
+ static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
2902
+ llama_control_vector_data result = { -1, {} };
2903
+
2904
+ lm_ggml_context * ctx = nullptr;
2905
+ struct lm_gguf_init_params meta_lm_gguf_params = {
2906
+ /* .no_alloc = */ false,
2907
+ /* .ctx = */ &ctx,
2908
+ };
2909
+ struct lm_gguf_context * ctx_gguf = lm_gguf_init_from_file(load_info.fname.c_str(), meta_lm_gguf_params);
2910
+ if (!ctx_gguf) {
2911
+ fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
2912
+ return result;
2913
+ }
2914
+
2915
+ int32_t n_tensors = lm_gguf_get_n_tensors(ctx_gguf);
2916
+ if (n_tensors == 0) {
2917
+ fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
2918
+ }
2919
+
2920
+ for (int i = 0; i < n_tensors; i++) {
2921
+ std::string name = lm_gguf_get_tensor_name(ctx_gguf, i);
2922
+
2923
+ int layer_idx = -1;
2924
+
2925
+ // split on '.'
2926
+ size_t dotpos = name.find('.');
2927
+ if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
2928
+ try {
2929
+ layer_idx = std::stoi(name.substr(dotpos + 1));
2930
+ } catch (...) {
2931
+ layer_idx = -1;
2932
+ }
2933
+ }
2934
+ if (layer_idx < 0) {
2935
+ fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
2936
+ result.n_embd = -1;
2937
+ break;
2938
+ } else if (layer_idx == 0) {
2939
+ fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
2940
+ result.n_embd = -1;
2941
+ break;
2942
+ }
2943
+
2944
+ struct lm_ggml_tensor * tensor = lm_ggml_get_tensor(ctx, name.c_str());
2945
+ if (tensor->type != LM_GGML_TYPE_F32) {
2946
+ fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
2947
+ result.n_embd = -1;
2948
+ break;
2949
+ }
2950
+ if (lm_ggml_n_dims(tensor) != 1) {
2951
+ fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
2952
+ result.n_embd = -1;
2953
+ break;
2954
+ }
2955
+
2956
+ if (result.n_embd == -1) {
2957
+ result.n_embd = lm_ggml_nelements(tensor);
2958
+ } else if (lm_ggml_nelements(tensor) != result.n_embd) {
2959
+ fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
2960
+ result.n_embd = -1;
2961
+ break;
2962
+ }
2963
+
2964
+ // extend if necessary - do not store data for layer 0 (it's not used)
2965
+ result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
2966
+
2967
+ const float * src = (const float *) tensor->data;
2968
+ float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
2969
+ for (int j = 0; j < result.n_embd; j++) {
2970
+ dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
2971
+ }
2972
+
2973
+ }
2974
+
2975
+ if (result.n_embd == -1) {
2976
+ fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
2977
+ result.data.clear();
2978
+ }
2979
+
2980
+ lm_gguf_free(ctx_gguf);
2981
+ lm_ggml_free(ctx);
2982
+
2983
+ return result;
2984
+ }
2985
+
2986
+ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
2987
+ llama_control_vector_data result = { -1, {} };
2988
+
2989
+ for (const auto & info : load_infos) {
2990
+ auto cur = llama_control_vector_load_one(info);
2991
+
2992
+ if (cur.n_embd == -1) {
2993
+ result.n_embd = -1;
2994
+ break;
2995
+ }
2996
+ if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
2997
+ fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
2998
+ result.n_embd = -1;
2999
+ break;
3000
+ }
3001
+
3002
+ if (result.n_embd == -1) {
3003
+ result = std::move(cur);
3004
+ } else {
3005
+ result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
3006
+ for (size_t i = 0; i < cur.data.size(); i++) {
3007
+ result.data[i] += cur.data[i];
3008
+ }
3009
+ }
3010
+ }
3011
+
3012
+ if (result.n_embd == -1) {
3013
+ fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
3014
+ result.data.clear();
3015
+ }
3016
+
3017
+ return result;
3018
+ }
3019
+
3020
+ //
3021
+ // YAML utils
3022
+ //
3023
+
3024
// Write `data` to `stream` as a YAML flow sequence: "<prop_name>: [a, b, c]",
// each value printed with %e. An empty vector is written as "<prop_name>:".
void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
    if (data.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    fprintf(stream, "%s: [", prop_name);

    // the separator is empty before the first element and ", " afterwards
    const char * separator = "";
    for (const float value : data) {
        fprintf(stream, "%s%e", separator, value);
        separator = ", ";
    }

    fprintf(stream, "]\n");
}
3036
+
3037
// Write `data` to `stream` as a YAML flow sequence: "<prop_name>: [a, b, c]".
// An empty vector is written as "<prop_name>:".
void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
    if (data.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    fprintf(stream, "%s: [", prop_name);

    // the separator is empty before the first element and ", " afterwards
    const char * separator = "";
    for (const int value : data) {
        fprintf(stream, "%s%d", separator, value);
        separator = ", ";
    }

    fprintf(stream, "]\n");
}
3049
+
3050
// Write a string property to `stream` in YAML form.
//
//   NULL or empty string            -> "<prop_name>:"
//   starts or ends with whitespace  -> one double-quoted line with newlines
//                                      and quotes escaped
//   no newline inside               -> "<prop_name>: <text>"
//   otherwise                       -> literal block ("<prop_name>: |") with
//                                      one indented output line per input line
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
    std::string data_str(data == NULL ? "" : data);

    if (data_str.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    // std::isspace is only defined for values representable as unsigned char,
    // so bytes >= 0x80 (e.g. UTF-8) must not be passed as a negative char
    const auto is_space = [](char c) -> bool {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    if (is_space(data_str.front()) || is_space(data_str.back())) {
        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
        data_str = "\"" + data_str + "\"";
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    if (data_str.find('\n') == std::string::npos) {
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    fprintf(stream, "%s: |\n", prop_name);

    size_t pos_start = 0;
    size_t pos_found = 0;
    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
        fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
        pos_start = pos_found + 1;
    }

    // A string that reaches this point never ends in '\n' (trailing whitespace
    // takes the quoted branch above), so there is always a final segment after
    // the last newline that the loop has not printed yet.
    fprintf(stream, " %s\n", data_str.substr(pos_start).c_str());
}
3081
+
3082
+ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
3083
+ const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
3084
+ const llama_sampling_params & sparams = params.sparams;
3085
+
3086
+ fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
3087
+ fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
3088
+ fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false");
3089
+ fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false");
3090
+ fprintf(stream, "cpu_has_avx_vnni: %s\n", lm_ggml_cpu_has_avx_vnni() ? "true" : "false");
3091
+ fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false");
3092
+ fprintf(stream, "cpu_has_avx512: %s\n", lm_ggml_cpu_has_avx512() ? "true" : "false");
3093
+ fprintf(stream, "cpu_has_avx512_vbmi: %s\n", lm_ggml_cpu_has_avx512_vbmi() ? "true" : "false");
3094
+ fprintf(stream, "cpu_has_avx512_vnni: %s\n", lm_ggml_cpu_has_avx512_vnni() ? "true" : "false");
3095
+ fprintf(stream, "cpu_has_cuda: %s\n", lm_ggml_cpu_has_cuda() ? "true" : "false");
3096
+ fprintf(stream, "cpu_has_vulkan: %s\n", lm_ggml_cpu_has_vulkan() ? "true" : "false");
3097
+ fprintf(stream, "cpu_has_kompute: %s\n", lm_ggml_cpu_has_kompute() ? "true" : "false");
3098
+ fprintf(stream, "cpu_has_fma: %s\n", lm_ggml_cpu_has_fma() ? "true" : "false");
3099
+ fprintf(stream, "cpu_has_gpublas: %s\n", lm_ggml_cpu_has_gpublas() ? "true" : "false");
3100
+ fprintf(stream, "cpu_has_neon: %s\n", lm_ggml_cpu_has_neon() ? "true" : "false");
3101
+ fprintf(stream, "cpu_has_sve: %s\n", lm_ggml_cpu_has_sve() ? "true" : "false");
3102
+ fprintf(stream, "cpu_has_f16c: %s\n", lm_ggml_cpu_has_f16c() ? "true" : "false");
3103
+ fprintf(stream, "cpu_has_fp16_va: %s\n", lm_ggml_cpu_has_fp16_va() ? "true" : "false");
3104
+ fprintf(stream, "cpu_has_wasm_simd: %s\n", lm_ggml_cpu_has_wasm_simd() ? "true" : "false");
3105
+ fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false");
3106
+ fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false");
3107
+ fprintf(stream, "cpu_has_vsx: %s\n", lm_ggml_cpu_has_vsx() ? "true" : "false");
3108
+ fprintf(stream, "cpu_has_matmul_int8: %s\n", lm_ggml_cpu_has_matmul_int8() ? "true" : "false");
3109
+
3110
+ #ifdef NDEBUG
3111
+ fprintf(stream, "debug: false\n");
3112
+ #else
3113
+ fprintf(stream, "debug: true\n");
3114
+ #endif // NDEBUG
3115
+
3116
+ fprintf(stream, "model_desc: %s\n", model_desc);
3117
+ fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
3118
+
3119
+ #ifdef __OPTIMIZE__
3120
+ fprintf(stream, "optimize: true\n");
3121
+ #else
3122
+ fprintf(stream, "optimize: false\n");
3123
+ #endif // __OPTIMIZE__
3124
+
3125
+ fprintf(stream, "time: %s\n", timestamp.c_str());
3126
+
3127
+ fprintf(stream, "\n");
3128
+ fprintf(stream, "###############\n");
3129
+ fprintf(stream, "# User Inputs #\n");
3130
+ fprintf(stream, "###############\n");
3131
+ fprintf(stream, "\n");
3132
+
3133
+ fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
3134
+ fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
3135
+ yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
3136
+ fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
3137
+ fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
3138
+ fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
3139
+ fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
3140
+ fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
3141
+ fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
3142
+ fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
3143
+ yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
3144
+ fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
3145
+ fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
3146
+ fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
3147
+
3148
+ const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
3149
+ const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
3150
+ fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
3151
+
3152
+ yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
3153
+ fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
3154
+ yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
3155
+ fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
3156
+ fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
3157
+ fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
3158
+ fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
3159
+
3160
+ fprintf(stream, "logit_bias:\n");
3161
+ for (std::pair<llama_token, float> lb : sparams.logit_bias) {
3162
+ if (ignore_eos && lb.first == logit_bias_eos->first) {
3163
+ continue;
3164
+ }
3165
+ fprintf(stream, " %d: %f", lb.first, lb.second);
3166
+ }
3167
+
3168
+ fprintf(stream, "lora:\n");
3169
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
3170
+ if (std::get<1>(la) != 1.0f) {
3171
+ continue;
3172
+ }
3173
+ fprintf(stream, " - %s\n", std::get<0>(la).c_str());
3174
+ }
3175
+ fprintf(stream, "lora_scaled:\n");
3176
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
3177
+ if (std::get<1>(la) == 1.0f) {
3178
+ continue;
3179
+ }
3180
+ fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
3181
+ }
3182
+ fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
3183
+ fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
3184
+ fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
3185
+ fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
3186
+ fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
3187
+ fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
3188
+ fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
3189
+ fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
3190
+ fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
3191
+ fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
3192
+ fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
3193
+ fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
3194
+ fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
3195
+ fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
3196
+ fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
3197
+ fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
3198
+ fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
3199
+ fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
3200
+ yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
3201
+ fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
3202
+ fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
3203
+ fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
3204
+ yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
3205
+ fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
3206
+
3207
+ fprintf(stream, "reverse_prompt:\n");
3208
+ for (std::string ap : params.antiprompt) {
3209
+ size_t pos = 0;
3210
+ while ((pos = ap.find('\n', pos)) != std::string::npos) {
3211
+ ap.replace(pos, 1, "\\n");
3212
+ pos += 1;
3213
+ }
3214
+
3215
+ fprintf(stream, " - %s\n", ap.c_str());
3216
+ }
3217
+
3218
+ fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
3219
+ fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
3220
+ fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
3221
+ fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
3222
+ fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
3223
+ fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
3224
+ fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
3225
+
3226
+ const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
3227
+ yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
3228
+
3229
+ fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
3230
+ fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
3231
+ fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
3232
+ fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
3233
+ fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
3234
+ fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
3235
+ fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
3236
+ fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
3237
+ }