cui-llama.rn 1.2.6 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +20 -5
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +222 -34
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/common.cpp +1682 -2114
  9. package/cpp/common.h +600 -613
  10. package/cpp/ggml-aarch64.c +129 -3478
  11. package/cpp/ggml-aarch64.h +19 -39
  12. package/cpp/ggml-alloc.c +1040 -1040
  13. package/cpp/ggml-alloc.h +76 -76
  14. package/cpp/ggml-backend-impl.h +216 -216
  15. package/cpp/ggml-backend-reg.cpp +195 -0
  16. package/cpp/ggml-backend.cpp +1997 -2661
  17. package/cpp/ggml-backend.h +328 -314
  18. package/cpp/ggml-common.h +1853 -1853
  19. package/cpp/ggml-cpp.h +38 -38
  20. package/cpp/ggml-cpu-aarch64.c +3560 -0
  21. package/cpp/ggml-cpu-aarch64.h +30 -0
  22. package/cpp/ggml-cpu-impl.h +371 -614
  23. package/cpp/ggml-cpu-quants.c +10822 -0
  24. package/cpp/ggml-cpu-quants.h +63 -0
  25. package/cpp/ggml-cpu.c +13975 -13720
  26. package/cpp/ggml-cpu.cpp +663 -0
  27. package/cpp/ggml-cpu.h +177 -150
  28. package/cpp/ggml-impl.h +550 -296
  29. package/cpp/ggml-metal.h +66 -66
  30. package/cpp/ggml-metal.m +4294 -3933
  31. package/cpp/ggml-quants.c +5247 -15739
  32. package/cpp/ggml-quants.h +100 -147
  33. package/cpp/ggml-threading.cpp +12 -0
  34. package/cpp/ggml-threading.h +12 -0
  35. package/cpp/ggml.c +8180 -8390
  36. package/cpp/ggml.h +2411 -2441
  37. package/cpp/llama-grammar.cpp +1138 -1138
  38. package/cpp/llama-grammar.h +144 -144
  39. package/cpp/llama-impl.h +181 -181
  40. package/cpp/llama-sampling.cpp +2348 -2345
  41. package/cpp/llama-sampling.h +48 -48
  42. package/cpp/llama-vocab.cpp +1984 -1984
  43. package/cpp/llama-vocab.h +170 -170
  44. package/cpp/llama.cpp +22132 -22046
  45. package/cpp/llama.h +1253 -1255
  46. package/cpp/log.cpp +401 -401
  47. package/cpp/log.h +121 -121
  48. package/cpp/rn-llama.hpp +83 -19
  49. package/cpp/sampling.cpp +466 -466
  50. package/cpp/sgemm.cpp +1884 -1276
  51. package/ios/RNLlama.mm +43 -20
  52. package/ios/RNLlamaContext.h +9 -3
  53. package/ios/RNLlamaContext.mm +133 -33
  54. package/jest/mock.js +0 -1
  55. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  56. package/lib/commonjs/index.js +52 -15
  57. package/lib/commonjs/index.js.map +1 -1
  58. package/lib/module/NativeRNLlama.js.map +1 -1
  59. package/lib/module/index.js +51 -15
  60. package/lib/module/index.js.map +1 -1
  61. package/lib/typescript/NativeRNLlama.d.ts +29 -5
  62. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  63. package/lib/typescript/index.d.ts +12 -5
  64. package/lib/typescript/index.d.ts.map +1 -1
  65. package/package.json +1 -1
  66. package/src/NativeRNLlama.ts +41 -6
  67. package/src/index.ts +82 -27
  68. package/cpp/json-schema-to-grammar.cpp +0 -1045
  69. package/cpp/json-schema-to-grammar.h +0 -8
  70. package/cpp/json.hpp +0 -24766
package/cpp/common.cpp CHANGED
@@ -1,2114 +1,1682 @@
1
- #if defined(_MSC_VER)
2
- #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
3
- #endif
4
-
5
- #include "common.h"
6
- #include "log.h"
7
- // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
8
- #define JSON_ASSERT LM_GGML_ASSERT
9
- #include "json.hpp"
10
- #include "json-schema-to-grammar.h"
11
- #include "llama.h"
12
-
13
- #include <algorithm>
14
- #include <cinttypes>
15
- #include <climits>
16
- #include <cmath>
17
- #include <codecvt>
18
- #include <cstdarg>
19
- #include <cstring>
20
- #include <ctime>
21
- #include <fstream>
22
- #include <iostream>
23
- #include <iterator>
24
- #include <regex>
25
- #include <sstream>
26
- #include <string>
27
- #include <thread>
28
- #include <unordered_map>
29
- #include <unordered_set>
30
- #include <vector>
31
-
32
- #if defined(__APPLE__) && defined(__MACH__)
33
- #include <sys/types.h>
34
- #include <sys/sysctl.h>
35
- #endif
36
-
37
- #if defined(_WIN32)
38
- #define WIN32_LEAN_AND_MEAN
39
- #ifndef NOMINMAX
40
- # define NOMINMAX
41
- #endif
42
- #include <locale>
43
- #include <windows.h>
44
- #include <fcntl.h>
45
- #include <io.h>
46
- #else
47
- #include <sys/ioctl.h>
48
- #include <sys/stat.h>
49
- #include <unistd.h>
50
- #endif
51
- #if defined(LLAMA_USE_CURL)
52
- #include <curl/curl.h>
53
- #include <curl/easy.h>
54
- #include <future>
55
- #endif
56
-
57
- // build info
58
- int LLAMA_BUILD_NUMBER = 0;
59
- char const *LLAMA_COMMIT = "unknown";
60
- char const *LLAMA_COMPILER = "unknown";
61
- char const *LLAMA_BUILD_TARGET = "unknown";
62
-
63
- #if defined(_MSC_VER)
64
- #pragma warning(disable: 4244 4267) // possible loss of data
65
- #endif
66
-
67
- #if defined(LLAMA_USE_CURL)
68
- #ifdef __linux__
69
- #include <linux/limits.h>
70
- #elif defined(_WIN32)
71
- #define PATH_MAX MAX_PATH
72
- #else
73
- #include <sys/syslimits.h>
74
- #endif
75
- #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
76
- #endif // LLAMA_USE_CURL
77
-
78
- using json = nlohmann::ordered_json;
79
-
80
- //
81
- // CPU utils
82
- //
83
-
84
- int32_t cpu_get_num_physical_cores() {
85
- #ifdef __linux__
86
- // enumerate the set of thread siblings, num entries is num cores
87
- std::unordered_set<std::string> siblings;
88
- for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
89
- std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
90
- + std::to_string(cpu) + "/topology/thread_siblings");
91
- if (!thread_siblings.is_open()) {
92
- break; // no more cpus
93
- }
94
- std::string line;
95
- if (std::getline(thread_siblings, line)) {
96
- siblings.insert(line);
97
- }
98
- }
99
- if (!siblings.empty()) {
100
- return static_cast<int32_t>(siblings.size());
101
- }
102
- #elif defined(__APPLE__) && defined(__MACH__)
103
- int32_t num_physical_cores;
104
- size_t len = sizeof(num_physical_cores);
105
- int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
106
- if (result == 0) {
107
- return num_physical_cores;
108
- }
109
- result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
110
- if (result == 0) {
111
- return num_physical_cores;
112
- }
113
- #elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
114
- // TODO: windows + arm64 + mingw64
115
- unsigned int n_threads_win = std::thread::hardware_concurrency();
116
- unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
117
-
118
- DWORD buffer_size = 0;
119
- if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
120
- if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
121
- return default_threads;
122
- }
123
- }
124
-
125
- std::vector<char> buffer(buffer_size);
126
- if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
127
- return default_threads;
128
- }
129
-
130
- int32_t num_physical_cores = 0;
131
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
132
- while (buffer_size > 0) {
133
- if (info->Relationship == RelationProcessorCore) {
134
- num_physical_cores += info->Processor.GroupCount;
135
- }
136
- buffer_size -= info->Size;
137
- info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
138
- }
139
-
140
- return num_physical_cores > 0 ? num_physical_cores : default_threads;
141
- #endif
142
- unsigned int n_threads = std::thread::hardware_concurrency();
143
- return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
144
- }
145
-
146
- #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
147
- #include <pthread.h>
148
-
149
- static void cpuid(unsigned leaf, unsigned subleaf,
150
- unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
151
- __asm__("movq\t%%rbx,%%rsi\n\t"
152
- "cpuid\n\t"
153
- "xchgq\t%%rbx,%%rsi"
154
- : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
155
- : "0"(leaf), "2"(subleaf));
156
- }
157
-
158
- static int pin_cpu(int cpu) {
159
- cpu_set_t mask;
160
- CPU_ZERO(&mask);
161
- CPU_SET(cpu, &mask);
162
- return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
163
- }
164
-
165
- static bool is_hybrid_cpu(void) {
166
- unsigned eax, ebx, ecx, edx;
167
- cpuid(7, 0, &eax, &ebx, &ecx, &edx);
168
- return !!(edx & (1u << 15));
169
- }
170
-
171
- static bool is_running_on_efficiency_core(void) {
172
- unsigned eax, ebx, ecx, edx;
173
- cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
174
- int intel_atom = 0x20;
175
- int core_type = (eax & 0xff000000u) >> 24;
176
- return core_type == intel_atom;
177
- }
178
-
179
- static int cpu_count_math_cpus(int n_cpu) {
180
- int result = 0;
181
- for (int cpu = 0; cpu < n_cpu; ++cpu) {
182
- if (pin_cpu(cpu)) {
183
- return -1;
184
- }
185
- if (is_running_on_efficiency_core()) {
186
- continue; // efficiency cores harm lockstep threading
187
- }
188
- ++cpu; // hyperthreading isn't useful for linear algebra
189
- ++result;
190
- }
191
- return result;
192
- }
193
-
194
- #endif // __x86_64__ && __linux__
195
-
196
- /**
197
- * Returns number of CPUs on system that are useful for math.
198
- */
199
- int32_t cpu_get_num_math() {
200
- #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
201
- int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
202
- if (n_cpu < 1) {
203
- return cpu_get_num_physical_cores();
204
- }
205
- if (is_hybrid_cpu()) {
206
- cpu_set_t affinity;
207
- if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
208
- int result = cpu_count_math_cpus(n_cpu);
209
- pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
210
- if (result > 0) {
211
- return result;
212
- }
213
- }
214
- }
215
- #endif
216
- return cpu_get_num_physical_cores();
217
- }
218
-
219
- // Helper for setting process priority
220
-
221
- #if defined(_WIN32)
222
-
223
- bool set_process_priority(enum lm_ggml_sched_priority prio) {
224
- if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
225
- return true;
226
- }
227
-
228
- DWORD p = NORMAL_PRIORITY_CLASS;
229
- switch (prio) {
230
- case LM_GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
231
- case LM_GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
232
- case LM_GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
233
- case LM_GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
234
- }
235
-
236
- if (!SetPriorityClass(GetCurrentProcess(), p)) {
237
- LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
238
- return false;
239
- }
240
-
241
- return true;
242
- }
243
-
244
- #else // MacOS and POSIX
245
- #include <sys/types.h>
246
- #include <sys/resource.h>
247
-
248
- bool set_process_priority(enum lm_ggml_sched_priority prio) {
249
- if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
250
- return true;
251
- }
252
-
253
- int p = 0;
254
- switch (prio) {
255
- case LM_GGML_SCHED_PRIO_NORMAL: p = 0; break;
256
- case LM_GGML_SCHED_PRIO_MEDIUM: p = -5; break;
257
- case LM_GGML_SCHED_PRIO_HIGH: p = -10; break;
258
- case LM_GGML_SCHED_PRIO_REALTIME: p = -20; break;
259
- }
260
-
261
- if (!setpriority(PRIO_PROCESS, 0, p)) {
262
- LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
263
- return false;
264
- }
265
- return true;
266
- }
267
-
268
- #endif
269
-
270
- //
271
- // CLI argument parsing
272
- //
273
-
274
-
275
- void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
276
- int32_t n_set = 0;
277
-
278
- if (cpuparams.n_threads < 0) {
279
- // Assuming everything about cpuparams is invalid
280
- if (role_model != nullptr) {
281
- cpuparams = *role_model;
282
- } else {
283
- cpuparams.n_threads = cpu_get_num_math();
284
- }
285
- }
286
-
287
- for (int32_t i = 0; i < LM_GGML_MAX_N_THREADS; i++) {
288
- if (cpuparams.cpumask[i]) {
289
- n_set++;
290
- }
291
- }
292
-
293
- if (n_set && n_set < cpuparams.n_threads) {
294
- // Not enough set bits, may experience performance issues.
295
- LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
296
- }
297
- }
298
-
299
- bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
300
- size_t dash_loc = range.find('-');
301
- if (dash_loc == std::string::npos) {
302
- LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
303
- return false;
304
- }
305
-
306
- size_t start_i;
307
- size_t end_i;
308
-
309
- if (dash_loc == 0) {
310
- start_i = 0;
311
- } else {
312
- start_i = std::stoull(range.substr(0, dash_loc));
313
- if (start_i >= LM_GGML_MAX_N_THREADS) {
314
- LOG_ERR("Start index out of bounds!\n");
315
- return false;
316
- }
317
- }
318
-
319
- if (dash_loc == range.length() - 1) {
320
- end_i = LM_GGML_MAX_N_THREADS - 1;
321
- } else {
322
- end_i = std::stoull(range.substr(dash_loc + 1));
323
- if (end_i >= LM_GGML_MAX_N_THREADS) {
324
- LOG_ERR("End index out of bounds!\n");
325
- return false;
326
- }
327
- }
328
-
329
- for (size_t i = start_i; i <= end_i; i++) {
330
- boolmask[i] = true;
331
- }
332
-
333
- return true;
334
- }
335
-
336
- bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
337
- // Discard potential 0x prefix
338
- size_t start_i = 0;
339
- if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
340
- start_i = 2;
341
- }
342
-
343
- size_t num_digits = mask.length() - start_i;
344
- if (num_digits > 128) num_digits = 128;
345
-
346
- size_t end_i = num_digits + start_i;
347
-
348
- for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
349
- char c = mask.at(i);
350
- int8_t id = c;
351
-
352
- if ((c >= '0' && c <= '9')) {
353
- id -= '0';
354
- } else if (c >= 'a' && c <= 'f') {
355
- id -= 'a' - 10;
356
- } else if (c >= 'A' && c <= 'F') {
357
- id -= 'A' - 10;
358
- } else {
359
- LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
360
- return false;
361
- }
362
-
363
- boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
364
- boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
365
- boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
366
- boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
367
- }
368
-
369
- return true;
370
- }
371
-
372
- void common_init() {
373
- llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
374
- if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
375
- common_log_add(common_log_main(), level, "%s", text);
376
- }
377
- }, NULL);
378
-
379
- #ifdef NDEBUG
380
- const char * build_type = "";
381
- #else
382
- const char * build_type = " (debug)";
383
- #endif
384
-
385
- LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
386
- }
387
-
388
- std::string common_params_get_system_info(const common_params & params) {
389
- std::ostringstream os;
390
-
391
- os << "system_info: n_threads = " << params.cpuparams.n_threads;
392
- if (params.cpuparams_batch.n_threads != -1) {
393
- os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
394
- }
395
- #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
396
- // TODO: windows + arm64 + mingw64
397
- DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
398
- os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
399
- #else
400
- os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
401
- #endif
402
-
403
- return os.str();
404
- }
405
-
406
- //
407
- // String utils
408
- //
409
-
410
- std::string string_format(const char * fmt, ...) {
411
- va_list ap;
412
- va_list ap2;
413
- va_start(ap, fmt);
414
- va_copy(ap2, ap);
415
- int size = vsnprintf(NULL, 0, fmt, ap);
416
- LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
417
- std::vector<char> buf(size + 1);
418
- int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
419
- LM_GGML_ASSERT(size2 == size);
420
- va_end(ap2);
421
- va_end(ap);
422
- return std::string(buf.data(), size);
423
- }
424
-
425
- std::string string_strip(const std::string & str) {
426
- size_t start = 0;
427
- size_t end = str.size();
428
- while (start < end && std::isspace(str[start])) {
429
- start++;
430
- }
431
- while (end > start && std::isspace(str[end - 1])) {
432
- end--;
433
- }
434
- return str.substr(start, end - start);
435
- }
436
-
437
- std::string string_get_sortable_timestamp() {
438
- using clock = std::chrono::system_clock;
439
-
440
- const clock::time_point current_time = clock::now();
441
- const time_t as_time_t = clock::to_time_t(current_time);
442
- char timestamp_no_ns[100];
443
- std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
444
-
445
- const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
446
- current_time.time_since_epoch() % 1000000000).count();
447
- char timestamp_ns[11];
448
- snprintf(timestamp_ns, 11, "%09" PRId64, ns);
449
-
450
- return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
451
- }
452
-
453
- void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
454
- if (search.empty()) {
455
- return;
456
- }
457
- std::string builder;
458
- builder.reserve(s.length());
459
- size_t pos = 0;
460
- size_t last_pos = 0;
461
- while ((pos = s.find(search, last_pos)) != std::string::npos) {
462
- builder.append(s, last_pos, pos - last_pos);
463
- builder.append(replace);
464
- last_pos = pos + search.length();
465
- }
466
- builder.append(s, last_pos, std::string::npos);
467
- s = std::move(builder);
468
- }
469
-
470
- std::string string_from(bool value) {
471
- return value ? "true" : "false";
472
- }
473
-
474
- std::string string_from(const std::vector<int> & values) {
475
- std::stringstream buf;
476
-
477
- buf << "[ ";
478
- bool first = true;
479
- for (auto e : values) {
480
- if (first) {
481
- first = false;
482
- } else {
483
- buf << ", ";
484
- }
485
- buf << std::to_string(e);
486
- }
487
- buf << " ]";
488
-
489
- return buf.str();
490
- }
491
-
492
- std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
493
- std::stringstream buf;
494
-
495
- buf << "[ ";
496
-
497
- bool first = true;
498
- for (const auto & token : tokens) {
499
- if (!first) {
500
- buf << ", ";
501
- } else {
502
- first = false;
503
- }
504
-
505
- auto detokenized = common_token_to_piece(ctx, token);
506
-
507
- detokenized.erase(
508
- std::remove_if(
509
- detokenized.begin(),
510
- detokenized.end(),
511
- [](const unsigned char c) { return !std::isprint(c); }),
512
- detokenized.end());
513
-
514
- buf << "'" << detokenized << "'"
515
- << ":" << std::to_string(token);
516
- }
517
-
518
- buf << " ]";
519
-
520
- return buf.str();
521
- }
522
-
523
- std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
524
- std::stringstream buf;
525
-
526
- buf << "[ ";
527
-
528
- bool first = true;
529
- for (int i = 0; i < batch.n_tokens; ++i) {
530
- if (!first) {
531
- buf << ", ";
532
- } else {
533
- first = false;
534
- }
535
-
536
- auto detokenized = common_token_to_piece(ctx, batch.token[i]);
537
-
538
- detokenized.erase(
539
- std::remove_if(
540
- detokenized.begin(),
541
- detokenized.end(),
542
- [](const unsigned char c) { return !std::isprint(c); }),
543
- detokenized.end());
544
-
545
- buf << "\n" << std::to_string(i)
546
- << ":token '" << detokenized << "'"
547
- << ":pos " << std::to_string(batch.pos[i])
548
- << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
549
- << ":seq_id " << std::to_string(batch.seq_id[i][0])
550
- << ":logits " << std::to_string(batch.logits[i]);
551
- }
552
-
553
- buf << " ]";
554
-
555
- return buf.str();
556
- }
557
-
558
- void string_process_escapes(std::string & input) {
559
- std::size_t input_len = input.length();
560
- std::size_t output_idx = 0;
561
-
562
- for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
563
- if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
564
- switch (input[++input_idx]) {
565
- case 'n': input[output_idx++] = '\n'; break;
566
- case 'r': input[output_idx++] = '\r'; break;
567
- case 't': input[output_idx++] = '\t'; break;
568
- case '\'': input[output_idx++] = '\''; break;
569
- case '\"': input[output_idx++] = '\"'; break;
570
- case '\\': input[output_idx++] = '\\'; break;
571
- case 'x':
572
- // Handle \x12, etc
573
- if (input_idx + 2 < input_len) {
574
- const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
575
- char *err_p = nullptr;
576
- const long val = std::strtol(x, &err_p, 16);
577
- if (err_p == x + 2) {
578
- input_idx += 2;
579
- input[output_idx++] = char(val);
580
- break;
581
- }
582
- }
583
- // fall through
584
- default: input[output_idx++] = '\\';
585
- input[output_idx++] = input[input_idx]; break;
586
- }
587
- } else {
588
- input[output_idx++] = input[input_idx];
589
- }
590
- }
591
-
592
- input.resize(output_idx);
593
- }
594
-
595
- bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
596
- const char * sep = strchr(data, '=');
597
- if (sep == nullptr || sep - data >= 128) {
598
- LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
599
- return false;
600
- }
601
- llama_model_kv_override kvo;
602
- std::strncpy(kvo.key, data, sep - data);
603
- kvo.key[sep - data] = 0;
604
- sep++;
605
- if (strncmp(sep, "int:", 4) == 0) {
606
- sep += 4;
607
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
608
- kvo.val_i64 = std::atol(sep);
609
- } else if (strncmp(sep, "float:", 6) == 0) {
610
- sep += 6;
611
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
612
- kvo.val_f64 = std::atof(sep);
613
- } else if (strncmp(sep, "bool:", 5) == 0) {
614
- sep += 5;
615
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
616
- if (std::strcmp(sep, "true") == 0) {
617
- kvo.val_bool = true;
618
- } else if (std::strcmp(sep, "false") == 0) {
619
- kvo.val_bool = false;
620
- } else {
621
- LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
622
- return false;
623
- }
624
- } else if (strncmp(sep, "str:", 4) == 0) {
625
- sep += 4;
626
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
627
- if (strlen(sep) > 127) {
628
- LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
629
- return false;
630
- }
631
- strncpy(kvo.val_str, sep, 127);
632
- kvo.val_str[127] = '\0';
633
- } else {
634
- LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
635
- return false;
636
- }
637
- overrides.emplace_back(std::move(kvo));
638
- return true;
639
- }
640
-
641
- //
642
- // Filesystem utils
643
- //
644
-
645
- // Validate if a filename is safe to use
646
- // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
647
- bool fs_validate_filename(const std::string & filename) {
648
- if (!filename.length()) {
649
- // Empty filename invalid
650
- return false;
651
- }
652
- if (filename.length() > 255) {
653
- // Limit at common largest possible filename on Linux filesystems
654
- // to avoid unnecessary further validation
655
- // (On systems with smaller limits it will be caught by the OS)
656
- return false;
657
- }
658
-
659
- std::u32string filename_utf32;
660
- try {
661
- std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
662
- filename_utf32 = converter.from_bytes(filename);
663
-
664
- // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
665
- // or invalid encodings were encountered. Reject such attempts
666
- std::string filename_reencoded = converter.to_bytes(filename_utf32);
667
- if (filename_reencoded != filename) {
668
- return false;
669
- }
670
- } catch (const std::exception &) {
671
- return false;
672
- }
673
-
674
- // Check for forbidden codepoints:
675
- // - Control characters
676
- // - Unicode equivalents of illegal characters
677
- // - UTF-16 surrogate pairs
678
- // - UTF-8 replacement character
679
- // - Byte order mark (BOM)
680
- // - Illegal characters: / \ : * ? " < > |
681
- for (char32_t c : filename_utf32) {
682
- if (c <= 0x1F // Control characters (C0)
683
- || c == 0x7F // Control characters (DEL)
684
- || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
685
- || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
686
- || c == 0x2215 // Division Slash (forward slash equivalent)
687
- || c == 0x2216 // Set Minus (backslash equivalent)
688
- || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
689
- || c == 0xFFFD // Replacement Character (UTF-8)
690
- || c == 0xFEFF // Byte Order Mark (BOM)
691
- || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
692
- || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
693
- return false;
694
- }
695
- }
696
-
697
- // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
698
- // Unicode and other whitespace is not affected, only 0x20 space
699
- if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
700
- return false;
701
- }
702
-
703
- // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
704
- if (filename.find("..") != std::string::npos) {
705
- return false;
706
- }
707
-
708
- // Reject "."
709
- if (filename == ".") {
710
- return false;
711
- }
712
-
713
- return true;
714
- }
715
-
716
- // returns true if successful, false otherwise
717
- bool fs_create_directory_with_parents(const std::string & path) {
718
- #ifdef _WIN32
719
- std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
720
- std::wstring wpath = converter.from_bytes(path);
721
-
722
- // if the path already exists, check whether it's a directory
723
- const DWORD attributes = GetFileAttributesW(wpath.c_str());
724
- if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
725
- return true;
726
- }
727
-
728
- size_t pos_slash = 0;
729
-
730
- // process path from front to back, procedurally creating directories
731
- while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
732
- const std::wstring subpath = wpath.substr(0, pos_slash);
733
- const wchar_t * test = subpath.c_str();
734
-
735
- const bool success = CreateDirectoryW(test, NULL);
736
- if (!success) {
737
- const DWORD error = GetLastError();
738
-
739
- // if the path already exists, ensure that it's a directory
740
- if (error == ERROR_ALREADY_EXISTS) {
741
- const DWORD attributes = GetFileAttributesW(subpath.c_str());
742
- if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
743
- return false;
744
- }
745
- } else {
746
- return false;
747
- }
748
- }
749
-
750
- pos_slash += 1;
751
- }
752
-
753
- return true;
754
- #else
755
- // if the path already exists, check whether it's a directory
756
- struct stat info;
757
- if (stat(path.c_str(), &info) == 0) {
758
- return S_ISDIR(info.st_mode);
759
- }
760
-
761
- size_t pos_slash = 1; // skip leading slashes for directory creation
762
-
763
- // process path from front to back, procedurally creating directories
764
- while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
765
- const std::string subpath = path.substr(0, pos_slash);
766
- struct stat info;
767
-
768
- // if the path already exists, ensure that it's a directory
769
- if (stat(subpath.c_str(), &info) == 0) {
770
- if (!S_ISDIR(info.st_mode)) {
771
- return false;
772
- }
773
- } else {
774
- // create parent directories
775
- const int ret = mkdir(subpath.c_str(), 0755);
776
- if (ret != 0) {
777
- return false;
778
- }
779
- }
780
-
781
- pos_slash += 1;
782
- }
783
-
784
- return true;
785
- #endif // _WIN32
786
- }
787
-
788
- std::string fs_get_cache_directory() {
789
- std::string cache_directory = "";
790
- auto ensure_trailing_slash = [](std::string p) {
791
- // Make sure to add trailing slash
792
- if (p.back() != DIRECTORY_SEPARATOR) {
793
- p += DIRECTORY_SEPARATOR;
794
- }
795
- return p;
796
- };
797
- if (getenv("LLAMA_CACHE")) {
798
- cache_directory = std::getenv("LLAMA_CACHE");
799
- } else {
800
- #ifdef __linux__
801
- if (std::getenv("XDG_CACHE_HOME")) {
802
- cache_directory = std::getenv("XDG_CACHE_HOME");
803
- } else {
804
- cache_directory = std::getenv("HOME") + std::string("/.cache/");
805
- }
806
- #elif defined(__APPLE__)
807
- cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
808
- #elif defined(_WIN32)
809
- cache_directory = std::getenv("LOCALAPPDATA");
810
- #endif // __linux__
811
- cache_directory = ensure_trailing_slash(cache_directory);
812
- cache_directory += "llama.cpp";
813
- }
814
- return ensure_trailing_slash(cache_directory);
815
- }
816
-
817
- std::string fs_get_cache_file(const std::string & filename) {
818
- LM_GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
819
- std::string cache_directory = fs_get_cache_directory();
820
- const bool success = fs_create_directory_with_parents(cache_directory);
821
- if (!success) {
822
- throw std::runtime_error("failed to create cache directory: " + cache_directory);
823
- }
824
- return cache_directory + filename;
825
- }
826
-
827
-
828
- //
829
- // Model utils
830
- //
831
- struct common_init_result common_init_from_params(common_params & params) {
832
- common_init_result iparams;
833
- auto mparams = common_model_params_to_llama(params);
834
-
835
- llama_model * model = nullptr;
836
-
837
- if (!params.hf_repo.empty() && !params.hf_file.empty()) {
838
- model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
839
- } else if (!params.model_url.empty()) {
840
- model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
841
- } else {
842
- model = llama_load_model_from_file(params.model.c_str(), mparams);
843
- }
844
-
845
- if (model == NULL) {
846
- LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
847
- return iparams;
848
- }
849
-
850
- if (params.reranking) {
851
- bool ok = true;
852
-
853
- if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
854
- LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
855
- ok = false;
856
- }
857
-
858
- if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
859
- LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
860
- ok = false;
861
- }
862
-
863
- if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
864
- LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
865
- ok = false;
866
- }
867
-
868
- if (!ok) {
869
- llama_free_model(model);
870
-
871
- return iparams;
872
- }
873
- }
874
-
875
- auto cparams = common_context_params_to_llama(params);
876
-
877
- llama_context * lctx = llama_new_context_with_model(model, cparams);
878
- if (lctx == NULL) {
879
- LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
880
- llama_free_model(model);
881
- return iparams;
882
- }
883
-
884
- if (!params.control_vectors.empty()) {
885
- if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
886
- if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
887
-
888
- const auto cvec = common_control_vector_load(params.control_vectors);
889
- if (cvec.n_embd == -1) {
890
- llama_free(lctx);
891
- llama_free_model(model);
892
-
893
- return iparams;
894
- }
895
-
896
- int err = llama_control_vector_apply(lctx,
897
- cvec.data.data(),
898
- cvec.data.size(),
899
- cvec.n_embd,
900
- params.control_vector_layer_start,
901
- params.control_vector_layer_end);
902
- if (err) {
903
- llama_free(lctx);
904
- llama_free_model(model);
905
-
906
- return iparams;
907
- }
908
- }
909
-
910
- // load and optionally apply lora adapters
911
- for (auto & la : params.lora_adapters) {
912
- common_lora_adapter_container loaded_la;
913
- loaded_la.path = la.path;
914
- loaded_la.scale = la.scale;
915
- loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
916
- if (loaded_la.adapter == nullptr) {
917
- LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
918
- llama_free(lctx);
919
- llama_free_model(model);
920
- return iparams;
921
- }
922
- iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
923
- }
924
- if (!params.lora_init_without_apply) {
925
- common_lora_adapters_apply(lctx, iparams.lora_adapters);
926
- }
927
-
928
- if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
929
- LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
930
- params.sparams.ignore_eos = false;
931
- }
932
-
933
- if (params.warmup) {
934
- LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
935
-
936
- std::vector<llama_token> tmp;
937
- llama_token bos = llama_token_bos(model);
938
- llama_token eos = llama_token_eos(model);
939
- // some models (e.g. T5) don't have a BOS token
940
- if (bos != LLAMA_TOKEN_NULL) {
941
- tmp.push_back(bos);
942
- }
943
- if (eos != LLAMA_TOKEN_NULL) {
944
- tmp.push_back(eos);
945
- }
946
- if (tmp.empty()) {
947
- tmp.push_back(0);
948
- }
949
-
950
- if (llama_model_has_encoder(model)) {
951
- llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
952
- llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
953
- if (decoder_start_token_id == -1) {
954
- decoder_start_token_id = bos;
955
- }
956
- tmp.clear();
957
- tmp.push_back(decoder_start_token_id);
958
- }
959
- if (llama_model_has_decoder(model)) {
960
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
961
- }
962
- llama_kv_cache_clear(lctx);
963
- llama_synchronize(lctx);
964
- llama_perf_context_reset(lctx);
965
- }
966
-
967
- iparams.model = model;
968
- iparams.context = lctx;
969
-
970
- return iparams;
971
- }
972
-
973
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
974
- llama_lora_adapter_clear(ctx);
975
- for (auto & la : lora_adapters) {
976
- if (la.scale != 0.0f) {
977
- llama_lora_adapter_set(ctx, la.adapter, la.scale);
978
- }
979
- }
980
- }
981
-
982
- struct llama_model_params common_model_params_to_llama(const common_params & params) {
983
- auto mparams = llama_model_default_params();
984
-
985
- if (params.n_gpu_layers != -1) {
986
- mparams.n_gpu_layers = params.n_gpu_layers;
987
- }
988
-
989
- mparams.progress_callback_user_data = params.progress_callback_user_data;
990
- mparams.progress_callback = params.progress_callback;
991
- mparams.vocab_only = params.vocab_only;
992
- mparams.rpc_servers = params.rpc_servers.c_str();
993
- mparams.main_gpu = params.main_gpu;
994
- mparams.split_mode = params.split_mode;
995
- mparams.tensor_split = params.tensor_split;
996
- mparams.use_mmap = params.use_mmap;
997
- mparams.use_mlock = params.use_mlock;
998
- mparams.check_tensors = params.check_tensors;
999
- if (params.kv_overrides.empty()) {
1000
- mparams.kv_overrides = NULL;
1001
- } else {
1002
- LM_GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
1003
- mparams.kv_overrides = params.kv_overrides.data();
1004
- }
1005
-
1006
- return mparams;
1007
- }
1008
-
1009
- static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
1010
- if (s == "f32") {
1011
- return LM_GGML_TYPE_F32;
1012
- }
1013
- if (s == "f16") {
1014
- return LM_GGML_TYPE_F16;
1015
- }
1016
- if (s == "q8_0") {
1017
- return LM_GGML_TYPE_Q8_0;
1018
- }
1019
- if (s == "q4_0") {
1020
- return LM_GGML_TYPE_Q4_0;
1021
- }
1022
- if (s == "q4_1") {
1023
- return LM_GGML_TYPE_Q4_1;
1024
- }
1025
- if (s == "iq4_nl") {
1026
- return LM_GGML_TYPE_IQ4_NL;
1027
- }
1028
- if (s == "q5_0") {
1029
- return LM_GGML_TYPE_Q5_0;
1030
- }
1031
- if (s == "q5_1") {
1032
- return LM_GGML_TYPE_Q5_1;
1033
- }
1034
-
1035
- throw std::runtime_error("Unsupported cache type: " + s);
1036
- }
1037
-
1038
- struct llama_context_params common_context_params_to_llama(const common_params & params) {
1039
- auto cparams = llama_context_default_params();
1040
-
1041
- cparams.n_ctx = params.n_ctx;
1042
- cparams.n_seq_max = params.n_parallel;
1043
- cparams.n_batch = params.n_batch;
1044
- cparams.n_ubatch = params.n_ubatch;
1045
- cparams.n_threads = params.cpuparams.n_threads;
1046
- cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
1047
- params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
1048
- cparams.logits_all = params.logits_all;
1049
- cparams.embeddings = params.embedding;
1050
- cparams.rope_scaling_type = params.rope_scaling_type;
1051
- cparams.rope_freq_base = params.rope_freq_base;
1052
- cparams.rope_freq_scale = params.rope_freq_scale;
1053
- cparams.yarn_ext_factor = params.yarn_ext_factor;
1054
- cparams.yarn_attn_factor = params.yarn_attn_factor;
1055
- cparams.yarn_beta_fast = params.yarn_beta_fast;
1056
- cparams.yarn_beta_slow = params.yarn_beta_slow;
1057
- cparams.yarn_orig_ctx = params.yarn_orig_ctx;
1058
- cparams.pooling_type = params.pooling_type;
1059
- cparams.attention_type = params.attention_type;
1060
- cparams.defrag_thold = params.defrag_thold;
1061
- cparams.cb_eval = params.cb_eval;
1062
- cparams.cb_eval_user_data = params.cb_eval_user_data;
1063
- cparams.offload_kqv = !params.no_kv_offload;
1064
- cparams.flash_attn = params.flash_attn;
1065
- cparams.no_perf = params.no_perf;
1066
-
1067
- if (params.reranking) {
1068
- cparams.embeddings = true;
1069
- cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
1070
- }
1071
-
1072
- cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
1073
- cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
1074
-
1075
- return cparams;
1076
- }
1077
-
1078
- struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
1079
- struct lm_ggml_threadpool_params tpp;
1080
-
1081
- lm_ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
1082
-
1083
- if (params.mask_valid) {
1084
- std::memcpy(&tpp.cpumask, &params.cpumask, LM_GGML_MAX_N_THREADS);
1085
- }
1086
-
1087
- tpp.prio = params.priority;
1088
- tpp.poll = params.poll;
1089
- tpp.strict_cpu = params.strict_cpu;
1090
-
1091
- return tpp;
1092
- }
1093
-
1094
- #ifdef LLAMA_USE_CURL
1095
-
1096
- #define CURL_MAX_RETRY 3
1097
- #define CURL_RETRY_DELAY_SECONDS 2
1098
-
1099
-
1100
- static bool starts_with(const std::string & str, const std::string & prefix) {
1101
- // While we wait for C++20's std::string::starts_with...
1102
- return str.rfind(prefix, 0) == 0;
1103
- }
1104
-
1105
- static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
1106
- int remaining_attempts = max_attempts;
1107
-
1108
- while (remaining_attempts > 0) {
1109
- LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
1110
-
1111
- CURLcode res = curl_easy_perform(curl);
1112
- if (res == CURLE_OK) {
1113
- return true;
1114
- }
1115
-
1116
- int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
1117
- LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
1118
-
1119
- remaining_attempts--;
1120
- std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
1121
- }
1122
-
1123
- LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
1124
-
1125
- return false;
1126
- }
1127
-
1128
- static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
1129
-
1130
- // Initialize libcurl
1131
- std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
1132
- if (!curl) {
1133
- LOG_ERR("%s: error initializing libcurl\n", __func__);
1134
- return false;
1135
- }
1136
-
1137
- bool force_download = false;
1138
-
1139
- // Set the URL, allow to follow http redirection
1140
- curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
1141
- curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
1142
-
1143
- // Check if hf-token or bearer-token was specified
1144
- if (!hf_token.empty()) {
1145
- std::string auth_header = "Authorization: Bearer ";
1146
- auth_header += hf_token.c_str();
1147
- struct curl_slist *http_headers = NULL;
1148
- http_headers = curl_slist_append(http_headers, auth_header.c_str());
1149
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
1150
- }
1151
-
1152
- #if defined(_WIN32)
1153
- // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
1154
- // operating system. Currently implemented under MS-Windows.
1155
- curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
1156
- #endif
1157
-
1158
- // Check if the file already exists locally
1159
- struct stat model_file_info;
1160
- auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
1161
-
1162
- // If the file exists, check its JSON metadata companion file.
1163
- std::string metadata_path = path + ".json";
1164
- nlohmann::json metadata;
1165
- std::string etag;
1166
- std::string last_modified;
1167
-
1168
- if (file_exists) {
1169
- // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
1170
- std::ifstream metadata_in(metadata_path);
1171
- if (metadata_in.good()) {
1172
- try {
1173
- metadata_in >> metadata;
1174
- LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
1175
- if (metadata.contains("url") && metadata.at("url").is_string()) {
1176
- auto previous_url = metadata.at("url").get<std::string>();
1177
- if (previous_url != url) {
1178
- LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
1179
- return false;
1180
- }
1181
- }
1182
- if (metadata.contains("etag") && metadata.at("etag").is_string()) {
1183
- etag = metadata.at("etag");
1184
- }
1185
- if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
1186
- last_modified = metadata.at("lastModified");
1187
- }
1188
- } catch (const nlohmann::json::exception & e) {
1189
- LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
1190
- return false;
1191
- }
1192
- }
1193
- } else {
1194
- LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
1195
- }
1196
-
1197
- // Send a HEAD request to retrieve the etag and last-modified headers
1198
- struct common_load_model_from_url_headers {
1199
- std::string etag;
1200
- std::string last_modified;
1201
- };
1202
- common_load_model_from_url_headers headers;
1203
- {
1204
- typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
1205
- auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
1206
- common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
1207
-
1208
- static std::regex header_regex("([^:]+): (.*)\r\n");
1209
- static std::regex etag_regex("ETag", std::regex_constants::icase);
1210
- static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
1211
-
1212
- std::string header(buffer, n_items);
1213
- std::smatch match;
1214
- if (std::regex_match(header, match, header_regex)) {
1215
- const std::string & key = match[1];
1216
- const std::string & value = match[2];
1217
- if (std::regex_match(key, match, etag_regex)) {
1218
- headers->etag = value;
1219
- } else if (std::regex_match(key, match, last_modified_regex)) {
1220
- headers->last_modified = value;
1221
- }
1222
- }
1223
- return n_items;
1224
- };
1225
-
1226
- curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
1227
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
1228
- curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
1229
- curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
1230
-
1231
- bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
1232
- if (!was_perform_successful) {
1233
- return false;
1234
- }
1235
-
1236
- long http_code = 0;
1237
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
1238
- if (http_code != 200) {
1239
- // HEAD not supported, we don't know if the file has changed
1240
- // force trigger downloading
1241
- force_download = true;
1242
- LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
1243
- }
1244
- }
1245
-
1246
- bool should_download = !file_exists || force_download;
1247
- if (!should_download) {
1248
- if (!etag.empty() && etag != headers.etag) {
1249
- LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
1250
- should_download = true;
1251
- } else if (!last_modified.empty() && last_modified != headers.last_modified) {
1252
- LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
1253
- should_download = true;
1254
- }
1255
- }
1256
- if (should_download) {
1257
- std::string path_temporary = path + ".downloadInProgress";
1258
- if (file_exists) {
1259
- LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
1260
- if (remove(path.c_str()) != 0) {
1261
- LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
1262
- return false;
1263
- }
1264
- }
1265
-
1266
- // Set the output file
1267
-
1268
- struct FILE_deleter {
1269
- void operator()(FILE * f) const {
1270
- fclose(f);
1271
- }
1272
- };
1273
-
1274
- std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
1275
- if (!outfile) {
1276
- LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
1277
- return false;
1278
- }
1279
-
1280
- typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
1281
- auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
1282
- return fwrite(data, size, nmemb, (FILE *)fd);
1283
- };
1284
- curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
1285
- curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
1286
- curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
1287
-
1288
- // display download progress
1289
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
1290
-
1291
- // helper function to hide password in URL
1292
- auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
1293
- std::size_t protocol_pos = url.find("://");
1294
- if (protocol_pos == std::string::npos) {
1295
- return url; // Malformed URL
1296
- }
1297
-
1298
- std::size_t at_pos = url.find('@', protocol_pos + 3);
1299
- if (at_pos == std::string::npos) {
1300
- return url; // No password in URL
1301
- }
1302
-
1303
- return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
1304
- };
1305
-
1306
- // start the download
1307
- LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
1308
- llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
1309
- bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
1310
- if (!was_perform_successful) {
1311
- return false;
1312
- }
1313
-
1314
- long http_code = 0;
1315
- curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
1316
- if (http_code < 200 || http_code >= 400) {
1317
- LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
1318
- return false;
1319
- }
1320
-
1321
- // Causes file to be closed explicitly here before we rename it.
1322
- outfile.reset();
1323
-
1324
- // Write the updated JSON metadata file.
1325
- metadata.update({
1326
- {"url", url},
1327
- {"etag", headers.etag},
1328
- {"lastModified", headers.last_modified}
1329
- });
1330
- std::ofstream(metadata_path) << metadata.dump(4);
1331
- LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
1332
-
1333
- if (rename(path_temporary.c_str(), path.c_str()) != 0) {
1334
- LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
1335
- return false;
1336
- }
1337
- }
1338
-
1339
- return true;
1340
- }
1341
-
1342
- struct llama_model * common_load_model_from_url(
1343
- const char * model_url,
1344
- const char * path_model,
1345
- const char * hf_token,
1346
- const struct llama_model_params & params) {
1347
- // Basic validation of the model_url
1348
- if (!model_url || strlen(model_url) == 0) {
1349
- LOG_ERR("%s: invalid model_url\n", __func__);
1350
- return NULL;
1351
- }
1352
-
1353
- if (!common_download_file(model_url, path_model, hf_token)) {
1354
- return NULL;
1355
- }
1356
-
1357
- // check for additional GGUFs split to download
1358
- int n_split = 0;
1359
- {
1360
- struct lm_gguf_init_params lm_gguf_params = {
1361
- /*.no_alloc = */ true,
1362
- /*.ctx = */ NULL,
1363
- };
1364
- auto * ctx_gguf = lm_gguf_init_from_file(path_model, lm_gguf_params);
1365
- if (!ctx_gguf) {
1366
- LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
1367
- return NULL;
1368
- }
1369
-
1370
- auto key_n_split = lm_gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
1371
- if (key_n_split >= 0) {
1372
- n_split = lm_gguf_get_val_u16(ctx_gguf, key_n_split);
1373
- }
1374
-
1375
- lm_gguf_free(ctx_gguf);
1376
- }
1377
-
1378
- if (n_split > 1) {
1379
- char split_prefix[PATH_MAX] = {0};
1380
- char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
1381
-
1382
- // Verify the first split file format
1383
- // and extract split URL and PATH prefixes
1384
- {
1385
- if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
1386
- LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
1387
- return NULL;
1388
- }
1389
-
1390
- if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
1391
- LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
1392
- return NULL;
1393
- }
1394
- }
1395
-
1396
- // Prepare download in parallel
1397
- std::vector<std::future<bool>> futures_download;
1398
- for (int idx = 1; idx < n_split; idx++) {
1399
- futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
1400
- char split_path[PATH_MAX] = {0};
1401
- llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
1402
-
1403
- char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
1404
- llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
1405
-
1406
- return common_download_file(split_url, split_path, hf_token);
1407
- }, idx));
1408
- }
1409
-
1410
- // Wait for all downloads to complete
1411
- for (auto & f : futures_download) {
1412
- if (!f.get()) {
1413
- return NULL;
1414
- }
1415
- }
1416
- }
1417
-
1418
- return llama_load_model_from_file(path_model, params);
1419
- }
1420
-
1421
- struct llama_model * common_load_model_from_hf(
1422
- const char * repo,
1423
- const char * model,
1424
- const char * path_model,
1425
- const char * hf_token,
1426
- const struct llama_model_params & params) {
1427
- // construct hugging face model url:
1428
- //
1429
- // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
1430
- // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
1431
- //
1432
- // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
1433
- // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
1434
- //
1435
-
1436
- std::string model_url = "https://huggingface.co/";
1437
- model_url += repo;
1438
- model_url += "/resolve/main/";
1439
- model_url += model;
1440
-
1441
- return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
1442
- }
1443
-
1444
- #else
1445
-
1446
- struct llama_model * common_load_model_from_url(
1447
- const char * /*model_url*/,
1448
- const char * /*path_model*/,
1449
- const char * /*hf_token*/,
1450
- const struct llama_model_params & /*params*/) {
1451
- LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
1452
- return nullptr;
1453
- }
1454
-
1455
- struct llama_model * common_load_model_from_hf(
1456
- const char * /*repo*/,
1457
- const char * /*model*/,
1458
- const char * /*path_model*/,
1459
- const char * /*hf_token*/,
1460
- const struct llama_model_params & /*params*/) {
1461
- LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
1462
- return nullptr;
1463
- }
1464
-
1465
- #endif // LLAMA_USE_CURL
1466
-
1467
- //
1468
- // Batch utils
1469
- //
1470
-
1471
- void common_batch_clear(struct llama_batch & batch) {
1472
- batch.n_tokens = 0;
1473
- }
1474
-
1475
- void common_batch_add(
1476
- struct llama_batch & batch,
1477
- llama_token id,
1478
- llama_pos pos,
1479
- const std::vector<llama_seq_id> & seq_ids,
1480
- bool logits) {
1481
- LM_GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
1482
-
1483
- batch.token [batch.n_tokens] = id;
1484
- batch.pos [batch.n_tokens] = pos;
1485
- batch.n_seq_id[batch.n_tokens] = seq_ids.size();
1486
- for (size_t i = 0; i < seq_ids.size(); ++i) {
1487
- batch.seq_id[batch.n_tokens][i] = seq_ids[i];
1488
- }
1489
- batch.logits [batch.n_tokens] = logits;
1490
-
1491
- batch.n_tokens++;
1492
- }
1493
-
1494
- //
1495
- // Vocab utils
1496
- //
1497
-
1498
- std::vector<llama_token> common_tokenize(
1499
- const struct llama_context * ctx,
1500
- const std::string & text,
1501
- bool add_special,
1502
- bool parse_special) {
1503
- return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
1504
- }
1505
-
1506
- std::vector<llama_token> common_tokenize(
1507
- const struct llama_model * model,
1508
- const std::string & text,
1509
- bool add_special,
1510
- bool parse_special) {
1511
- // upper limit for the number of tokens
1512
- int n_tokens = text.length() + 2 * add_special;
1513
- std::vector<llama_token> result(n_tokens);
1514
- n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1515
- if (n_tokens < 0) {
1516
- result.resize(-n_tokens);
1517
- int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1518
- LM_GGML_ASSERT(check == -n_tokens);
1519
- } else {
1520
- result.resize(n_tokens);
1521
- }
1522
- return result;
1523
- }
1524
-
1525
- std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1526
- std::string piece;
1527
- piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
1528
- const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
1529
- if (n_chars < 0) {
1530
- piece.resize(-n_chars);
1531
- int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
1532
- LM_GGML_ASSERT(check == -n_chars);
1533
- }
1534
- else {
1535
- piece.resize(n_chars);
1536
- }
1537
-
1538
- return piece;
1539
- }
1540
-
1541
- std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
1542
- std::string text;
1543
- text.resize(std::max(text.capacity(), tokens.size()));
1544
- int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1545
- if (n_chars < 0) {
1546
- text.resize(-n_chars);
1547
- n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1548
- LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
1549
- }
1550
-
1551
- text.resize(n_chars);
1552
-
1553
- // NOTE: the original tokenizer decodes bytes after collecting the pieces.
1554
- return text;
1555
- }
1556
-
1557
- //
1558
- // Chat template utils
1559
- //
1560
-
1561
- bool common_chat_verify_template(const std::string & tmpl) {
1562
- llama_chat_message chat[] = {{"user", "test"}};
1563
- int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
1564
- return res >= 0;
1565
- }
1566
-
1567
- std::string common_chat_apply_template(const struct llama_model * model,
1568
- const std::string & tmpl,
1569
- const std::vector<common_chat_msg> & msgs,
1570
- bool add_ass) {
1571
- int alloc_size = 0;
1572
- bool fallback = false; // indicate if we must fallback to default chatml
1573
- std::vector<llama_chat_message> chat;
1574
- for (auto & msg : msgs) {
1575
- chat.push_back({msg.role.c_str(), msg.content.c_str()});
1576
- alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
1577
- }
1578
-
1579
- const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
1580
- std::vector<char> buf(alloc_size);
1581
-
1582
- // run the first time to get the total output length
1583
- int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
1584
-
1585
- // error: chat template is not supported
1586
- if (res < 0) {
1587
- if (ptr_tmpl != nullptr) {
1588
- // if the custom "tmpl" is not supported, we throw an error
1589
- // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
1590
- throw std::runtime_error("this custom template is not supported");
1591
- } else {
1592
- // If the built-in template is not supported, we default to chatml
1593
- res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
1594
- fallback = true;
1595
- }
1596
- }
1597
-
1598
- // if it turns out that our buffer is too small, we resize it
1599
- if ((size_t) res > buf.size()) {
1600
- buf.resize(res);
1601
- res = llama_chat_apply_template(
1602
- fallback ? nullptr : model,
1603
- fallback ? "chatml" : ptr_tmpl,
1604
- chat.data(), chat.size(), add_ass, buf.data(), buf.size());
1605
- }
1606
-
1607
- std::string formatted_chat(buf.data(), res);
1608
- return formatted_chat;
1609
- }
1610
-
1611
- std::string common_chat_format_single(const struct llama_model * model,
1612
- const std::string & tmpl,
1613
- const std::vector<common_chat_msg> & past_msg,
1614
- const common_chat_msg & new_msg,
1615
- bool add_ass) {
1616
- std::ostringstream ss;
1617
- auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
1618
- std::vector<common_chat_msg> chat_new(past_msg);
1619
- // if the past_msg ends with a newline, we must preserve it in the formatted version
1620
- if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
1621
- ss << "\n";
1622
- };
1623
- // format chat with new_msg
1624
- chat_new.push_back(new_msg);
1625
- auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
1626
- // get the diff part
1627
- ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
1628
- return ss.str();
1629
- }
1630
-
1631
- std::string common_chat_format_example(const struct llama_model * model,
1632
- const std::string & tmpl) {
1633
- std::vector<common_chat_msg> msgs = {
1634
- {"system", "You are a helpful assistant"},
1635
- {"user", "Hello"},
1636
- {"assistant", "Hi there"},
1637
- {"user", "How are you?"},
1638
- };
1639
- return common_chat_apply_template(model, tmpl, msgs, true);
1640
- }
1641
-
1642
- //
1643
- // KV cache utils
1644
- //
1645
-
1646
- void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
1647
- static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
1648
-
1649
- printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
1650
- view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1651
-
1652
- llama_kv_cache_view_cell * c_curr = view.cells;
1653
- llama_seq_id * cs_curr = view.cells_sequences;
1654
-
1655
- for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1656
- if (i % row_size == 0) {
1657
- printf("\n%5d: ", i);
1658
- }
1659
- int seq_count = 0;
1660
- for (int j = 0; j < view.n_seq_max; j++) {
1661
- if (cs_curr[j] >= 0) { seq_count++; }
1662
- }
1663
- putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
1664
- }
1665
-
1666
- printf("\n=== Done dumping\n");
1667
- }
1668
-
1669
- void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
1670
- static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
1671
-
1672
- printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
1673
- view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1674
-
1675
- std::unordered_map<llama_seq_id, size_t> seqs;
1676
- llama_kv_cache_view_cell * c_curr = view.cells;
1677
- llama_seq_id * cs_curr = view.cells_sequences;
1678
-
1679
- for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1680
- for (int j = 0; j < view.n_seq_max; j++) {
1681
- if (cs_curr[j] < 0) { continue; }
1682
- if (seqs.find(cs_curr[j]) == seqs.end()) {
1683
- if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1684
- const size_t sz = seqs.size();
1685
- seqs[cs_curr[j]] = sz;
1686
- }
1687
- }
1688
- if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1689
- }
1690
-
1691
- printf("=== Sequence legend: ");
1692
- for (const auto & it : seqs) {
1693
- printf("%zu=%d, ", it.second, it.first);
1694
- }
1695
- printf("'+'=other sequence ids");
1696
-
1697
- c_curr = view.cells;
1698
- cs_curr = view.cells_sequences;
1699
- for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1700
- if (i % row_size == 0) {
1701
- printf("\n%5d: ", i);
1702
- }
1703
- for (int j = 0; j < view.n_seq_max; j++) {
1704
- if (cs_curr[j] >= 0) {
1705
- const auto & it = seqs.find(cs_curr[j]);
1706
- putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
1707
- } else {
1708
- putchar('.');
1709
- }
1710
- }
1711
- putchar(' ');
1712
- }
1713
-
1714
- printf("\n=== Done dumping\n");
1715
- }
1716
-
1717
- //
1718
- // Embedding utils
1719
- //
1720
-
1721
// Scales the `n` values of `inp` into `out` using the norm selected by `embd_norm`:
//   -1 : no normalisation (values are copied unchanged)
//    0 : max absolute value, mapped onto an int16-like range (divisor 32760)
//    2 : euclidean norm
//    * : p-norm with p = embd_norm
// If the computed norm is not positive, every output value is 0.
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double scale = 0.0;

    if (embd_norm == -1) {
        // no normalisation
        scale = 1.0;
    } else if (embd_norm == 0) {
        // max absolute
        for (int k = 0; k < n; k++) {
            const float magnitude = std::abs(inp[k]);
            if (scale < magnitude) {
                scale = magnitude;
            }
        }
        scale /= 32760.0; // make an int16 range
    } else if (embd_norm == 2) {
        // euclidean
        for (int k = 0; k < n; k++) {
            scale += inp[k] * inp[k];
        }
        scale = std::sqrt(scale);
    } else {
        // p-norm (euclidean is p-norm p=2)
        for (int k = 0; k < n; k++) {
            scale += std::pow(std::abs(inp[k]), embd_norm);
        }
        scale = std::pow(scale, 1.0 / embd_norm);
    }

    const float norm = scale > 0.0 ? 1.0 / scale : 0.0f;

    for (int k = 0; k < n; k++) {
        out[k] = inp[k] * norm;
    }
}
1754
-
1755
// Cosine similarity of two `n`-element vectors.
// Two all-zero vectors count as identical (1.0); exactly one all-zero vector gives 0.0.
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double dot   = 0.0;
    double sq_a  = 0.0;
    double sq_b  = 0.0;

    for (int k = 0; k < n; ++k) {
        const float a = embd1[k];
        const float b = embd2[k];
        dot  += a * b;
        sq_a += a * a;
        sq_b += b * b;
    }

    // Handle the case where one or both vectors are zero vectors
    const bool a_is_zero = sq_a == 0.0;
    const bool b_is_zero = sq_b == 0.0;
    if (a_is_zero && b_is_zero) {
        return 1.0f; // two zero vectors are similar
    }
    if (a_is_zero || b_is_zero) {
        return 0.0f;
    }

    return dot / (sqrt(sq_a) * sqrt(sq_b));
}
1776
-
1777
- //
1778
- // Control vector utils
1779
- //
1780
-
1781
- static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
1782
- common_control_vector_data result = { -1, {} };
1783
-
1784
- lm_ggml_context * ctx = nullptr;
1785
- struct lm_gguf_init_params meta_lm_gguf_params = {
1786
- /* .no_alloc = */ false,
1787
- /* .ctx = */ &ctx,
1788
- };
1789
- struct lm_gguf_context * ctx_gguf = lm_gguf_init_from_file(load_info.fname.c_str(), meta_lm_gguf_params);
1790
- if (!ctx_gguf) {
1791
- LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
1792
- return result;
1793
- }
1794
-
1795
- int32_t n_tensors = lm_gguf_get_n_tensors(ctx_gguf);
1796
- if (n_tensors == 0) {
1797
- LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
1798
- }
1799
-
1800
- for (int i = 0; i < n_tensors; i++) {
1801
- std::string name = lm_gguf_get_tensor_name(ctx_gguf, i);
1802
-
1803
- int layer_idx = -1;
1804
-
1805
- // split on '.'
1806
- size_t dotpos = name.find('.');
1807
- if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
1808
- try {
1809
- layer_idx = std::stoi(name.substr(dotpos + 1));
1810
- } catch (...) {
1811
- layer_idx = -1;
1812
- }
1813
- }
1814
- if (layer_idx < 0) {
1815
- LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
1816
- result.n_embd = -1;
1817
- break;
1818
- } else if (layer_idx == 0) {
1819
- LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
1820
- result.n_embd = -1;
1821
- break;
1822
- }
1823
-
1824
- struct lm_ggml_tensor * tensor = lm_ggml_get_tensor(ctx, name.c_str());
1825
- if (tensor->type != LM_GGML_TYPE_F32) {
1826
- LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
1827
- result.n_embd = -1;
1828
- break;
1829
- }
1830
- if (lm_ggml_n_dims(tensor) != 1) {
1831
- LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
1832
- result.n_embd = -1;
1833
- break;
1834
- }
1835
-
1836
- if (result.n_embd == -1) {
1837
- result.n_embd = lm_ggml_nelements(tensor);
1838
- } else if (lm_ggml_nelements(tensor) != result.n_embd) {
1839
- LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
1840
- result.n_embd = -1;
1841
- break;
1842
- }
1843
-
1844
- // extend if necessary - do not store data for layer 0 (it's not used)
1845
- result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
1846
-
1847
- const float * src = (const float *) tensor->data;
1848
- float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
1849
- for (int j = 0; j < result.n_embd; j++) {
1850
- dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
1851
- }
1852
-
1853
- }
1854
-
1855
- if (result.n_embd == -1) {
1856
- LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
1857
- result.data.clear();
1858
- }
1859
-
1860
- lm_gguf_free(ctx_gguf);
1861
- lm_ggml_free(ctx);
1862
-
1863
- return result;
1864
- }
1865
-
1866
- common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
1867
- common_control_vector_data result = { -1, {} };
1868
-
1869
- for (const auto & info : load_infos) {
1870
- auto cur = common_control_vector_load_one(info);
1871
-
1872
- if (cur.n_embd == -1) {
1873
- result.n_embd = -1;
1874
- break;
1875
- }
1876
- if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
1877
- LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
1878
- result.n_embd = -1;
1879
- break;
1880
- }
1881
-
1882
- if (result.n_embd == -1) {
1883
- result = std::move(cur);
1884
- } else {
1885
- result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
1886
- for (size_t i = 0; i < cur.data.size(); i++) {
1887
- result.data[i] += cur.data[i];
1888
- }
1889
- }
1890
- }
1891
-
1892
- if (result.n_embd == -1) {
1893
- LOG_ERR("%s: no valid control vector files passed\n", __func__);
1894
- result.data.clear();
1895
- }
1896
-
1897
- return result;
1898
- }
1899
-
1900
- //
1901
- // YAML utils
1902
- //
1903
-
1904
// Writes `prop_name: [v0, v1, ...]` to `stream`, each value formatted with %e.
// An empty vector is written as a bare `prop_name:` line.
void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
    if (data.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    fprintf(stream, "%s: [", prop_name);
    const char * separator = "";
    for (const float value : data) {
        fprintf(stream, "%s%e", separator, value);
        separator = ", ";
    }
    fprintf(stream, "]\n");
}
1916
-
1917
// Writes `prop_name: [v0, v1, ...]` to `stream`, each value formatted with %d.
// An empty vector is written as a bare `prop_name:` line.
void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
    if (data.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    fprintf(stream, "%s: [", prop_name);
    const char * separator = "";
    for (const int value : data) {
        fprintf(stream, "%s%d", separator, value);
        separator = ", ";
    }
    fprintf(stream, "]\n");
}
1929
-
1930
// Writes the string `data` under the key `prop_name` to `stream`:
//   - NULL or empty      -> bare `prop_name:` line
//   - leading or trailing whitespace -> one double-quoted line with newlines, quotes and
//     stray backslashes escaped
//   - no newline         -> plain `prop_name: value`
//   - otherwise          -> a `|` block scalar, one indented line per input line
// Fixes over the previous version: the text after the last newline is now written (it was
// always dropped, because this path is only reached when the string does not end in
// whitespace), and std::isspace is called with an unsigned char value as it requires.
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
    std::string data_str(data == NULL ? "" : data);

    if (data_str.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    if (is_space(data_str.front()) || is_space(data_str.back())) {
        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
        data_str = "\"" + data_str + "\"";
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    if (data_str.find('\n') == std::string::npos) {
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    fprintf(stream, "%s: |\n", prop_name);

    size_t pos_start = 0;
    size_t pos_found = 0;
    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
        fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
        pos_start = pos_found + 1;
    }
    // the string does not end in '\n' here, so there is always a final line left to emit
    fprintf(stream, " %s\n", data_str.substr(pos_start).c_str());
}
1961
-
1962
- void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
1963
- const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
1964
- lm_ggml_cpu_init(); // some ARM features are detected at runtime
1965
-
1966
- const auto & sparams = params.sparams;
1967
-
1968
- fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
1969
- fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
1970
- fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false");
1971
- fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false");
1972
- fprintf(stream, "cpu_has_avx_vnni: %s\n", lm_ggml_cpu_has_avx_vnni() ? "true" : "false");
1973
- fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false");
1974
- fprintf(stream, "cpu_has_avx512: %s\n", lm_ggml_cpu_has_avx512() ? "true" : "false");
1975
- fprintf(stream, "cpu_has_avx512_vbmi: %s\n", lm_ggml_cpu_has_avx512_vbmi() ? "true" : "false");
1976
- fprintf(stream, "cpu_has_avx512_vnni: %s\n", lm_ggml_cpu_has_avx512_vnni() ? "true" : "false");
1977
- fprintf(stream, "cpu_has_cuda: %s\n", lm_ggml_cpu_has_cuda() ? "true" : "false");
1978
- fprintf(stream, "cpu_has_vulkan: %s\n", lm_ggml_cpu_has_vulkan() ? "true" : "false");
1979
- fprintf(stream, "cpu_has_kompute: %s\n", lm_ggml_cpu_has_kompute() ? "true" : "false");
1980
- fprintf(stream, "cpu_has_fma: %s\n", lm_ggml_cpu_has_fma() ? "true" : "false");
1981
- fprintf(stream, "cpu_has_gpublas: %s\n", lm_ggml_cpu_has_gpublas() ? "true" : "false");
1982
- fprintf(stream, "cpu_has_neon: %s\n", lm_ggml_cpu_has_neon() ? "true" : "false");
1983
- fprintf(stream, "cpu_has_sve: %s\n", lm_ggml_cpu_has_sve() ? "true" : "false");
1984
- fprintf(stream, "cpu_has_f16c: %s\n", lm_ggml_cpu_has_f16c() ? "true" : "false");
1985
- fprintf(stream, "cpu_has_fp16_va: %s\n", lm_ggml_cpu_has_fp16_va() ? "true" : "false");
1986
- fprintf(stream, "cpu_has_riscv_v: %s\n", lm_ggml_cpu_has_riscv_v() ? "true" : "false");
1987
- fprintf(stream, "cpu_has_wasm_simd: %s\n", lm_ggml_cpu_has_wasm_simd() ? "true" : "false");
1988
- fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false");
1989
- fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false");
1990
- fprintf(stream, "cpu_has_vsx: %s\n", lm_ggml_cpu_has_vsx() ? "true" : "false");
1991
- fprintf(stream, "cpu_has_matmul_int8: %s\n", lm_ggml_cpu_has_matmul_int8() ? "true" : "false");
1992
-
1993
- #ifdef NDEBUG
1994
- fprintf(stream, "debug: false\n");
1995
- #else
1996
- fprintf(stream, "debug: true\n");
1997
- #endif // NDEBUG
1998
-
1999
- fprintf(stream, "model_desc: %s\n", model_desc);
2000
- fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
2001
-
2002
- #ifdef __OPTIMIZE__
2003
- fprintf(stream, "optimize: true\n");
2004
- #else
2005
- fprintf(stream, "optimize: false\n");
2006
- #endif // __OPTIMIZE__
2007
-
2008
- fprintf(stream, "time: %s\n", timestamp.c_str());
2009
-
2010
- fprintf(stream, "\n");
2011
- fprintf(stream, "###############\n");
2012
- fprintf(stream, "# User Inputs #\n");
2013
- fprintf(stream, "###############\n");
2014
- fprintf(stream, "\n");
2015
-
2016
- fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
2017
- fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
2018
- fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
2019
- fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
2020
- fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
2021
- fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
2022
- fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
2023
- fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
2024
- fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
2025
- fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
2026
- fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
2027
- fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
2028
- yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
2029
- fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
2030
- fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
2031
- fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
2032
- fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
2033
-
2034
- yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
2035
- fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
2036
- yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
2037
- fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
2038
- fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
2039
- fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
2040
- fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
2041
-
2042
- fprintf(stream, "logit_bias:\n");
2043
- for (const auto & logit_bias : sparams.logit_bias) {
2044
- fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
2045
- }
2046
-
2047
- fprintf(stream, "lora:\n");
2048
- for (auto & la : params.lora_adapters) {
2049
- if (la.scale == 1.0f) {
2050
- fprintf(stream, " - %s\n", la.path.c_str());
2051
- }
2052
- }
2053
- fprintf(stream, "lora_scaled:\n");
2054
- for (auto & la : params.lora_adapters) {
2055
- if (la.scale != 1.0f) {
2056
- fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
2057
- }
2058
- }
2059
- fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
2060
- fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
2061
- fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
2062
- fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
2063
- fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
2064
- fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
2065
- fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
2066
- fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
2067
- fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
2068
- fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
2069
- fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
2070
- fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
2071
- fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
2072
- fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
2073
- fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
2074
- fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
2075
- fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
2076
- fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
2077
- yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
2078
- fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
2079
- fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
2080
- fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
2081
- yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
2082
- fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
2083
-
2084
- fprintf(stream, "reverse_prompt:\n");
2085
- for (std::string ap : params.antiprompt) {
2086
- size_t pos = 0;
2087
- while ((pos = ap.find('\n', pos)) != std::string::npos) {
2088
- ap.replace(pos, 1, "\\n");
2089
- pos += 1;
2090
- }
2091
-
2092
- fprintf(stream, " - %s\n", ap.c_str());
2093
- }
2094
-
2095
- fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
2096
- fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
2097
- fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
2098
- fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
2099
- fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
2100
- fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
2101
-
2102
- const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
2103
- yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
2104
-
2105
- fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
2106
- fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
2107
- fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
2108
- fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
2109
- fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
2110
- fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
2111
- fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
2112
- fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
2113
- fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
2114
- }
1
+ #if defined(_MSC_VER)
2
+ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
3
+ #endif
4
+
5
+ #include "common.h"
6
+ #include "log.h"
7
+ #include "llama.h"
8
+
9
+ #include <algorithm>
10
+ #include <cinttypes>
11
+ #include <climits>
12
+ #include <cmath>
13
+ #include <codecvt>
14
+ #include <cstdarg>
15
+ #include <cstring>
16
+ #include <ctime>
17
+ #include <fstream>
18
+ #include <iostream>
19
+ #include <iterator>
20
+ #include <regex>
21
+ #include <sstream>
22
+ #include <string>
23
+ #include <thread>
24
+ #include <unordered_map>
25
+ #include <unordered_set>
26
+ #include <vector>
27
+
28
+ #if defined(__APPLE__) && defined(__MACH__)
29
+ #include <sys/types.h>
30
+ #include <sys/sysctl.h>
31
+ #endif
32
+
33
+ #if defined(_WIN32)
34
+ #define WIN32_LEAN_AND_MEAN
35
+ #ifndef NOMINMAX
36
+ # define NOMINMAX
37
+ #endif
38
+ #include <locale>
39
+ #include <windows.h>
40
+ #include <fcntl.h>
41
+ #include <io.h>
42
+ #else
43
+ #include <sys/ioctl.h>
44
+ #include <sys/stat.h>
45
+ #include <unistd.h>
46
+ #endif
47
+ #if defined(LLAMA_USE_CURL)
48
+ #include <curl/curl.h>
49
+ #include <curl/easy.h>
50
+ #include <future>
51
+ #endif
52
+
53
// build info: defined here with neutral defaults (0 / "unknown")
int          LLAMA_BUILD_NUMBER = 0;
const char * LLAMA_COMMIT       = "unknown";
const char * LLAMA_COMPILER     = "unknown";
const char * LLAMA_BUILD_TARGET = "unknown";
58
+
59
+ #if defined(_MSC_VER)
60
+ #pragma warning(disable: 4244 4267) // possible loss of data
61
+ #endif
62
+
63
+ #if defined(LLAMA_USE_CURL)
64
+ #ifdef __linux__
65
+ #include <linux/limits.h>
66
+ #elif defined(_WIN32)
67
+ #define PATH_MAX MAX_PATH
68
+ #else
69
+ #include <sys/syslimits.h>
70
+ #endif
71
+ #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
72
+ #endif // LLAMA_USE_CURL
73
+
74
+ //
75
+ // CPU utils
76
+ //
77
+
78
// Returns an estimate of the number of physical CPU cores.
//   Linux   : number of distinct `thread_siblings` masks found under /sys
//   macOS   : sysctl "hw.perflevel0.physicalcpu", then "hw.physicalcpu"
//   Windows : sum of GroupCount over processor-core records
// If the platform query yields nothing, falls back to a heuristic on
// std::thread::hardware_concurrency(): 4 if unknown, the value itself if <= 4, else half.
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> distinct_sibling_masks;
    uint32_t cpu_idx = 0;
    while (cpu_idx < UINT32_MAX) {
        const std::string topo_path = "/sys/devices/system/cpu/cpu"
            + std::to_string(cpu_idx) + "/topology/thread_siblings";
        std::ifstream topo_file(topo_path);
        if (!topo_file.is_open()) {
            break; // no more cpus
        }
        std::string mask_line;
        if (std::getline(topo_file, mask_line)) {
            distinct_sibling_masks.insert(mask_line);
        }
        ++cpu_idx;
    }
    if (!distinct_sibling_masks.empty()) {
        return static_cast<int32_t>(distinct_sibling_masks.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t n_cores;
    size_t n_cores_len = sizeof(n_cores);
    // try the performance-level-0 count first, then the overall physical count
    const char * const sysctl_keys[] = { "hw.perflevel0.physicalcpu", "hw.physicalcpu" };
    for (const char * key : sysctl_keys) {
        if (sysctlbyname(key, &n_cores, &n_cores_len, NULL, 0) == 0) {
            return n_cores;
        }
    }
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    const unsigned int hw_threads = std::thread::hardware_concurrency();
    unsigned int default_threads = 4;
    if (hw_threads > 0) {
        default_threads = hw_threads <= 4 ? hw_threads : hw_threads / 2;
    }

    // first call only asks for the required buffer size
    DWORD bytes_left = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bytes_left)) {
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
            return default_threads;
        }
    }

    std::vector<char> raw(bytes_left);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(raw.data()), &bytes_left)) {
        return default_threads;
    }

    // walk the variable-sized records
    int32_t n_cores = 0;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX record = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(raw.data());
    while (bytes_left > 0) {
        if (record->Relationship == RelationProcessorCore) {
            n_cores += record->Processor.GroupCount;
        }
        bytes_left -= record->Size;
        record = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(record) + record->Size);
    }

    return n_cores > 0 ? n_cores : default_threads;
#endif
    const unsigned int hw_concurrency = std::thread::hardware_concurrency();
    if (hw_concurrency == 0) {
        return 4;
    }
    return hw_concurrency <= 4 ? hw_concurrency : hw_concurrency / 2;
}
139
+
140
+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
141
+ #include <pthread.h>
142
+
143
// Executes the x86-64 CPUID instruction and stores the resulting register values.
//   leaf    - loaded into EAX before the instruction (input constraint "0" ties it to the "=a" output)
//   subleaf - loaded into ECX before the instruction (input constraint "2" ties it to the "=c" output)
//   eax/ebx/ecx/edx - receive the four result registers
// RBX is copied to RSI before CPUID and exchanged back afterwards, so RBX holds its
// original value when the asm ends and the EBX result is read from RSI ("=S").
// NOTE(review): presumably done because RBX may be reserved by the compiler - confirm.
+ static void cpuid(unsigned leaf, unsigned subleaf,
144
+ unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
145
+ __asm__("movq\t%%rbx,%%rsi\n\t"
146
+ "cpuid\n\t"
147
+ "xchgq\t%%rbx,%%rsi"
148
+ : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
149
+ : "0"(leaf), "2"(subleaf));
150
+ }
151
+
152
// Restricts the calling thread's CPU affinity to the single CPU `cpu`.
// Returns the result of pthread_setaffinity_np (0 on success, an error number otherwise).
static int pin_cpu(int cpu) {
    cpu_set_t only_this_cpu;
    CPU_ZERO(&only_this_cpu);
    CPU_SET(cpu, &only_this_cpu);

    const int rc = pthread_setaffinity_np(pthread_self(), sizeof(only_this_cpu), &only_this_cpu);
    return rc;
}
158
+
159
+ static bool is_hybrid_cpu(void) {
160
+ unsigned eax, ebx, ecx, edx;
161
+ cpuid(7, 0, &eax, &ebx, &ecx, &edx);
162
+ return !!(edx & (1u << 15));
163
+ }
164
+
165
+ static bool is_running_on_efficiency_core(void) {
166
+ unsigned eax, ebx, ecx, edx;
167
+ cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
168
+ int intel_atom = 0x20;
169
+ int core_type = (eax & 0xff000000u) >> 24;
170
+ return core_type == intel_atom;
171
+ }
172
+
173
+ static int cpu_count_math_cpus(int n_cpu) {
174
+ int result = 0;
175
+ for (int cpu = 0; cpu < n_cpu; ++cpu) {
176
+ if (pin_cpu(cpu)) {
177
+ return -1;
178
+ }
179
+ if (is_running_on_efficiency_core()) {
180
+ continue; // efficiency cores harm lockstep threading
181
+ }
182
+ ++cpu; // hyperthreading isn't useful for linear algebra
183
+ ++result;
184
+ }
185
+ return result;
186
+ }
187
+
188
+ #endif // __x86_64__ && __linux__
189
+
190
+ /**
191
+ * Returns number of CPUs on system that are useful for math.
192
+ */
193
+ int32_t cpu_get_num_math() {
194
+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
195
+ int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
196
+ if (n_cpu < 1) {
197
+ return cpu_get_num_physical_cores();
198
+ }
199
+ if (is_hybrid_cpu()) {
200
+ cpu_set_t affinity;
201
+ if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
202
+ int result = cpu_count_math_cpus(n_cpu);
203
+ pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
204
+ if (result > 0) {
205
+ return result;
206
+ }
207
+ }
208
+ }
209
+ #endif
210
+ return cpu_get_num_physical_cores();
211
+ }
212
+
213
+ // Helper for setting process priority
214
+
215
+ #if defined(_WIN32)
216
+
217
+ bool set_process_priority(enum lm_ggml_sched_priority prio) {
218
+ if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
219
+ return true;
220
+ }
221
+
222
+ DWORD p = NORMAL_PRIORITY_CLASS;
223
+ switch (prio) {
224
+ case LM_GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
225
+ case LM_GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
226
+ case LM_GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
227
+ case LM_GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
228
+ }
229
+
230
+ if (!SetPriorityClass(GetCurrentProcess(), p)) {
231
+ LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
232
+ return false;
233
+ }
234
+
235
+ return true;
236
+ }
237
+
238
+ #else // MacOS and POSIX
239
+ #include <sys/types.h>
240
+ #include <sys/resource.h>
241
+
242
+ bool set_process_priority(enum lm_ggml_sched_priority prio) {
243
+ if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
244
+ return true;
245
+ }
246
+
247
+ int p = 0;
248
+ switch (prio) {
249
+ case LM_GGML_SCHED_PRIO_NORMAL: p = 0; break;
250
+ case LM_GGML_SCHED_PRIO_MEDIUM: p = -5; break;
251
+ case LM_GGML_SCHED_PRIO_HIGH: p = -10; break;
252
+ case LM_GGML_SCHED_PRIO_REALTIME: p = -20; break;
253
+ }
254
+
255
+ if (!setpriority(PRIO_PROCESS, 0, p)) {
256
+ LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
257
+ return false;
258
+ }
259
+ return true;
260
+ }
261
+
262
+ #endif
263
+
264
+ //
265
+ // CLI argument parsing
266
+ //
267
+
268
+
269
+ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
270
+ int32_t n_set = 0;
271
+
272
+ if (cpuparams.n_threads < 0) {
273
+ // Assuming everything about cpuparams is invalid
274
+ if (role_model != nullptr) {
275
+ cpuparams = *role_model;
276
+ } else {
277
+ cpuparams.n_threads = cpu_get_num_math();
278
+ }
279
+ }
280
+
281
+ for (int32_t i = 0; i < LM_GGML_MAX_N_THREADS; i++) {
282
+ if (cpuparams.cpumask[i]) {
283
+ n_set++;
284
+ }
285
+ }
286
+
287
+ if (n_set && n_set < cpuparams.n_threads) {
288
+ // Not enough set bits, may experience performance issues.
289
+ LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
290
+ }
291
+ }
292
+
293
+ bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
294
+ size_t dash_loc = range.find('-');
295
+ if (dash_loc == std::string::npos) {
296
+ LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
297
+ return false;
298
+ }
299
+
300
+ size_t start_i;
301
+ size_t end_i;
302
+
303
+ if (dash_loc == 0) {
304
+ start_i = 0;
305
+ } else {
306
+ start_i = std::stoull(range.substr(0, dash_loc));
307
+ if (start_i >= LM_GGML_MAX_N_THREADS) {
308
+ LOG_ERR("Start index out of bounds!\n");
309
+ return false;
310
+ }
311
+ }
312
+
313
+ if (dash_loc == range.length() - 1) {
314
+ end_i = LM_GGML_MAX_N_THREADS - 1;
315
+ } else {
316
+ end_i = std::stoull(range.substr(dash_loc + 1));
317
+ if (end_i >= LM_GGML_MAX_N_THREADS) {
318
+ LOG_ERR("End index out of bounds!\n");
319
+ return false;
320
+ }
321
+ }
322
+
323
+ for (size_t i = start_i; i <= end_i; i++) {
324
+ boolmask[i] = true;
325
+ }
326
+
327
+ return true;
328
+ }
329
+
330
+ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
331
+ // Discard potential 0x prefix
332
+ size_t start_i = 0;
333
+ if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
334
+ start_i = 2;
335
+ }
336
+
337
+ size_t num_digits = mask.length() - start_i;
338
+ if (num_digits > 128) num_digits = 128;
339
+
340
+ size_t end_i = num_digits + start_i;
341
+
342
+ for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
343
+ char c = mask.at(i);
344
+ int8_t id = c;
345
+
346
+ if ((c >= '0' && c <= '9')) {
347
+ id -= '0';
348
+ } else if (c >= 'a' && c <= 'f') {
349
+ id -= 'a' - 10;
350
+ } else if (c >= 'A' && c <= 'F') {
351
+ id -= 'A' - 10;
352
+ } else {
353
+ LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
354
+ return false;
355
+ }
356
+
357
+ boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
358
+ boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
359
+ boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
360
+ boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
361
+ }
362
+
363
+ return true;
364
+ }
365
+
366
+ void common_init() {
367
+ llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
368
+ if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
369
+ common_log_add(common_log_main(), level, "%s", text);
370
+ }
371
+ }, NULL);
372
+
373
+ #ifdef NDEBUG
374
+ const char * build_type = "";
375
+ #else
376
+ const char * build_type = " (debug)";
377
+ #endif
378
+
379
+ LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
380
+ }
381
+
382
+ std::string common_params_get_system_info(const common_params & params) {
383
+ std::ostringstream os;
384
+
385
+ os << "system_info: n_threads = " << params.cpuparams.n_threads;
386
+ if (params.cpuparams_batch.n_threads != -1) {
387
+ os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
388
+ }
389
+ #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
390
+ // TODO: windows + arm64 + mingw64
391
+ DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
392
+ os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
393
+ #else
394
+ os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
395
+ #endif
396
+
397
+ return os.str();
398
+ }
399
+
400
+ //
401
+ // String utils
402
+ //
403
+
404
+ std::string string_format(const char * fmt, ...) {
405
+ va_list ap;
406
+ va_list ap2;
407
+ va_start(ap, fmt);
408
+ va_copy(ap2, ap);
409
+ int size = vsnprintf(NULL, 0, fmt, ap);
410
+ LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
411
+ std::vector<char> buf(size + 1);
412
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
413
+ LM_GGML_ASSERT(size2 == size);
414
+ va_end(ap2);
415
+ va_end(ap);
416
+ return std::string(buf.data(), size);
417
+ }
418
+
419
// Returns `str` without leading and trailing whitespace (as classified by std::isspace).
// Fix over the previous version: characters are converted to unsigned char before the
// std::isspace call; passing a negative plain char (any non-ASCII byte where char is
// signed) is undefined behaviour.
std::string string_strip(const std::string & str) {
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    size_t start = 0;
    size_t end = str.size();
    while (start < end && is_space(str[start])) {
        start++;
    }
    while (end > start && is_space(str[end - 1])) {
        end--;
    }
    return str.substr(start, end - start);
}
430
+
431
// Returns the current local time as "YYYY_MM_DD-hh_mm_ss.nnnnnnnnn", where the suffix is the
// sub-second part in nanoseconds, zero-padded to 9 digits. The fields go from most to least
// significant, so lexicographic order matches chronological order.
std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

    const clock::time_point current_time = clock::now();
    const time_t as_time_t = clock::to_time_t(current_time);
    char timestamp_no_ns[100];
    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));

    // Take the remainder against a one-second *duration*. A remainder against the bare number
    // 1000000000 operates on raw clock ticks and is only right when the clock ticks in
    // nanoseconds; with microsecond or 100ns ticks it yields more than 9 digits.
    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
        current_time.time_since_epoch() % std::chrono::seconds(1)).count();
    char timestamp_ns[11];
    snprintf(timestamp_ns, 11, "%09" PRId64, ns);

    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
446
+
447
// Replaces every non-overlapping occurrence of `search` in `s` with `replace`, scanning left
// to right. Text inserted by a replacement is never rescanned. An empty `search` is a no-op.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    // assemble the result in a separate buffer and swap it in at the end
    std::string rebuilt;
    rebuilt.reserve(s.length());

    size_t cursor = 0; // first character of `s` not yet copied
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        rebuilt.append(s, cursor, hit - cursor); // untouched text before the match
        rebuilt += replace;
        cursor = hit + search.length();
    }
    rebuilt.append(s, cursor, std::string::npos); // tail after the last match

    s = std::move(rebuilt);
}
463
+
464
// Textual form of a boolean: "true" or "false".
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
467
+
468
// Formats a list of ints as "[ a, b, c ]"; an empty list yields "[  ]".
std::string string_from(const std::vector<int> & values) {
    std::stringstream out;

    out << "[ ";
    for (size_t i = 0; i < values.size(); ++i) {
        if (i > 0) {
            out << ", "; // separator goes between elements only
        }
        out << std::to_string(values[i]);
    }
    out << " ]";

    return out.str();
}
485
+
486
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
487
+ std::stringstream buf;
488
+
489
+ buf << "[ ";
490
+
491
+ bool first = true;
492
+ for (const auto & token : tokens) {
493
+ if (!first) {
494
+ buf << ", ";
495
+ } else {
496
+ first = false;
497
+ }
498
+
499
+ auto detokenized = common_token_to_piece(ctx, token);
500
+
501
+ detokenized.erase(
502
+ std::remove_if(
503
+ detokenized.begin(),
504
+ detokenized.end(),
505
+ [](const unsigned char c) { return !std::isprint(c); }),
506
+ detokenized.end());
507
+
508
+ buf << "'" << detokenized << "'"
509
+ << ":" << std::to_string(token);
510
+ }
511
+
512
+ buf << " ]";
513
+
514
+ return buf.str();
515
+ }
516
+
517
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
518
+ std::stringstream buf;
519
+
520
+ buf << "[ ";
521
+
522
+ bool first = true;
523
+ for (int i = 0; i < batch.n_tokens; ++i) {
524
+ if (!first) {
525
+ buf << ", ";
526
+ } else {
527
+ first = false;
528
+ }
529
+
530
+ auto detokenized = common_token_to_piece(ctx, batch.token[i]);
531
+
532
+ detokenized.erase(
533
+ std::remove_if(
534
+ detokenized.begin(),
535
+ detokenized.end(),
536
+ [](const unsigned char c) { return !std::isprint(c); }),
537
+ detokenized.end());
538
+
539
+ buf << "\n" << std::to_string(i)
540
+ << ":token '" << detokenized << "'"
541
+ << ":pos " << std::to_string(batch.pos[i])
542
+ << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
543
+ << ":seq_id " << std::to_string(batch.seq_id[i][0])
544
+ << ":logits " << std::to_string(batch.logits[i]);
545
+ }
546
+
547
+ buf << " ]";
548
+
549
+ return buf.str();
550
+ }
551
+
552
// Decodes backslash escapes in place: \n \r \t \' \" \\ and two-digit \xHH.
// Unknown escapes (and \x without two parsable hex digits) are kept verbatim, backslash
// included. A lone backslash at the very end of the input is copied through unchanged.
void string_process_escapes(std::string & input) {
    const std::size_t len = input.length();
    std::size_t write = 0; // next output position, never ahead of `read`
    std::size_t read  = 0;

    while (read < len) {
        const char cur = input[read];

        // ordinary character, or a backslash with nothing after it
        if (cur != '\\' || read + 1 >= len) {
            input[write++] = cur;
            ++read;
            continue;
        }

        const char code = input[++read]; // character following the backslash
        char decoded    = 0;
        bool recognized = true;

        if (code == 'n') {
            decoded = '\n';
        } else if (code == 'r') {
            decoded = '\r';
        } else if (code == 't') {
            decoded = '\t';
        } else if (code == '\'') {
            decoded = '\'';
        } else if (code == '\"') {
            decoded = '\"';
        } else if (code == '\\') {
            decoded = '\\';
        } else if (code == 'x') {
            // Handle \x12, etc
            recognized = false;
            if (read + 2 < len) {
                const char hex[3] = { input[read + 1], input[read + 2], 0 };
                char * stop = nullptr;
                const long value = std::strtol(hex, &stop, 16);
                if (stop == hex + 2) { // both characters were consumed by the parse
                    read += 2;
                    decoded    = char(value);
                    recognized = true;
                }
            }
        } else {
            recognized = false;
        }

        if (recognized) {
            input[write++] = decoded;
        } else {
            // keep the sequence as written: backslash followed by the escape character
            input[write++] = '\\';
            input[write++] = input[read];
        }
        ++read;
    }

    input.resize(write);
}
588
+
589
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
590
+ const char * sep = strchr(data, '=');
591
+ if (sep == nullptr || sep - data >= 128) {
592
+ LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
593
+ return false;
594
+ }
595
+ llama_model_kv_override kvo;
596
+ std::strncpy(kvo.key, data, sep - data);
597
+ kvo.key[sep - data] = 0;
598
+ sep++;
599
+ if (strncmp(sep, "int:", 4) == 0) {
600
+ sep += 4;
601
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
602
+ kvo.val_i64 = std::atol(sep);
603
+ } else if (strncmp(sep, "float:", 6) == 0) {
604
+ sep += 6;
605
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
606
+ kvo.val_f64 = std::atof(sep);
607
+ } else if (strncmp(sep, "bool:", 5) == 0) {
608
+ sep += 5;
609
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
610
+ if (std::strcmp(sep, "true") == 0) {
611
+ kvo.val_bool = true;
612
+ } else if (std::strcmp(sep, "false") == 0) {
613
+ kvo.val_bool = false;
614
+ } else {
615
+ LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
616
+ return false;
617
+ }
618
+ } else if (strncmp(sep, "str:", 4) == 0) {
619
+ sep += 4;
620
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
621
+ if (strlen(sep) > 127) {
622
+ LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
623
+ return false;
624
+ }
625
+ strncpy(kvo.val_str, sep, 127);
626
+ kvo.val_str[127] = '\0';
627
+ } else {
628
+ LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
629
+ return false;
630
+ }
631
+ overrides.emplace_back(std::move(kvo));
632
+ return true;
633
+ }
634
+
635
+ //
636
+ // Filesystem utils
637
+ //
638
+
639
// Checks whether a single file name (not a path) is safe to use.
// For a full path, split it on the OS-specific separator and validate each part separately.
bool fs_validate_filename(const std::string & filename) {
    // Empty names are invalid. Names over 255 bytes exceed the common maximum on Linux
    // filesystems, so reject them before doing further work (systems with a smaller limit
    // will have the OS catch it).
    if (filename.empty() || filename.length() > 255) {
        return false;
    }

    std::u32string codepoints;
    try {
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf8_codec;
        codepoints = utf8_codec.from_bytes(filename);

        // A round trip that does not reproduce the input means overlong UTF-8 sequences or
        // invalid encodings were present. Reject those.
        if (utf8_codec.to_bytes(codepoints) != filename) {
            return false;
        }
    } catch (const std::exception &) {
        return false;
    }

    // Forbidden codepoints: control characters, Unicode look-alikes of illegal characters,
    // UTF-16 surrogates, the replacement character, the BOM, and / \ : * ? " < > |
    const auto is_forbidden = [](char32_t c) {
        switch (c) {
            case 0x7F:   // Control characters (DEL)
            case 0xFF0E: // Fullwidth Full Stop (period equivalent)
            case 0x2215: // Division Slash (forward slash equivalent)
            case 0x2216: // Set Minus (backslash equivalent)
            case 0xFFFD: // Replacement Character (UTF-8)
            case 0xFEFF: // Byte Order Mark (BOM)
            case '/': case '\\': case ':': case '*': // Illegal characters
            case '?': case '"': case '<': case '>': case '|':
                return true;
            default:
                break;
        }
        return c <= 0x1F                      // Control characters (C0)
            || (c >= 0x80 && c <= 0x9F)       // Control characters (C1)
            || (c >= 0xD800 && c <= 0xDFFF);  // UTF-16 surrogate pairs
    };
    for (const char32_t c : codepoints) {
        if (is_forbidden(c)) {
            return false;
        }
    }

    // Leading/trailing ' ' and a trailing '.' are stripped on Windows, which would silently
    // change the name. Only the 0x20 space matters here, not other whitespace.
    const char first = filename.front();
    const char last  = filename.back();
    if (first == ' ' || last == ' ' || last == '.') {
        return false;
    }

    // Any ".." is rejected (currently stricter than necessary, checking for == ".." would do)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // "." itself is not a usable name
    return filename != ".";
}
709
+
710
// Creates every directory prefix of `path` that ends at a path separator ('\\' on Windows,
// '/' elsewhere). A final component that is not followed by a separator is not created.
// If `path` already exists, the result says whether it is a directory.
// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    const std::wstring wpath = converter.from_bytes(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    // The separator is searched in the wide string: offsets into the UTF-8 `path` do not
    // match offsets into `wpath` once the path contains non-ASCII characters.
    while ((pos_slash = wpath.find(L'\\', pos_slash)) != std::wstring::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
        if (!success) {
            const DWORD error = GetLastError();

            // anything other than "already exists" is a hard failure
            if (error != ERROR_ALREADY_EXISTS) {
                return false;
            }

            // if the path already exists, ensure that it's a directory
            const DWORD sub_attributes = GetFileAttributesW(subpath.c_str());
            if (sub_attributes == INVALID_FILE_ATTRIBUTES || !(sub_attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat path_info;
    if (stat(path.c_str(), &path_info) == 0) {
        return S_ISDIR(path_info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        struct stat sub_info;

        // if the path already exists, ensure that it's a directory
        if (stat(subpath.c_str(), &sub_info) == 0) {
            if (!S_ISDIR(sub_info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(subpath.c_str(), 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}
781
+
782
+ std::string fs_get_cache_directory() {
783
+ std::string cache_directory = "";
784
+ auto ensure_trailing_slash = [](std::string p) {
785
+ // Make sure to add trailing slash
786
+ if (p.back() != DIRECTORY_SEPARATOR) {
787
+ p += DIRECTORY_SEPARATOR;
788
+ }
789
+ return p;
790
+ };
791
+ if (getenv("LLAMA_CACHE")) {
792
+ cache_directory = std::getenv("LLAMA_CACHE");
793
+ } else {
794
+ #ifdef __linux__
795
+ if (std::getenv("XDG_CACHE_HOME")) {
796
+ cache_directory = std::getenv("XDG_CACHE_HOME");
797
+ } else {
798
+ cache_directory = std::getenv("HOME") + std::string("/.cache/");
799
+ }
800
+ #elif defined(__APPLE__)
801
+ cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
802
+ #elif defined(_WIN32)
803
+ cache_directory = std::getenv("LOCALAPPDATA");
804
+ #endif // __linux__
805
+ cache_directory = ensure_trailing_slash(cache_directory);
806
+ cache_directory += "llama.cpp";
807
+ }
808
+ return ensure_trailing_slash(cache_directory);
809
+ }
810
+
811
+ std::string fs_get_cache_file(const std::string & filename) {
812
+ LM_GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
813
+ std::string cache_directory = fs_get_cache_directory();
814
+ const bool success = fs_create_directory_with_parents(cache_directory);
815
+ if (!success) {
816
+ throw std::runtime_error("failed to create cache directory: " + cache_directory);
817
+ }
818
+ return cache_directory + filename;
819
+ }
820
+
821
+
822
+ //
823
+ // Model utils
824
+ //
825
+ struct common_init_result common_init_from_params(common_params & params) {
826
+ common_init_result iparams;
827
+ auto mparams = common_model_params_to_llama(params);
828
+
829
+ llama_model * model = nullptr;
830
+
831
+ if (!params.hf_repo.empty() && !params.hf_file.empty()) {
832
+ model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
833
+ } else if (!params.model_url.empty()) {
834
+ model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
835
+ } else {
836
+ model = llama_load_model_from_file(params.model.c_str(), mparams);
837
+ }
838
+
839
+ if (model == NULL) {
840
+ LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
841
+ return iparams;
842
+ }
843
+
844
+ if (params.reranking) {
845
+ bool ok = true;
846
+
847
+ if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
848
+ LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
849
+ ok = false;
850
+ }
851
+
852
+ if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
853
+ LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
854
+ ok = false;
855
+ }
856
+
857
+ if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
858
+ LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
859
+ ok = false;
860
+ }
861
+
862
+ if (!ok) {
863
+ llama_free_model(model);
864
+
865
+ return iparams;
866
+ }
867
+ }
868
+
869
+ auto cparams = common_context_params_to_llama(params);
870
+
871
+ llama_context * lctx = llama_new_context_with_model(model, cparams);
872
+ if (lctx == NULL) {
873
+ LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
874
+ llama_free_model(model);
875
+ return iparams;
876
+ }
877
+
878
+ if (!params.control_vectors.empty()) {
879
+ if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
880
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
881
+
882
+ const auto cvec = common_control_vector_load(params.control_vectors);
883
+ if (cvec.n_embd == -1) {
884
+ llama_free(lctx);
885
+ llama_free_model(model);
886
+
887
+ return iparams;
888
+ }
889
+
890
+ int err = llama_control_vector_apply(lctx,
891
+ cvec.data.data(),
892
+ cvec.data.size(),
893
+ cvec.n_embd,
894
+ params.control_vector_layer_start,
895
+ params.control_vector_layer_end);
896
+ if (err) {
897
+ llama_free(lctx);
898
+ llama_free_model(model);
899
+
900
+ return iparams;
901
+ }
902
+ }
903
+
904
+ // load and optionally apply lora adapters
905
+ for (auto & la : params.lora_adapters) {
906
+ common_lora_adapter_container loaded_la;
907
+ loaded_la.path = la.path;
908
+ loaded_la.scale = la.scale;
909
+ loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
910
+ if (loaded_la.adapter == nullptr) {
911
+ LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
912
+ llama_free(lctx);
913
+ llama_free_model(model);
914
+ return iparams;
915
+ }
916
+ iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
917
+ }
918
+ if (!params.lora_init_without_apply) {
919
+ common_lora_adapters_apply(lctx, iparams.lora_adapters);
920
+ }
921
+
922
+ if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
923
+ LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
924
+ params.sparams.ignore_eos = false;
925
+ }
926
+
927
+ if (params.warmup) {
928
+ LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
929
+
930
+ std::vector<llama_token> tmp;
931
+ llama_token bos = llama_token_bos(model);
932
+ llama_token eos = llama_token_eos(model);
933
+ // some models (e.g. T5) don't have a BOS token
934
+ if (bos != LLAMA_TOKEN_NULL) {
935
+ tmp.push_back(bos);
936
+ }
937
+ if (eos != LLAMA_TOKEN_NULL) {
938
+ tmp.push_back(eos);
939
+ }
940
+ if (tmp.empty()) {
941
+ tmp.push_back(0);
942
+ }
943
+
944
+ if (llama_model_has_encoder(model)) {
945
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
946
+ llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
947
+ if (decoder_start_token_id == -1) {
948
+ decoder_start_token_id = bos;
949
+ }
950
+ tmp.clear();
951
+ tmp.push_back(decoder_start_token_id);
952
+ }
953
+ if (llama_model_has_decoder(model)) {
954
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
955
+ }
956
+ llama_kv_cache_clear(lctx);
957
+ llama_synchronize(lctx);
958
+ llama_perf_context_reset(lctx);
959
+ }
960
+
961
+ iparams.model = model;
962
+ iparams.context = lctx;
963
+
964
+ return iparams;
965
+ }
966
+
967
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
968
+ llama_lora_adapter_clear(ctx);
969
+ for (auto & la : lora_adapters) {
970
+ if (la.scale != 0.0f) {
971
+ llama_lora_adapter_set(ctx, la.adapter, la.scale);
972
+ }
973
+ }
974
+ }
975
+
976
+ struct llama_model_params common_model_params_to_llama(const common_params & params) {
977
+ auto mparams = llama_model_default_params();
978
+
979
+ if (params.n_gpu_layers != -1) {
980
+ mparams.n_gpu_layers = params.n_gpu_layers;
981
+ }
982
+
983
+ mparams.progress_callback_user_data = params.progress_callback_user_data;
984
+ mparams.progress_callback = params.progress_callback;
985
+ mparams.vocab_only = params.vocab_only;
986
+ mparams.rpc_servers = params.rpc_servers.c_str();
987
+ mparams.main_gpu = params.main_gpu;
988
+ mparams.split_mode = params.split_mode;
989
+ mparams.tensor_split = params.tensor_split;
990
+ mparams.use_mmap = params.use_mmap;
991
+ mparams.use_mlock = params.use_mlock;
992
+ mparams.check_tensors = params.check_tensors;
993
+ if (params.kv_overrides.empty()) {
994
+ mparams.kv_overrides = NULL;
995
+ } else {
996
+ LM_GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
997
+ mparams.kv_overrides = params.kv_overrides.data();
998
+ }
999
+
1000
+ return mparams;
1001
+ }
1002
+
1003
+ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
1004
+ if (s == "f32") {
1005
+ return LM_GGML_TYPE_F32;
1006
+ }
1007
+ if (s == "f16") {
1008
+ return LM_GGML_TYPE_F16;
1009
+ }
1010
+ if (s == "bf16") {
1011
+ return LM_GGML_TYPE_BF16;
1012
+ }
1013
+ if (s == "q8_0") {
1014
+ return LM_GGML_TYPE_Q8_0;
1015
+ }
1016
+ if (s == "q4_0") {
1017
+ return LM_GGML_TYPE_Q4_0;
1018
+ }
1019
+ if (s == "q4_1") {
1020
+ return LM_GGML_TYPE_Q4_1;
1021
+ }
1022
+ if (s == "iq4_nl") {
1023
+ return LM_GGML_TYPE_IQ4_NL;
1024
+ }
1025
+ if (s == "q5_0") {
1026
+ return LM_GGML_TYPE_Q5_0;
1027
+ }
1028
+ if (s == "q5_1") {
1029
+ return LM_GGML_TYPE_Q5_1;
1030
+ }
1031
+
1032
+ throw std::runtime_error("Unsupported cache type: " + s);
1033
+ }
1034
+
1035
+ struct llama_context_params common_context_params_to_llama(const common_params & params) {
1036
+ auto cparams = llama_context_default_params();
1037
+
1038
+ cparams.n_ctx = params.n_ctx;
1039
+ cparams.n_seq_max = params.n_parallel;
1040
+ cparams.n_batch = params.n_batch;
1041
+ cparams.n_ubatch = params.n_ubatch;
1042
+ cparams.n_threads = params.cpuparams.n_threads;
1043
+ cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
1044
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
1045
+ cparams.logits_all = params.logits_all;
1046
+ cparams.embeddings = params.embedding;
1047
+ cparams.rope_scaling_type = params.rope_scaling_type;
1048
+ cparams.rope_freq_base = params.rope_freq_base;
1049
+ cparams.rope_freq_scale = params.rope_freq_scale;
1050
+ cparams.yarn_ext_factor = params.yarn_ext_factor;
1051
+ cparams.yarn_attn_factor = params.yarn_attn_factor;
1052
+ cparams.yarn_beta_fast = params.yarn_beta_fast;
1053
+ cparams.yarn_beta_slow = params.yarn_beta_slow;
1054
+ cparams.yarn_orig_ctx = params.yarn_orig_ctx;
1055
+ cparams.pooling_type = params.pooling_type;
1056
+ cparams.attention_type = params.attention_type;
1057
+ cparams.defrag_thold = params.defrag_thold;
1058
+ cparams.cb_eval = params.cb_eval;
1059
+ cparams.cb_eval_user_data = params.cb_eval_user_data;
1060
+ cparams.offload_kqv = !params.no_kv_offload;
1061
+ cparams.flash_attn = params.flash_attn;
1062
+ cparams.no_perf = params.no_perf;
1063
+
1064
+ if (params.reranking) {
1065
+ cparams.embeddings = true;
1066
+ cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
1067
+ }
1068
+
1069
+ cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
1070
+ cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
1071
+
1072
+ return cparams;
1073
+ }
1074
+
1075
+ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
1076
+ struct lm_ggml_threadpool_params tpp;
1077
+
1078
+ lm_ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
1079
+
1080
+ if (params.mask_valid) {
1081
+ std::memcpy(&tpp.cpumask, &params.cpumask, LM_GGML_MAX_N_THREADS);
1082
+ }
1083
+
1084
+ tpp.prio = params.priority;
1085
+ tpp.poll = params.poll;
1086
+ tpp.strict_cpu = params.strict_cpu;
1087
+
1088
+ return tpp;
1089
+ }
1090
+
1091
+ #ifdef LLAMA_USE_CURL
1092
+
1093
+ #define CURL_MAX_RETRY 3
1094
+ #define CURL_RETRY_DELAY_SECONDS 2
1095
+
1096
+
1097
// True when `str` begins with `prefix`; an empty prefix always matches.
// Stand-in for C++20's std::string::starts_with.
static bool starts_with(const std::string & str, const std::string & prefix) {
    if (prefix.size() > str.size()) {
        return false;
    }
    return str.compare(0, prefix.size(), prefix) == 0;
}
1101
+
1102
+ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
1103
+ int remaining_attempts = max_attempts;
1104
+
1105
+ while (remaining_attempts > 0) {
1106
+ LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
1107
+
1108
+ CURLcode res = curl_easy_perform(curl);
1109
+ if (res == CURLE_OK) {
1110
+ return true;
1111
+ }
1112
+
1113
+ int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
1114
+ LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
1115
+
1116
+ remaining_attempts--;
1117
+ std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
1118
+ }
1119
+
1120
+ LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
1121
+
1122
+ return false;
1123
+ }
1124
+
1125
+ struct llama_model * common_load_model_from_url(
1126
+ const char * model_url,
1127
+ const char * path_model,
1128
+ const char * hf_token,
1129
+ const struct llama_model_params & params) {
1130
+ // Basic validation of the model_url
1131
+ if (!model_url || strlen(model_url) == 0) {
1132
+ LOG_ERR("%s: invalid model_url\n", __func__);
1133
+ return NULL;
1134
+ }
1135
+
1136
+ if (!common_download_file(model_url, path_model, hf_token)) {
1137
+ return NULL;
1138
+ }
1139
+
1140
+ // check for additional GGUFs split to download
1141
+ int n_split = 0;
1142
+ {
1143
+ struct lm_gguf_init_params lm_gguf_params = {
1144
+ /*.no_alloc = */ true,
1145
+ /*.ctx = */ NULL,
1146
+ };
1147
+ auto * ctx_gguf = lm_gguf_init_from_file(path_model, lm_gguf_params);
1148
+ if (!ctx_gguf) {
1149
+ LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
1150
+ return NULL;
1151
+ }
1152
+
1153
+ auto key_n_split = lm_gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
1154
+ if (key_n_split >= 0) {
1155
+ n_split = lm_gguf_get_val_u16(ctx_gguf, key_n_split);
1156
+ }
1157
+
1158
+ lm_gguf_free(ctx_gguf);
1159
+ }
1160
+
1161
+ if (n_split > 1) {
1162
+ char split_prefix[PATH_MAX] = {0};
1163
+ char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
1164
+
1165
+ // Verify the first split file format
1166
+ // and extract split URL and PATH prefixes
1167
+ {
1168
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
1169
+ LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
1170
+ return NULL;
1171
+ }
1172
+
1173
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
1174
+ LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
1175
+ return NULL;
1176
+ }
1177
+ }
1178
+
1179
+ // Prepare download in parallel
1180
+ std::vector<std::future<bool>> futures_download;
1181
+ for (int idx = 1; idx < n_split; idx++) {
1182
+ futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
1183
+ char split_path[PATH_MAX] = {0};
1184
+ llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
1185
+
1186
+ char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
1187
+ llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
1188
+
1189
+ return common_download_file(split_url, split_path, hf_token);
1190
+ }, idx));
1191
+ }
1192
+
1193
+ // Wait for all downloads to complete
1194
+ for (auto & f : futures_download) {
1195
+ if (!f.get()) {
1196
+ return NULL;
1197
+ }
1198
+ }
1199
+ }
1200
+
1201
+ return llama_load_model_from_file(path_model, params);
1202
+ }
1203
+
1204
+ struct llama_model * common_load_model_from_hf(
1205
+ const char * repo,
1206
+ const char * model,
1207
+ const char * path_model,
1208
+ const char * hf_token,
1209
+ const struct llama_model_params & params) {
1210
+ // construct hugging face model url:
1211
+ //
1212
+ // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
1213
+ // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
1214
+ //
1215
+ // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
1216
+ // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
1217
+ //
1218
+
1219
+ std::string model_url = "https://huggingface.co/";
1220
+ model_url += repo;
1221
+ model_url += "/resolve/main/";
1222
+ model_url += model;
1223
+
1224
+ return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
1225
+ }
1226
+
1227
+ #else
1228
+
1229
+ struct llama_model * common_load_model_from_url(
1230
+ const char * /*model_url*/,
1231
+ const char * /*path_model*/,
1232
+ const char * /*hf_token*/,
1233
+ const struct llama_model_params & /*params*/) {
1234
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
1235
+ return nullptr;
1236
+ }
1237
+
1238
+ struct llama_model * common_load_model_from_hf(
1239
+ const char * /*repo*/,
1240
+ const char * /*model*/,
1241
+ const char * /*path_model*/,
1242
+ const char * /*hf_token*/,
1243
+ const struct llama_model_params & /*params*/) {
1244
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
1245
+ return nullptr;
1246
+ }
1247
+
1248
+ #endif // LLAMA_USE_CURL
1249
+
1250
+ //
1251
+ // Batch utils
1252
+ //
1253
+
1254
+ void common_batch_clear(struct llama_batch & batch) {
1255
+ batch.n_tokens = 0;
1256
+ }
1257
+
1258
+ void common_batch_add(
1259
+ struct llama_batch & batch,
1260
+ llama_token id,
1261
+ llama_pos pos,
1262
+ const std::vector<llama_seq_id> & seq_ids,
1263
+ bool logits) {
1264
+ LM_GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
1265
+
1266
+ batch.token [batch.n_tokens] = id;
1267
+ batch.pos [batch.n_tokens] = pos;
1268
+ batch.n_seq_id[batch.n_tokens] = seq_ids.size();
1269
+ for (size_t i = 0; i < seq_ids.size(); ++i) {
1270
+ batch.seq_id[batch.n_tokens][i] = seq_ids[i];
1271
+ }
1272
+ batch.logits [batch.n_tokens] = logits;
1273
+
1274
+ batch.n_tokens++;
1275
+ }
1276
+
1277
+ //
1278
+ // Vocab utils
1279
+ //
1280
+
1281
+ std::vector<llama_token> common_tokenize(
1282
+ const struct llama_context * ctx,
1283
+ const std::string & text,
1284
+ bool add_special,
1285
+ bool parse_special) {
1286
+ return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
1287
+ }
1288
+
1289
+ std::vector<llama_token> common_tokenize(
1290
+ const struct llama_model * model,
1291
+ const std::string & text,
1292
+ bool add_special,
1293
+ bool parse_special) {
1294
+ // upper limit for the number of tokens
1295
+ int n_tokens = text.length() + 2 * add_special;
1296
+ std::vector<llama_token> result(n_tokens);
1297
+ n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1298
+ if (n_tokens < 0) {
1299
+ result.resize(-n_tokens);
1300
+ int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1301
+ LM_GGML_ASSERT(check == -n_tokens);
1302
+ } else {
1303
+ result.resize(n_tokens);
1304
+ }
1305
+ return result;
1306
+ }
1307
+
1308
+ std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1309
+ std::string piece;
1310
+ piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
1311
+ const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
1312
+ if (n_chars < 0) {
1313
+ piece.resize(-n_chars);
1314
+ int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
1315
+ LM_GGML_ASSERT(check == -n_chars);
1316
+ }
1317
+ else {
1318
+ piece.resize(n_chars);
1319
+ }
1320
+
1321
+ return piece;
1322
+ }
1323
+
1324
+ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
1325
+ std::string text;
1326
+ text.resize(std::max(text.capacity(), tokens.size()));
1327
+ int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1328
+ if (n_chars < 0) {
1329
+ text.resize(-n_chars);
1330
+ n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1331
+ LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
1332
+ }
1333
+
1334
+ text.resize(n_chars);
1335
+
1336
+ // NOTE: the original tokenizer decodes bytes after collecting the pieces.
1337
+ return text;
1338
+ }
1339
+
1340
+ //
1341
+ // Chat template utils
1342
+ //
1343
+
1344
+ bool common_chat_verify_template(const std::string & tmpl) {
1345
+ llama_chat_message chat[] = {{"user", "test"}};
1346
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
1347
+ return res >= 0;
1348
+ }
1349
+
1350
+ std::string common_chat_apply_template(const struct llama_model * model,
1351
+ const std::string & tmpl,
1352
+ const std::vector<common_chat_msg> & msgs,
1353
+ bool add_ass) {
1354
+ int alloc_size = 0;
1355
+ bool fallback = false; // indicate if we must fallback to default chatml
1356
+ std::vector<llama_chat_message> chat;
1357
+ for (auto & msg : msgs) {
1358
+ chat.push_back({msg.role.c_str(), msg.content.c_str()});
1359
+ alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
1360
+ }
1361
+
1362
+ const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
1363
+ std::vector<char> buf(alloc_size);
1364
+
1365
+ // run the first time to get the total output length
1366
+ int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
1367
+
1368
+ // error: chat template is not supported
1369
+ if (res < 0) {
1370
+ if (ptr_tmpl != nullptr) {
1371
+ // if the custom "tmpl" is not supported, we throw an error
1372
+ // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
1373
+ throw std::runtime_error("this custom template is not supported");
1374
+ } else {
1375
+ // If the built-in template is not supported, we default to chatml
1376
+ res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
1377
+ fallback = true;
1378
+ }
1379
+ }
1380
+
1381
+ // if it turns out that our buffer is too small, we resize it
1382
+ if ((size_t) res > buf.size()) {
1383
+ buf.resize(res);
1384
+ res = llama_chat_apply_template(
1385
+ fallback ? nullptr : model,
1386
+ fallback ? "chatml" : ptr_tmpl,
1387
+ chat.data(), chat.size(), add_ass, buf.data(), buf.size());
1388
+ }
1389
+
1390
+ std::string formatted_chat(buf.data(), res);
1391
+ return formatted_chat;
1392
+ }
1393
+
1394
+ std::string common_chat_format_single(const struct llama_model * model,
1395
+ const std::string & tmpl,
1396
+ const std::vector<common_chat_msg> & past_msg,
1397
+ const common_chat_msg & new_msg,
1398
+ bool add_ass) {
1399
+ std::ostringstream ss;
1400
+ auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
1401
+ std::vector<common_chat_msg> chat_new(past_msg);
1402
+ // if the past_msg ends with a newline, we must preserve it in the formatted version
1403
+ if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
1404
+ ss << "\n";
1405
+ };
1406
+ // format chat with new_msg
1407
+ chat_new.push_back(new_msg);
1408
+ auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
1409
+ // get the diff part
1410
+ ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
1411
+ return ss.str();
1412
+ }
1413
+
1414
+ std::string common_chat_format_example(const struct llama_model * model,
1415
+ const std::string & tmpl) {
1416
+ std::vector<common_chat_msg> msgs = {
1417
+ {"system", "You are a helpful assistant"},
1418
+ {"user", "Hello"},
1419
+ {"assistant", "Hi there"},
1420
+ {"user", "How are you?"},
1421
+ };
1422
+ return common_chat_apply_template(model, tmpl, msgs, true);
1423
+ }
1424
+
1425
+ //
1426
+ // KV cache utils
1427
+ //
1428
+
1429
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
1430
+ static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
1431
+
1432
+ printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
1433
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1434
+
1435
+ llama_kv_cache_view_cell * c_curr = view.cells;
1436
+ llama_seq_id * cs_curr = view.cells_sequences;
1437
+
1438
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1439
+ if (i % row_size == 0) {
1440
+ printf("\n%5d: ", i);
1441
+ }
1442
+ int seq_count = 0;
1443
+ for (int j = 0; j < view.n_seq_max; j++) {
1444
+ if (cs_curr[j] >= 0) { seq_count++; }
1445
+ }
1446
+ putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
1447
+ }
1448
+
1449
+ printf("\n=== Done dumping\n");
1450
+ }
1451
+
1452
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
1453
+ static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
1454
+
1455
+ printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
1456
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1457
+
1458
+ std::unordered_map<llama_seq_id, size_t> seqs;
1459
+ llama_kv_cache_view_cell * c_curr = view.cells;
1460
+ llama_seq_id * cs_curr = view.cells_sequences;
1461
+
1462
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1463
+ for (int j = 0; j < view.n_seq_max; j++) {
1464
+ if (cs_curr[j] < 0) { continue; }
1465
+ if (seqs.find(cs_curr[j]) == seqs.end()) {
1466
+ if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1467
+ const size_t sz = seqs.size();
1468
+ seqs[cs_curr[j]] = sz;
1469
+ }
1470
+ }
1471
+ if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1472
+ }
1473
+
1474
+ printf("=== Sequence legend: ");
1475
+ for (const auto & it : seqs) {
1476
+ printf("%zu=%d, ", it.second, it.first);
1477
+ }
1478
+ printf("'+'=other sequence ids");
1479
+
1480
+ c_curr = view.cells;
1481
+ cs_curr = view.cells_sequences;
1482
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1483
+ if (i % row_size == 0) {
1484
+ printf("\n%5d: ", i);
1485
+ }
1486
+ for (int j = 0; j < view.n_seq_max; j++) {
1487
+ if (cs_curr[j] >= 0) {
1488
+ const auto & it = seqs.find(cs_curr[j]);
1489
+ putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
1490
+ } else {
1491
+ putchar('.');
1492
+ }
1493
+ }
1494
+ putchar(' ');
1495
+ }
1496
+
1497
+ printf("\n=== Done dumping\n");
1498
+ }
1499
+
1500
+ //
1501
+ // Embedding utils
1502
+ //
1503
+
1504
// Scale the `n` values of `inp` by the reciprocal of a norm and write them to `out`.
//
// embd_norm selects the norm:
//   -1    : none (values are copied unchanged)
//    0    : max absolute value divided by 32760 (maps the input to an int16-like range)
//    2    : euclidean
//   other : p-norm with p = embd_norm
//
// If the computed norm is not positive (e.g. an all-zero input) every output is 0.
// `inp` and `out` must each hold at least `n` floats.
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double sum = 0.0;

    switch (embd_norm) {
        case -1: // no normalisation
            sum = 1.0;
            break;
        case 0: // max absolute
            for (int i = 0; i < n; i++) {
                const double mag = std::abs(static_cast<double>(inp[i]));
                if (sum < mag) {
                    sum = mag;
                }
            }
            sum /= 32760.0; // make an int16 range
            break;
        case 2: // euclidean
            for (int i = 0; i < n; i++) {
                // square in double: a float*float product can overflow to inf (and loses
                // precision) before it ever reaches the double accumulator
                const double v = inp[i];
                sum += v * v;
            }
            sum = std::sqrt(sum);
            break;
        default: // p-norm (euclidean is p-norm p=2)
            for (int i = 0; i < n; i++) {
                sum += std::pow(std::abs(static_cast<double>(inp[i])), embd_norm);
            }
            sum = std::pow(sum, 1.0 / embd_norm);
            break;
    }

    const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * norm;
    }
}
1537
+
1538
// Cosine similarity of two vectors of length `n`.
//
// Special cases: if both vectors have zero norm the result is 1.0 (they are
// treated as similar); if exactly one has zero norm the result is 0.0.
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double dot = 0.0;
    double sq1 = 0.0;
    double sq2 = 0.0;

    for (int i = 0; i < n; i++) {
        // promote before multiplying: float products can overflow to inf (which then
        // turns the final quotient into NaN) and drop bits a double product keeps
        const double a = embd1[i];
        const double b = embd2[i];
        dot += a * b;
        sq1 += a * a;
        sq2 += b * b;
    }

    // Handle the case where one or both vectors are zero vectors
    if (sq1 == 0.0 || sq2 == 0.0) {
        if (sq1 == 0.0 && sq2 == 0.0) {
            return 1.0f; // two zero vectors are similar
        }
        return 0.0f;
    }

    return static_cast<float>(dot / (std::sqrt(sq1) * std::sqrt(sq2)));
}
1559
+
1560
+ //
1561
+ // Control vector utils
1562
+ //
1563
+
1564
+ static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
1565
+ common_control_vector_data result = { -1, {} };
1566
+
1567
+ lm_ggml_context * ctx = nullptr;
1568
+ struct lm_gguf_init_params meta_lm_gguf_params = {
1569
+ /* .no_alloc = */ false,
1570
+ /* .ctx = */ &ctx,
1571
+ };
1572
+ struct lm_gguf_context * ctx_gguf = lm_gguf_init_from_file(load_info.fname.c_str(), meta_lm_gguf_params);
1573
+ if (!ctx_gguf) {
1574
+ LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
1575
+ return result;
1576
+ }
1577
+
1578
+ int32_t n_tensors = lm_gguf_get_n_tensors(ctx_gguf);
1579
+ if (n_tensors == 0) {
1580
+ LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
1581
+ }
1582
+
1583
+ for (int i = 0; i < n_tensors; i++) {
1584
+ std::string name = lm_gguf_get_tensor_name(ctx_gguf, i);
1585
+
1586
+ int layer_idx = -1;
1587
+
1588
+ // split on '.'
1589
+ size_t dotpos = name.find('.');
1590
+ if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
1591
+ try {
1592
+ layer_idx = std::stoi(name.substr(dotpos + 1));
1593
+ } catch (...) {
1594
+ layer_idx = -1;
1595
+ }
1596
+ }
1597
+ if (layer_idx < 0) {
1598
+ LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
1599
+ result.n_embd = -1;
1600
+ break;
1601
+ } else if (layer_idx == 0) {
1602
+ LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
1603
+ result.n_embd = -1;
1604
+ break;
1605
+ }
1606
+
1607
+ struct lm_ggml_tensor * tensor = lm_ggml_get_tensor(ctx, name.c_str());
1608
+ if (tensor->type != LM_GGML_TYPE_F32) {
1609
+ LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
1610
+ result.n_embd = -1;
1611
+ break;
1612
+ }
1613
+ if (lm_ggml_n_dims(tensor) != 1) {
1614
+ LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
1615
+ result.n_embd = -1;
1616
+ break;
1617
+ }
1618
+
1619
+ if (result.n_embd == -1) {
1620
+ result.n_embd = lm_ggml_nelements(tensor);
1621
+ } else if (lm_ggml_nelements(tensor) != result.n_embd) {
1622
+ LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
1623
+ result.n_embd = -1;
1624
+ break;
1625
+ }
1626
+
1627
+ // extend if necessary - do not store data for layer 0 (it's not used)
1628
+ result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
1629
+
1630
+ const float * src = (const float *) tensor->data;
1631
+ float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
1632
+ for (int j = 0; j < result.n_embd; j++) {
1633
+ dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
1634
+ }
1635
+
1636
+ }
1637
+
1638
+ if (result.n_embd == -1) {
1639
+ LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
1640
+ result.data.clear();
1641
+ }
1642
+
1643
+ lm_gguf_free(ctx_gguf);
1644
+ lm_ggml_free(ctx);
1645
+
1646
+ return result;
1647
+ }
1648
+
1649
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
1650
+ common_control_vector_data result = { -1, {} };
1651
+
1652
+ for (const auto & info : load_infos) {
1653
+ auto cur = common_control_vector_load_one(info);
1654
+
1655
+ if (cur.n_embd == -1) {
1656
+ result.n_embd = -1;
1657
+ break;
1658
+ }
1659
+ if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
1660
+ LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
1661
+ result.n_embd = -1;
1662
+ break;
1663
+ }
1664
+
1665
+ if (result.n_embd == -1) {
1666
+ result = std::move(cur);
1667
+ } else {
1668
+ result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
1669
+ for (size_t i = 0; i < cur.data.size(); i++) {
1670
+ result.data[i] += cur.data[i];
1671
+ }
1672
+ }
1673
+ }
1674
+
1675
+ if (result.n_embd == -1) {
1676
+ LOG_ERR("%s: no valid control vector files passed\n", __func__);
1677
+ result.data.clear();
1678
+ }
1679
+
1680
+ return result;
1681
+ }
1682
+