@fugood/llama.node 1.4.13 → 1.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +11 -1
- package/lib/index.js +2 -1
- package/lib/index.ts +2 -0
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +2 -2
- package/src/LlamaContext.cpp +5 -2
- package/src/llama.cpp/common/arg.cpp +150 -56
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/common.cpp +2 -1
- package/src/llama.cpp/common/common.h +10 -7
- package/src/llama.cpp/common/download.cpp +104 -55
- package/src/llama.cpp/common/download.h +26 -5
- package/src/llama.cpp/common/preset.cpp +76 -1
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +5 -2
- package/src/llama.cpp/src/llama-mmap.cpp +70 -37
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +15 -5
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +63 -27
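
The bulk of this release is a vendored llama.cpp update. Three changes dominate the C++ hunks below: a refactored direct I/O (O_DIRECT) read path in llama-mmap.cpp with a buffered fallback, threaded through the model loader as a new use_direct_io flag; a host-memory fallback for backend devices that report zero free/total memory; and llama_params_fit switching from a single free-memory margin to one margin per device. As a rough sketch of how the new model flag might be toggled from the C API (the field name is taken from the llama_model_default_params hunk below; check this package's llama.h for the exact declaration):

```cpp
#include "llama.h"

int main() {
    llama_model_params params = llama_model_default_params();
    // assumed field, mirroring the /*.use_direct_io =*/ true default added below;
    // disabling it keeps the mmap-based loading path available
    params.use_direct_io = false;
    llama_model * model = llama_model_load_from_file("model.gguf", params);
    if (model == NULL) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}
```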
package/src/llama.cpp/src/llama-mmap.cpp:

```diff
@@ -110,7 +110,7 @@ struct llama_file::impl {
         }
     }
 
-    void read_raw(void * ptr, size_t len)
+    void read_raw(void * ptr, size_t len) {
         size_t bytes_read = 0;
         while (bytes_read < len) {
             size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +127,7 @@ struct llama_file::impl {
         }
     }
 
-    uint32_t read_u32()
+    uint32_t read_u32() {
         uint32_t val;
         read_raw(&val, sizeof(val));
         return val;
@@ -154,8 +154,8 @@ struct llama_file::impl {
         write_raw(&val, sizeof(val));
     }
 
-
-
+    bool has_direct_io() const {
+        return true;
     }
 
     ~impl() {
@@ -164,33 +164,45 @@ struct llama_file::impl {
         }
     }
 #else
-    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
 #ifdef __linux__
         // Try unbuffered I/O for read only
         if (use_direct_io && std::strcmp(mode, "rb") == 0) {
-
+            if (init_fd()) {
+                return;
+            }
+            LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
+                           fname, strerror(errno));
+        }
+#endif
+        init_fp(mode);
+    }
 
-
-
-
+#ifdef __linux__
+    bool init_fd() {
+        fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
 
-
-
+        if (fd != -1) {
+            struct stat file_stats{};
+            fstat(fd, &file_stats);
 
-
-
-                throw std::runtime_error(format("seek error: %s", strerror(errno)));
-            }
-            return;
-        }
+            size = file_stats.st_size;
+            alignment = file_stats.st_blksize;
 
-
-
+            off_t ret = lseek(fd, 0, SEEK_SET);
+            if (ret == -1) {
+                throw std::runtime_error(format("seek error: %s", strerror(errno)));
+            }
+            return true;
         }
+        return false;
+    }
 #endif
-
+
+    void init_fp(const char * mode) {
+        fp = ggml_fopen(fname.c_str(), mode);
         if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+            throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -226,7 +238,7 @@ struct llama_file::impl {
         }
     }
 
-    void
+    void read_raw_unsafe(void * ptr, size_t len) {
         if (len == 0) {
             return;
         }
@@ -249,6 +261,17 @@ struct llama_file::impl {
            if (errno == EINTR) {
                continue; // Interrupted by signal, retry
            }
+            // Fallback to std::fread in case the DMA controller cannot access the buffer
+            if (errno == EFAULT) {
+                auto curr_off = tell();
+                close(fd);
+                fd = -1;
+                alignment = 1;
+                init_fp("rb");
+                seek(curr_off, SEEK_SET);
+                read_raw_unsafe(ptr, len);
+                return;
+            }
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret == 0) {
```
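
The EFAULT branch added above covers destination buffers that the kernel's direct-I/O path cannot reach (the in-code comment blames the DMA controller): the reader tears down the O_DIRECT descriptor, reopens the file buffered, seeks back to the same logical offset, and retries. A minimal standalone restatement of that recovery pattern, with illustrative names rather than the package's actual helpers:

```cpp
#include <cerrno>
#include <cstdio>
#include <fcntl.h>
#include <unistd.h>

// read once via the O_DIRECT fd; on EFAULT, permanently downgrade to stdio
ssize_t read_with_fallback(int & fd, FILE *& fp, const char * path,
                           void * dst, size_t len, off_t off) {
    ssize_t ret = pread(fd, dst, len, off);
    if (ret == -1 && errno == EFAULT) {
        close(fd);
        fd = -1;                 // direct I/O stays off from here on
        fp = std::fopen(path, "rb");
        if (fp == NULL || std::fseek(fp, (long) off, SEEK_SET) != 0) {
            return -1;
        }
        return (ssize_t) std::fread(dst, 1, len, fp);
    }
    return ret;
}
```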
package/src/llama.cpp/src/llama-mmap.cpp (continued):

```diff
@@ -266,7 +289,8 @@ struct llama_file::impl {
         }
     }
 
-    void read_aligned_chunk(
+    void read_aligned_chunk(void * dest, size_t size) {
+        size_t offset = tell();
         off_t aligned_offset = offset & ~(alignment - 1);
         off_t offset_from_alignment = offset - aligned_offset;
         size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
@@ -283,13 +307,21 @@ struct llama_file::impl {
         std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
 
         seek(aligned_offset, SEEK_SET);
-
+        read_raw_unsafe(buffer.get(), bytes_to_read);
 
         uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
         memcpy(dest, reinterpret_cast<void *>(actual_data), size);
     }
 
-
+    void read_raw(void * ptr, size_t len) {
+        if (has_direct_io()) {
+            read_aligned_chunk(ptr, len);
+        } else {
+            read_raw_unsafe(ptr, len);
+        }
+    }
+
+    uint32_t read_u32() {
         uint32_t ret;
         read_raw(&ret, sizeof(ret));
         return ret;
```
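
read_aligned_chunk exists because O_DIRECT requires file offsets and transfer sizes to be multiples of the device block size (the st_blksize captured by init_fd): the requested window is widened down to an aligned offset and up to an aligned length, read into an aligned scratch buffer, and only the caller's slice is copied out. The arithmetic in isolation, with made-up numbers:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    const size_t alignment = 4096;  // typical st_blksize
    const size_t offset    = 10000; // requested file offset
    const size_t size      = 500;   // requested byte count

    const size_t aligned_offset        = offset & ~(alignment - 1);        // 8192
    const size_t offset_from_alignment = offset - aligned_offset;          // 1808
    const size_t bytes_to_read =
        (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); // 4096

    std::printf("read %zu bytes at offset %zu, then copy %zu bytes from position %zu\n",
                bytes_to_read, aligned_offset, size, offset_from_alignment);
    return 0;
}
```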
package/src/llama.cpp/src/llama-mmap.cpp (continued):

```diff
@@ -310,6 +342,10 @@ struct llama_file::impl {
         write_raw(&val, sizeof(val));
     }
 
+    bool has_direct_io() const {
+        return fd != -1 && alignment > 1;
+    }
+
     ~impl() {
         if (fd != -1) {
             close(fd);
@@ -318,17 +354,9 @@ struct llama_file::impl {
         }
     }
     int fd = -1;
+    std::string fname;
 #endif
 
-    void read_raw_at(void * ptr, size_t len, size_t offset) const {
-        if (alignment != 1) {
-            read_aligned_chunk(offset, ptr, len);
-        } else {
-            seek(offset, SEEK_SET);
-            read_raw(ptr, len);
-        }
-    }
-
     size_t read_alignment() const {
         return alignment;
     }
@@ -347,6 +375,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }
 
 size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
 
 int llama_file::file_id() const {
 #ifdef _WIN32
@@ -361,10 +390,14 @@ int llama_file::file_id() const {
 }
 
 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
-void llama_file::read_raw(void * ptr, size_t len)
-
+void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#ifdef _WIN32
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#else
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
+#endif
 
-uint32_t llama_file::read_u32()
+uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
 
 void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
 void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
```
package/src/llama.cpp/src/llama-mmap.h:

```diff
@@ -24,15 +24,16 @@ struct llama_file {
 
     void seek(size_t offset, int whence) const;
 
-    void read_raw(void * ptr, size_t len)
-    void
-    void read_aligned_chunk(
-    uint32_t read_u32()
+    void read_raw(void * ptr, size_t len);
+    void read_raw_unsafe(void * ptr, size_t len);
+    void read_aligned_chunk(void * dest, size_t size);
+    uint32_t read_u32();
 
     void write_raw(const void * ptr, size_t len) const;
     void write_u32(uint32_t val) const;
 
     size_t read_alignment() const;
+    bool has_direct_io() const;
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
```
package/src/llama.cpp/src/llama-model-loader.cpp:

```diff
@@ -495,6 +495,7 @@ llama_model_loader::llama_model_loader(
         const std::string & fname,
         std::vector<std::string> & splits,
         bool use_mmap,
+        bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
@@ -527,9 +528,17 @@ llama_model_loader::llama_model_loader(
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
     llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-    files.emplace_back(new llama_file(fname.c_str(), "rb",
+    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
     contexts.emplace_back(ctx);
 
+    use_direct_io = use_direct_io && files.back()->has_direct_io();
+
+    // Disable mmap in case Direct I/O is enabled and available
+    if (use_direct_io && use_mmap) {
+        use_mmap = false;
+        LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+    }
+
     // Save tensors data offset of the main file.
     // For subsidiary files, `meta` tensor data offset must not be used,
     // so we build a unified tensors index for weights.
@@ -595,7 +604,7 @@ llama_model_loader::llama_model_loader(
             }
         }
 
-        files.emplace_back(new llama_file(fname_split, "rb",
+        files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
         contexts.emplace_back(ctx);
 
         // Save tensors data offset info of the shard.
@@ -739,6 +748,7 @@ llama_model_loader::llama_model_loader(
     }
 
     this->use_mmap = use_mmap;
+    this->use_direct_io = use_direct_io;
     this->check_tensors = check_tensors;
     this->no_alloc = no_alloc;
 }
@@ -1100,7 +1110,8 @@ bool llama_model_loader::load_all_data(
             const auto & file = files.at(weight->idx);
 
             if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(cur->data, n_size);
                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1132,7 +1143,7 @@ bool llama_model_loader::load_all_data(
                     ggml_backend_event_synchronize(events[buffer_idx]);
 
                     // Read aligned chunk from file
-                    file->
+                    file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
 
                     // Calculate actual data portion (excluding alignment padding)
                     uintptr_t ptr_data = ptr_dest_aligned;
@@ -1162,7 +1173,8 @@ bool llama_model_loader::load_all_data(
                 }
             } else {
                 read_buf.resize(n_size);
-                file->
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(read_buf.data(), n_size);
                 ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                 if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                     throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
```
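
The loader probes the flag once against the first file: if O_DIRECT did not actually take effect (non-Linux builds, open() failure, or the EFAULT downgrade), use_direct_io is dropped; if it did, mmap is switched off, since the two load paths are mutually exclusive. A toy restatement of that policy with illustrative types:

```cpp
#include <cstdio>

struct probe_result {
    bool direct_io_active; // what llama_file::has_direct_io() reports for the first file
};

void decide_io_mode(probe_result first_file, bool & use_direct_io, bool & use_mmap) {
    use_direct_io = use_direct_io && first_file.direct_io_active;
    if (use_direct_io && use_mmap) {
        use_mmap = false; // a memory map would bypass the O_DIRECT descriptor entirely
        std::fprintf(stderr, "direct I/O is enabled, disabling mmap\n");
    }
}
```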
package/src/llama.cpp/src/llama-model-loader.h:

```diff
@@ -70,6 +70,7 @@ struct llama_model_loader {
     size_t n_bytes = 0;
 
     bool use_mmap = false;
+    bool use_direct_io = false;
     bool check_tensors;
     bool no_alloc;
 
@@ -97,6 +98,7 @@ struct llama_model_loader {
         const std::string & fname,
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
+        bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
```
package/src/llama.cpp/src/llama-model.cpp:

```diff
@@ -2440,7 +2440,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     const bool use_mmap_buffer = true;
 
-    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n",
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
+        __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
     pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@@ -2451,6 +2452,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
     }
 
+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     // calculate the split points
     bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
     std::vector<float> splits(n_devices());
@@ -2461,6 +2467,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             size_t total;
             size_t free;
             ggml_backend_dev_memory(dev, &free, &total);
+
+            // devices can return 0 bytes for free and total memory if they do not
+            // have any to report. in this case, we will use the host memory as a fallback
+            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+            if (free == 0 && total == 0) {
+                ggml_backend_dev_memory(cpu_dev, &free, &total);
+            }
             splits[i] = free;
         }
     } else {
@@ -2477,10 +2490,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         splits[i] /= split_sum;
     }
 
-    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
     const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -7973,6 +7982,7 @@ llama_model_params llama_model_default_params() {
         /*.kv_overrides =*/ nullptr,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
+        /*.use_direct_io =*/ true,
         /*.use_mlock =*/ false,
         /*.check_tensors =*/ false,
         /*.use_extra_bufts =*/ true,
```
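
Both llama-model.cpp above and llama.cpp below gain the same guard: a backend device that reports 0 bytes free and 0 bytes total, which some backends do when they have nothing to report (ggml-org/llama.cpp#18577), now inherits the host's numbers so the tensor-split and fit logic still has usable inputs. Factored out, the pattern looks roughly like this (the helper name is ours; the ggml-backend calls are the ones used in the diff):

```cpp
#include "ggml-backend.h"
#include <stdexcept>

static void dev_memory_with_host_fallback(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_dev_memory(dev, free, total);
    if (*free == 0 && *total == 0) {
        // nothing reported: substitute the CPU device's memory figures
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error("no CPU backend found");
        }
        ggml_backend_dev_memory(cpu_dev, free, total);
    }
}
```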
package/src/llama.cpp/src/llama-quant.cpp:

```diff
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
```
package/src/llama.cpp/src/llama.cpp:

```diff
@@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         }
     }
     for (size_t i = 0; i < ret.size(); i++) {
-        size_t free
+        size_t free;
+        size_t total;
         ggml_backend_dev_memory(model->devices[i], &free, &total);
+
+        // devices can return 0 bytes for free and total memory if they do not
+        // have any to report. in this case, we will use the host memory as a fallback
+        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        if (free == 0 && total == 0) {
+            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (cpu_dev == nullptr) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
+            ggml_backend_dev_memory(cpu_dev, &free, &total);
+        }
         ret[i].free = free;
         ret[i].total = total;
     }
@@ -147,9 +159,8 @@ class llama_params_fit_exception : public std::runtime_error {
 static void llama_params_fit_impl(
     const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
     float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-    size_t
+    size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
     constexpr int64_t MiB = 1024*1024;
-    const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
     typedef std::vector<llama_device_memory_data> dmds_t;
     const llama_model_params default_mparams = llama_model_default_params();
 
@@ -168,6 +179,12 @@ static void llama_params_fit_impl(
         return;
     }
 
+    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
+    margins.reserve(nd);
+    for (size_t id = 0; id < nd; id++) {
+        margins.push_back(margins_s[id]);
+    }
+
     std::vector<std::string> dev_names;
     {
         dev_names.reserve(nd);
@@ -187,9 +204,10 @@ static void llama_params_fit_impl(
 
     int64_t sum_free = 0;
     int64_t sum_projected_free = 0;
-    int64_t min_projected_free = INT64_MAX;
     int64_t sum_projected_used = 0;
     int64_t sum_projected_model = 0;
+    std::vector<int64_t> projected_free_per_device;
+    projected_free_per_device.reserve(nd);
 
     if (nd > 1) {
         LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -199,45 +217,63 @@ static void llama_params_fit_impl(
 
         const int64_t projected_used = dmd.mb.total();
         const int64_t projected_free = dmd.free - projected_used;
+        projected_free_per_device.push_back(projected_free);
 
         sum_free += dmd.free;
         sum_projected_used += projected_used;
         sum_projected_free += projected_free;
-        min_projected_free = std::min(min_projected_free, projected_free);
         sum_projected_model += dmd.mb.model;
 
         if (nd > 1) {
-            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB,
-                projected_free >= 0 ? "surplus" : "deficit");
+            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
         }
     }
     assert(sum_free >= 0 && sum_projected_used >= 0);
     LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
         __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (
-    if (
+    if (nd == 1) {
+        if (projected_free_per_device[0] >= margins[0]) {
             LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__,
+                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+            return;
+        }
+    } else {
+        bool changes_needed = false;
+        for (size_t id = 0; id < nd; id++) {
+            if (projected_free_per_device[id] < margins[id]) {
+                changes_needed = true;
+                break;
+            }
+        }
+        if (!changes_needed) {
+            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
             return;
         }
-        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
-            __func__, min_projected_free/MiB, margin/MiB);
-        return;
     }
 
     // step 2: try reducing memory use by reducing the context size
 
     {
-        int64_t global_surplus = sum_projected_free
+        int64_t global_surplus = sum_projected_free;
+        for (size_t id = 0; id < nd; id++) {
+            global_surplus -= margins[id];
+        }
         if (global_surplus < 0) {
-
-            "%s: cannot
-
-
+            if (nd == 1) {
+                LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+                    __func__, margins[0]/MiB, -global_surplus/MiB);
+            } else {
+                LLAMA_LOG_INFO(
+                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
+                    __func__, -global_surplus/MiB);
+            }
             if (cparams->n_ctx == 0) {
                 if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free
+                    int64_t sum_used_target = sum_free;
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_used_target -= margins[id];
+                    }
                     if (nd > 1) {
                         // for multiple devices we need to be more conservative in terms of how much context we think can fit:
                         // - for dense models only whole layers can be assigned to devices
@@ -448,9 +484,9 @@ static void llama_params_fit_impl(
             const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
                 path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
 
-            for (
-                global_surplus_cpu_moe +=
-                global_surplus_cpu_moe -= int64_t(
+            for (size_t id = 0; id < nd; id++) {
+                global_surplus_cpu_moe += dmds_cpu_moe[id].free;
+                global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
             }
 
             if (global_surplus_cpu_moe > 0) {
@@ -469,7 +505,7 @@ static void llama_params_fit_impl(
     std::vector<int64_t> targets; // maximum acceptable memory use per device
     targets.reserve(nd);
     for (size_t id = 0; id < nd; id++) {
-        targets.push_back(dmds_full[id].free -
+        targets.push_back(dmds_full[id].free - margins[id]);
         LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
     }
 
@@ -701,11 +737,11 @@ enum llama_params_fit_status llama_params_fit(
 enum llama_params_fit_status llama_params_fit(
     const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
     float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-    size_t
+    size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
     const int64_t t0_us = llama_time_us();
     llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
     try {
-        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides,
+        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
         LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
     } catch (const llama_params_fit_exception & e) {
         LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
@@ -794,7 +830,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 
```
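
Finally, llama_params_fit's margin argument changes from a single size_t to a per-device array, so callers can reserve different headroom on each device. A hedged usage sketch with the signature inferred from the hunks above (verify against this package's llama.h before relying on it):

```cpp
#include "llama.h"
#include <vector>

bool fit_params(const char * path_model, llama_model_params & mparams,
                llama_context_params & cparams, size_t n_devices) {
    // one free-memory target (in bytes) per device, e.g. 1 GiB of headroom each
    std::vector<size_t> margins(n_devices, (size_t) 1024 * 1024 * 1024);
    enum llama_params_fit_status status = llama_params_fit(
        path_model, &mparams, &cparams,
        /*tensor_split          =*/ nullptr,
        /*tensor_buft_overrides =*/ nullptr,
        margins.data(),
        /*n_ctx_min =*/ 4096,
        GGML_LOG_LEVEL_INFO);
    return status == LLAMA_PARAMS_FIT_STATUS_SUCCESS;
}
```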