@fugood/llama.node 1.4.8 → 1.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +43 -0
- package/lib/parallel.js +26 -0
- package/lib/parallel.ts +33 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +12 -14
- package/src/LlamaCompletionWorker.cpp +3 -1
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +16 -1
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -4
- package/src/llama.cpp/common/arg.cpp +159 -42
- package/src/llama.cpp/common/arg.h +10 -1
- package/src/llama.cpp/common/common.cpp +1 -1
- package/src/llama.cpp/common/common.h +6 -2
- package/src/llama.cpp/common/preset.cpp +197 -5
- package/src/llama.cpp/common/preset.h +45 -3
- package/src/llama.cpp/common/sampling.cpp +51 -37
- package/src/llama.cpp/common/sampling.h +6 -3
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/src/llama-arch.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
- package/src/llama.cpp/src/llama-model.cpp +7 -5
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama.cpp +22 -32
package/src/llama.cpp/src/llama-mmap.cpp

```diff
@@ -13,9 +13,10 @@
 #ifdef __has_include
     #if __has_include(<unistd.h>)
         #include <unistd.h>
+        #include <fcntl.h>
+        #include <sys/stat.h>
         #if defined(_POSIX_MAPPED_FILES)
             #include <sys/mman.h>
-            #include <fcntl.h>
         #endif
         #if defined(_POSIX_MEMLOCK_RANGE)
             #include <sys/resource.h>
@@ -74,7 +75,7 @@ struct llama_file::impl {
         return ret;
     }
 
-    impl(const char * fname, const char * mode) {
+    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -153,13 +154,40 @@ struct llama_file::impl {
        write_raw(&val, sizeof(val));
    }
 
+    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+        throw std::runtime_error("DirectIO is not implemented on Windows.");
+    }
+
    ~impl() {
        if (fp) {
            std::fclose(fp);
        }
    }
 #else
-    impl(const char * fname, const char * mode) {
+    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+#ifdef __linux__
+        // Try unbuffered I/O for read only
+        if (use_direct_io && std::strcmp(mode, "rb") == 0) {
+            fd = open(fname, O_RDONLY | O_DIRECT);
+
+            if (fd != -1) {
+                struct stat file_stats{};
+                fstat(fd, &file_stats);
+
+                size = file_stats.st_size;
+                alignment = file_stats.st_blksize;
+
+                off_t ret = lseek(fd, 0, SEEK_SET);
+                if (ret == -1) {
+                    throw std::runtime_error(format("seek error: %s", strerror(errno)));
+                }
+                return;
+            }
+
+            LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
+                    fname, strerror(errno));
+        }
+#endif
        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
```
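The Linux branch above is the heart of the change: the constructor first tries `open()` with `O_DIRECT` and falls back to buffered `ggml_fopen()` when the flag is unsupported or the open fails. Below is a minimal standalone sketch of the same open-with-fallback pattern; `open_model_file` and the `opened_file` struct are illustrative names, not part of the package.

```cpp
// Sketch only: try O_DIRECT first, fall back to buffered stdio (assumes Linux/glibc).
#define _GNU_SOURCE 1   // O_DIRECT is exposed by glibc headers under _GNU_SOURCE
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

struct opened_file {
    int    fd        = -1;      // set when the O_DIRECT open succeeded
    FILE * fp        = nullptr; // buffered fallback
    size_t size      = 0;
    size_t alignment = 1;       // block size that O_DIRECT reads must be aligned to
};

static opened_file open_model_file(const char * fname, bool use_direct_io) {
    opened_file f;
    if (use_direct_io) {
        f.fd = open(fname, O_RDONLY | O_DIRECT);
        if (f.fd != -1) {
            struct stat st{};
            if (fstat(f.fd, &st) == 0) {
                f.size      = (size_t) st.st_size;
                f.alignment = (size_t) st.st_blksize; // offsets, sizes and buffers must honour this
                return f;
            }
            close(f.fd);
            f.fd = -1;
        }
        std::fprintf(stderr, "O_DIRECT open of %s failed (%s), falling back to buffered I/O\n",
                     fname, std::strerror(errno));
    }
    f.fp = std::fopen(fname, "rb");
    return f;
}
```

On filesystems that reject `O_DIRECT` (for example some network or FUSE mounts), the fallback path keeps model loading working exactly as before.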
package/src/llama.cpp/src/llama-mmap.cpp (continued)

```diff
@@ -170,27 +198,30 @@ struct llama_file::impl {
     }
 
     size_t tell() const {
-        // TODO: this ifdef is never true?
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        if (ret == -1) {
-            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        if (fd == -1) {
+            long ret = std::ftell(fp);
+            if (ret == -1) {
+                throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+            }
+
+            return (size_t) ret;
         }
 
-        return (size_t) ret;
+        off_t pos = lseek(fd, 0, SEEK_CUR);
+        if (pos == -1) {
+            throw std::runtime_error(format("lseek error: %s", strerror(errno)));
+        }
+        return (size_t) pos;
     }
 
     void seek(size_t offset, int whence) const {
-        // TODO: this ifdef is never true?
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        if (ret != 0) {
+        off_t ret = 0;
+        if (fd == -1) {
+            ret = std::fseek(fp, (long) offset, whence);
+        } else {
+            ret = lseek(fd, offset, whence);
+        }
+        if (ret == -1) {
             throw std::runtime_error(format("seek error: %s", strerror(errno)));
         }
     }
@@ -200,13 +231,55 @@ struct llama_file::impl {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, len, 1, fp);
-        if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        if (fd == -1) {
+            std::size_t ret = std::fread(ptr, len, 1, fp);
+            if (ferror(fp)) {
+                throw std::runtime_error(format("read error: %s", strerror(errno)));
+            }
+            if (ret != 1) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+        } else {
+            bool successful = false;
+            while (!successful) {
+                off_t ret = read(fd, ptr, len);
+
+                if (ret == -1) {
+                    if (errno == EINTR) {
+                        continue; // Interrupted by signal, retry
+                    }
+                    throw std::runtime_error(format("read error: %s", strerror(errno)));
+                }
+                if (ret == 0) {
+                    throw std::runtime_error("unexpectedly reached end of file");
+                }
+
+                successful = true;
+            }
         }
-        if (ret != 1) {
-            throw std::runtime_error("unexpectedly reached end of file");
+    }
+
+    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+        off_t aligned_offset = offset & ~(alignment - 1);
+        off_t offset_from_alignment = offset - aligned_offset;
+        size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
+
+        void * raw_buffer = nullptr;
+        int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
+        if (ret != 0) {
+            throw std::runtime_error(format("posix_memalign failed with error %d", ret));
         }
+
+        struct aligned_buffer_deleter {
+            void operator()(void * p) const { free(p); }
+        };
+        std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
+
+        seek(aligned_offset, SEEK_SET);
+        read_raw(buffer.get(), bytes_to_read);
+
+        uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
+        memcpy(dest, reinterpret_cast<void *>(actual_data), size);
     }
 
     uint32_t read_u32() const {
```
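`read_aligned_chunk()` exists because `O_DIRECT` requires the file offset, the transfer size, and the destination buffer address to be multiples of the device block size, so an arbitrary `(offset, size)` request has to be widened to aligned boundaries and the wanted bytes copied back out. A small self-contained check of that rounding arithmetic (the helper names and numbers are illustrative, not from the patch):

```cpp
#include <cassert>
#include <cstddef>

// Round an offset down / a length up to a power-of-two alignment.
static inline size_t align_down(size_t x, size_t a) { return x & ~(a - 1); }
static inline size_t align_up  (size_t x, size_t a) { return (x + a - 1) & ~(a - 1); }

int main() {
    const size_t alignment = 4096;   // typical st_blksize
    const size_t offset    = 10000;  // where the requested data starts
    const size_t size      = 5000;   // how much is actually wanted

    const size_t aligned_offset        = align_down(offset, alignment);                 // 8192
    const size_t offset_from_alignment = offset - aligned_offset;                       // 1808
    const size_t bytes_to_read         = align_up(offset_from_alignment + size, alignment); // 8192

    assert(aligned_offset == 8192);
    assert(offset_from_alignment == 1808);
    assert(bytes_to_read == 8192);
    // The O_DIRECT read covers file range [8192, 16384); the caller then copies
    // bytes [1808, 1808 + 5000) out of the aligned buffer to satisfy the request.
    return 0;
}
```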
package/src/llama.cpp/src/llama-mmap.cpp (continued)

```diff
@@ -231,22 +304,43 @@ struct llama_file::impl {
     }
 
     ~impl() {
-        if (fp) {
+        if (fd != -1) {
+            close(fd);
+        } else {
             std::fclose(fp);
         }
     }
+    int fd = -1;
 #endif
 
-    FILE * fp;
-    size_t size;
+    void read_raw_at(void * ptr, size_t len, size_t offset) const {
+        if (alignment != 1) {
+            read_aligned_chunk(offset, ptr, len);
+        } else {
+            seek(offset, SEEK_SET);
+            read_raw(ptr, len);
+        }
+    }
+
+    size_t read_alignment() const {
+        return alignment;
+    }
+
+    size_t alignment = 1;
+
+    FILE * fp{};
+    size_t size{};
 };
 
-llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
+llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
+    pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
 llama_file::~llama_file() = default;
 
 size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }
 
+size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+
 int llama_file::file_id() const {
 #ifdef _WIN32
     return _fileno(pimpl->fp);
@@ -261,6 +355,7 @@ int llama_file::file_id() const {
 
 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
 void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
+void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
 
 uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
 
```
package/src/llama.cpp/src/llama-mmap.h

```diff
@@ -3,6 +3,7 @@
 #include <cstdint>
 #include <memory>
 #include <vector>
+#include <cstdio>
 
 struct llama_file;
 struct llama_mmap;
@@ -13,7 +14,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
 struct llama_file {
-    llama_file(const char * fname, const char * mode);
+    llama_file(const char * fname, const char * mode, bool use_direct_io = false);
     ~llama_file();
 
     size_t tell() const;
@@ -24,11 +25,14 @@ struct llama_file {
     void seek(size_t offset, int whence) const;
 
     void read_raw(void * ptr, size_t len) const;
+    void read_raw_at(void * ptr, size_t len, size_t offset) const;
+    void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
     uint32_t read_u32() const;
 
     void write_raw(const void * ptr, size_t len) const;
     void write_u32(uint32_t val) const;
 
+    size_t read_alignment() const;
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
```
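For callers inside the library, the additions boil down to: ask the file for its block alignment, and prefer the offset-based `read_raw_at()` over the old seek-then-read sequence. A hypothetical usage sketch follows; `llama_file` is an internal class and the file name, offset, and size below are made up, so this is illustrative rather than public API.

```cpp
#include "llama-mmap.h"   // internal header, shown for illustration only

#include <cstdint>
#include <vector>

void load_one_tensor_example() {
    // use_direct_io is passed as !use_mmap by the model loader (see below).
    llama_file file("model-00001-of-00002.gguf", "rb", /*use_direct_io =*/ true);

    const size_t tensor_offs = 123456;  // made-up offset of a tensor inside the file
    const size_t n_size      = 4096;    // made-up tensor size in bytes
    std::vector<uint8_t> buf(n_size);

    // read_raw_at() works for both paths: with O_DIRECT it rounds the request to
    // block boundaries internally, otherwise it degrades to seek + read_raw.
    file.read_raw_at(buf.data(), buf.size(), tensor_offs);

    // read_alignment() reports the block size (st_blksize) when O_DIRECT is active,
    // or 1 for buffered files; the model loader uses it to size its staging buffers.
    const size_t alignment = file.read_alignment();
    (void) alignment;
}
```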
package/src/llama.cpp/src/llama-model-loader.cpp

```diff
@@ -504,7 +504,7 @@ llama_model_loader::llama_model_loader(
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
     llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-    files.emplace_back(new llama_file(fname.c_str(), "rb"));
+    files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
     contexts.emplace_back(ctx);
 
     // Save tensors data offset of the main file.
@@ -572,7 +572,7 @@ llama_model_loader::llama_model_loader(
             }
         }
 
-        files.emplace_back(new llama_file(fname_split, "rb"));
+        files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
         contexts.emplace_back(ctx);
 
         // Save tensors data offset info of the shard.
@@ -935,7 +935,15 @@ bool llama_model_loader::load_all_data(
     // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
     // NVMe raid configurations might require more / larger buffers.
     constexpr size_t n_buffers = 4;
-    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+    size_t alignment = 1;
+    for (const auto & file : files) {
+        alignment = std::max(file->read_alignment(), alignment);
+    }
+
+    // Buffer size: balance between memory usage and I/O efficiency
+    // 64MB works well for NVMe drives
+    const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
 
     std::vector<ggml_backend_buffer_t> host_buffers;
     std::vector<ggml_backend_event_t> events;
```
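The staging-buffer sizing now depends on whether any shard was opened with direct I/O: the buffered path keeps the old 1 MiB buffers, while the O_DIRECT path asks for 64 MiB plus two alignments of slack, presumably so a block-aligned pointer inside the pinned buffer still leaves room for a full aligned chunk. A standalone evaluation of the same expression (the alignment values are illustrative):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    const size_t alignments[] = {1, 512, 4096};
    for (size_t alignment : alignments) {
        // Same expression as the patch: 64 MiB + 2 * alignment when any file was
        // opened with O_DIRECT, otherwise the previous 1 MiB default.
        const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment
                                                  : 1 * 1024 * 1024;
        std::printf("alignment=%zu -> buffer_size=%zu bytes\n", alignment, buffer_size);
    }
    return 0;
}
```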
package/src/llama.cpp/src/llama-model-loader.cpp (continued)

```diff
@@ -985,6 +993,7 @@ bool llama_model_loader::load_all_data(
     // If the backend is supported, create pinned memory buffers and events for synchronisation.
     for (size_t idx = 0; idx < n_buffers; ++idx) {
         auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+
         if (!buf) {
             LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                 ggml_backend_dev_name(dev));
@@ -1066,9 +1075,9 @@ bool llama_model_loader::load_all_data(
             }
         } else {
             const auto & file = files.at(weight->idx);
+
             if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->seek(weight->offs, SEEK_SET);
-                file->read_raw(cur->data, n_size);
+                file->read_raw_at(cur->data, n_size, weight->offs);
                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1077,26 +1086,60 @@ bool llama_model_loader::load_all_data(
             } else {
                 // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                 if (upload_backend) {
-                    file->seek(weight->offs, SEEK_SET);
+                    size_t offset = weight->offs;
+                    alignment = file->read_alignment();
+                    size_t aligned_offset = offset & ~(alignment - 1);
+                    size_t offset_from_alignment = offset - aligned_offset;
+                    file->seek(aligned_offset, SEEK_SET);
+
+                    // Calculate aligned read boundaries
+                    size_t read_start = aligned_offset;
+                    size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
 
                     size_t bytes_read = 0;
+                    size_t data_read = 0; // Actual tensor data copied (excluding padding)
+
+                    while (bytes_read < read_end - read_start) {
+                        size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
 
-                    while (bytes_read < n_size) {
-                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+                        // Align the destination pointer within the pinned buffer
+                        uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
 
+                        // Wait for previous upload to complete before reusing buffer
                         ggml_backend_event_synchronize(events[buffer_idx]);
-                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+
+                        // Read aligned chunk from file
+                        file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+
+                        // Calculate actual data portion (excluding alignment padding)
+                        uintptr_t ptr_data = ptr_dest_aligned;
+                        size_t data_to_copy = read_size;
+
+                        // Skip alignment padding at start of first chunk
+                        if (bytes_read == 0) {
+                            ptr_data += offset_from_alignment;
+                            data_to_copy -= offset_from_alignment;
+                        }
+
+                        // Trim alignment padding at end of last chunk
+                        if (aligned_offset + bytes_read + read_size > offset + n_size) {
+                            data_to_copy -= (read_end - (offset + n_size));
+                        }
+
+                        // Async upload actual data to GPU
+                        ggml_backend_tensor_set_async(upload_backend, cur,
+                            reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
                         ggml_backend_event_record(events[buffer_idx], upload_backend);
 
-                        bytes_read += read_iteration;
+                        data_read += data_to_copy;
+                        bytes_read += read_size;
+
                         ++buffer_idx;
                         buffer_idx %= n_buffers;
                     }
                 } else {
                     read_buf.resize(n_size);
-                    file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(), n_size);
+                    file->read_raw_at(read_buf.data(), n_size, weight->offs);
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
```
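The chunked upload loop reads aligned ranges but must upload only the tensor bytes, so the first chunk skips the padding before `offset` and the last chunk trims the padding after `offset + n_size`. The following self-contained simulation of that arithmetic (no real I/O; the sizes are made up, and the buffer is kept a multiple of the alignment like the real one) confirms that exactly `n_size` bytes end up uploaded:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

int main() {
    const size_t alignment   = 4096;
    const size_t offset      = 10000;                 // tensor offset in the file
    const size_t n_size      = 70000;                 // tensor size in bytes
    const size_t buffer_size = 16 * 1024 + 2 * alignment; // small, to force several chunks

    const size_t aligned_offset        = offset & ~(alignment - 1);                         // 8192
    const size_t offset_from_alignment = offset - aligned_offset;                           // 1808
    const size_t read_start            = aligned_offset;
    const size_t read_end              = (offset + n_size + alignment - 1) & ~(alignment - 1); // 81920

    size_t bytes_read = 0;
    size_t data_read  = 0;
    while (bytes_read < read_end - read_start) {
        const size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read);

        size_t data_to_copy = read_size;
        if (bytes_read == 0) {
            data_to_copy -= offset_from_alignment;        // skip padding before the tensor
        }
        if (aligned_offset + bytes_read + read_size > offset + n_size) {
            data_to_copy -= read_end - (offset + n_size); // trim padding after the tensor
        }

        data_read  += data_to_copy;
        bytes_read += read_size;
    }
    assert(bytes_read == read_end - read_start);
    assert(data_read == n_size);  // exactly the tensor bytes, no padding
    return 0;
}
```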
package/src/llama.cpp/src/llama-model.cpp

```diff
@@ -2378,10 +2378,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     if (cpu_dev == nullptr) {
         throw std::runtime_error(format("%s: no CPU backend found", __func__));
     }
-    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
-    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int) n_layer + 1);
+    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
+        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
             LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
@@ -6693,10 +6693,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         if (llama_supports_gpu_offload()) {
             const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-            LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
-            if (n_gpu_layers > (int) hparams.n_layer) {
+            int n_repeating = n_gpu;
+            if (n_repeating > 0) {
                 LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+                n_repeating--;
             }
+            LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
 
             const int max_backend_supported_layers = hparams.n_layer + 1;
             const int max_offloadable_layers = hparams.n_layer + 1;
```
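Both hunks follow the same accounting change: the output layer is now counted as one of the `n_gpu_layers`, so a model with `n_layer` repeating blocks exposes `n_layer + 1` offloadable layers. A small illustration of which layer indices land on the GPU under the new `i_gpu_start` / `act_gpu_layers` formulas (simplified: it ignores the `devices.empty()` special case, and the numbers are made up):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int n_layer      = 32;   // repeating layers; index 32 stands for the output layer
    const int n_gpu_layers = 10;

    const int i_gpu_start    = std::max(n_layer + 1 - n_gpu_layers, 0);  // 23
    const int act_gpu_layers = std::min(n_gpu_layers, n_layer + 1);      // 10

    for (int il = 0; il <= n_layer; ++il) {
        const bool on_gpu = il >= i_gpu_start && (il - i_gpu_start) < act_gpu_layers;
        if (on_gpu) {
            std::printf("layer %2d -> GPU%s\n", il, il == n_layer ? " (output layer)" : "");
        }
    }
    // Prints layers 23..31 (9 repeating layers) plus layer 32, the output layer:
    // 10 "layers" offloaded in total, matching n_gpu_layers.
    return 0;
}
```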
package/src/llama.cpp/src/llama-sampling.cpp

```diff
@@ -362,23 +362,39 @@ const char * llama_sampler_name(const struct llama_sampler * smpl) {
 }
 
 void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
+    if (!smpl) {
+        return;
+    }
+
     if (smpl->iface->accept) {
         smpl->iface->accept(smpl, token);
     }
 }
 
 void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) {
+    if (!smpl) {
+        return;
+    }
+
     GGML_ASSERT(smpl->iface->apply);
     smpl->iface->apply(smpl, cur_p);
 }
 
 void llama_sampler_reset(struct llama_sampler * smpl) {
+    if (!smpl) {
+        return;
+    }
+
     if (smpl->iface->reset) {
         smpl->iface->reset(smpl);
     }
 }
 
 struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
+    if (!smpl) {
+        return nullptr;
+    }
+
     if (smpl->iface->clone) {
         return smpl->iface->clone(smpl);
     }
```
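The new guards make these sampler entry points tolerate a null sampler instead of dereferencing it, with `llama_sampler_clone(nullptr)` returning `nullptr`. A minimal illustration against the public llama.h API (the functions called are real API functions; the scenario itself is contrived):

```cpp
#include "llama.h"

// Illustrative only: with the added guards, passing a null sampler to these
// entry points is a no-op rather than a crash, and clone of null yields null.
void drop_in_sampler_calls() {
    struct llama_sampler * smpl = nullptr;

    llama_sampler_accept(smpl, /*token =*/ 0);          // no-op
    llama_sampler_reset(smpl);                          // no-op
    struct llama_sampler * copy = llama_sampler_clone(smpl);  // copy == nullptr
    (void) copy;
}
```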
package/src/llama.cpp/src/llama.cpp

```diff
@@ -292,10 +292,6 @@ static void llama_params_fit_impl(
         if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
             throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
         }
-        if (hp_ngl < 2*nd) {
-            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
-                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
-        }
     }
     if (!tensor_buft_overrides) {
         throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
@@ -362,8 +358,7 @@ static void llama_params_fit_impl(
     auto set_ngl_tensor_split_tbo = [&](
             const std::vector<ngl_t> & ngl_per_device,
             const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            llama_model_params & mparams,
-            const bool add_nonrepeating) {
+            llama_model_params & mparams) {
         mparams.n_gpu_layers = 0;
         for (size_t id = 0; id < nd; id++) {
             mparams.n_gpu_layers += ngl_per_device[id].n_layer;
@@ -371,13 +366,9 @@ static void llama_params_fit_impl(
                 tensor_split[id] = ngl_per_device[id].n_layer;
             }
         }
-        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
-        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
+        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
+        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
 
-        if (add_nonrepeating) {
-            mparams.n_gpu_layers += 1;
-            tensor_split[nd - 1] += 1;
-        }
         mparams.tensor_split = tensor_split;
 
         size_t itbo = 0;
@@ -408,10 +399,9 @@ static void llama_params_fit_impl(
     auto get_memory_for_layers = [&](
             const char * func_name,
             const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            const bool add_nonrepeating) -> std::vector<int64_t> {
+            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
         llama_model_params mparams_copy = *mparams;
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
 
         const dmds_t dmd_nl = llama_get_device_memory_data(
             path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -469,9 +459,6 @@ static void llama_params_fit_impl(
         LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
     }
 
-    // whether for the optimal memory use we expect to load at least some MoE tensors:
-    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
-
     std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
     overflow_bufts.reserve(nd);
     for (size_t id = 0; id < nd - 1; ++id) {
@@ -480,7 +467,7 @@ static void llama_params_fit_impl(
     overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
 
     std::vector<ngl_t> ngl_per_device(nd);
-    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts
+    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
     if (hp_nex > 0) {
         for (size_t id = 0; id < nd; id++) {
             ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
@@ -493,13 +480,14 @@ static void llama_params_fit_impl(
     // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
     // - check memory use of our guess, replace either the low or high bound
     // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
+    // - the last device has the output layer, which cannot be a partial layer
     if (hp_nex == 0) {
         LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
     } else {
         LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
     }
     for (int id = nd - 1; id >= 0; id--) {
-        uint32_t n_unassigned = hp_ngl;
+        uint32_t n_unassigned = hp_ngl + 1;
         for (size_t jd = id + 1; jd < nd; ++jd) {
             assert(n_unassigned >= ngl_per_device[jd].n_layer);
             n_unassigned -= ngl_per_device[jd].n_layer;
@@ -508,10 +496,10 @@ static void llama_params_fit_impl(
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         ngl_per_device_high[id].n_layer = n_unassigned;
         if (hp_nex > 0) {
-            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
+            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
         }
         if (ngl_per_device_high[id].n_layer > 0) {
-            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts
+            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
             if (mem_high[id] > targets[id]) {
                 assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                 uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
@@ -526,7 +514,7 @@ static void llama_params_fit_impl(
                     if (hp_nex) {
                         ngl_per_device_test[id].n_part += step_size;
                     }
-                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts
+                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
 
                     if (mem_test[id] <= targets[id]) {
                         ngl_per_device = ngl_per_device_test;
@@ -542,6 +530,7 @@ static void llama_params_fit_impl(
             } else {
                 assert(ngl_per_device_high[id].n_layer == n_unassigned);
                 ngl_per_device = ngl_per_device_high;
+                mem = mem_high;
                 LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
             }
         }
@@ -552,7 +541,7 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
     }
     if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
         return;
     }
 
@@ -575,13 +564,13 @@ static void llama_params_fit_impl(
     for (size_t id = 0; id <= id_dense_start; id++) {
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         for (size_t jd = id_dense_start; jd < nd; jd++) {
-            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
+            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
             ngl_per_device_high[id].n_layer += n_layer_move;
             ngl_per_device_high[jd].n_layer -= n_layer_move;
             ngl_per_device_high[jd].n_part = 0;
         }
         size_t id_dense_start_high = nd - 1;
-        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts
+        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
 
         if (mem_high[id] > targets[id]) {
             assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
@@ -609,7 +598,7 @@ static void llama_params_fit_impl(
                         break;
                     }
                 }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts
+                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
 
                 if (mem_test[id] <= targets[id]) {
                     ngl_per_device = ngl_per_device_test;
@@ -629,13 +618,14 @@ static void llama_params_fit_impl(
             }
         } else {
             ngl_per_device = ngl_per_device_high;
+            mem = mem_high;
             id_dense_start = id_dense_start_high;
             LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                 __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
         }
 
         // try to fit at least part of one more layer
-        if (ngl_per_device[id_dense_start].n_layer > 0) {
+        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
             std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
             size_t id_dense_start_test = id_dense_start;
             ngl_per_device_test[id_dense_start_test].n_layer--;
@@ -647,7 +637,7 @@ static void llama_params_fit_impl(
             }
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts
+            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
             if (mem_test[id] < targets[id]) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
@@ -657,7 +647,7 @@ static void llama_params_fit_impl(
 
                 ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                 LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
                 if (mem_test[id] < targets[id]) {
                     ngl_per_device = ngl_per_device_test;
                     mem = mem_test;
@@ -668,7 +658,7 @@ static void llama_params_fit_impl(
                 } else {
                     ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                     LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                    mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts
+                    mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
                     if (mem_test[id] < targets[id]) {
                         ngl_per_device = ngl_per_device_test;
                         mem = mem_test;
@@ -685,7 +675,7 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
     }
 
-    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams
+    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
 }
 
 bool llama_params_fit(
```
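The `llama_params_fit` changes follow the same theme: the `add_nonrepeating` flag is gone, the output layer is folded into the per-device layer counts (`hp_ngl + 1` assignable layers), and the last device always keeps it as a whole, never partial, layer. A rough sketch of the visible bookkeeping in `set_ngl_tensor_split_tbo` after the change (`ngl_t` here is a stand-in for the real struct and all values are made up):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct ngl_t { uint32_t n_layer = 0; };

int main() {
    const uint32_t hp_ngl = 48;                  // repeating layers reported by the model
    const ngl_t ngl_per_device[] = {{20}, {29}}; // the 29 on the last device includes the output layer

    uint32_t n_gpu_layers = 0;
    float tensor_split[2] = {0.0f, 0.0f};
    for (size_t id = 0; id < 2; ++id) {
        n_gpu_layers     += ngl_per_device[id].n_layer;
        tensor_split[id]  = (float) ngl_per_device[id].n_layer;
    }

    // Same invariant as the updated assert: at most hp_ngl + 1 layers in total,
    // and il0 is the start index for tensor buft overrides.
    const uint32_t il0 = hp_ngl + 1 - n_gpu_layers;
    std::printf("n_gpu_layers=%u, il0=%u, tensor_split={%g, %g}\n",
                n_gpu_layers, il0, tensor_split[0], tensor_split[1]);
    return 0;
}
```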
|