@fugood/llama.node 1.4.12 → 1.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +11 -1
- package/lib/index.js +2 -1
- package/lib/index.ts +2 -0
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaContext.cpp +5 -2
- package/src/llama.cpp/common/arg.cpp +249 -101
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/chat.cpp +4 -4
- package/src/llama.cpp/common/common.cpp +21 -1
- package/src/llama.cpp/common/common.h +20 -7
- package/src/llama.cpp/common/download.cpp +104 -55
- package/src/llama.cpp/common/download.h +26 -5
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/preset.cpp +76 -1
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +92 -10
- package/src/llama.cpp/src/llama-arch.cpp +2 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +615 -28
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +8 -2
- package/src/llama.cpp/src/llama-mmap.cpp +70 -37
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +66 -16
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1233 -171
- package/src/llama.cpp/src/llama-sampling.h +16 -7
- package/src/llama.cpp/src/llama.cpp +101 -57
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
|
@@ -110,7 +110,7 @@ struct llama_file::impl {
|
|
|
110
110
|
}
|
|
111
111
|
}
|
|
112
112
|
|
|
113
|
-
void read_raw(void * ptr, size_t len)
|
|
113
|
+
void read_raw(void * ptr, size_t len) {
|
|
114
114
|
size_t bytes_read = 0;
|
|
115
115
|
while (bytes_read < len) {
|
|
116
116
|
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
|
|
@@ -127,7 +127,7 @@ struct llama_file::impl {
|
|
|
127
127
|
}
|
|
128
128
|
}
|
|
129
129
|
|
|
130
|
-
uint32_t read_u32()
|
|
130
|
+
uint32_t read_u32() {
|
|
131
131
|
uint32_t val;
|
|
132
132
|
read_raw(&val, sizeof(val));
|
|
133
133
|
return val;
|
|
@@ -154,8 +154,8 @@ struct llama_file::impl {
|
|
|
154
154
|
write_raw(&val, sizeof(val));
|
|
155
155
|
}
|
|
156
156
|
|
|
157
|
-
|
|
158
|
-
|
|
157
|
+
bool has_direct_io() const {
|
|
158
|
+
return true;
|
|
159
159
|
}
|
|
160
160
|
|
|
161
161
|
~impl() {
|
|
@@ -164,33 +164,45 @@ struct llama_file::impl {
|
|
|
164
164
|
}
|
|
165
165
|
}
|
|
166
166
|
#else
|
|
167
|
-
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
|
|
167
|
+
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
|
|
168
168
|
#ifdef __linux__
|
|
169
169
|
// Try unbuffered I/O for read only
|
|
170
170
|
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
|
|
171
|
-
|
|
171
|
+
if (init_fd()) {
|
|
172
|
+
return;
|
|
173
|
+
}
|
|
174
|
+
LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
|
|
175
|
+
fname, strerror(errno));
|
|
176
|
+
}
|
|
177
|
+
#endif
|
|
178
|
+
init_fp(mode);
|
|
179
|
+
}
|
|
172
180
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
181
|
+
#ifdef __linux__
|
|
182
|
+
bool init_fd() {
|
|
183
|
+
fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
|
|
176
184
|
|
|
177
|
-
|
|
178
|
-
|
|
185
|
+
if (fd != -1) {
|
|
186
|
+
struct stat file_stats{};
|
|
187
|
+
fstat(fd, &file_stats);
|
|
179
188
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
|
183
|
-
}
|
|
184
|
-
return;
|
|
185
|
-
}
|
|
189
|
+
size = file_stats.st_size;
|
|
190
|
+
alignment = file_stats.st_blksize;
|
|
186
191
|
|
|
187
|
-
|
|
188
|
-
|
|
192
|
+
off_t ret = lseek(fd, 0, SEEK_SET);
|
|
193
|
+
if (ret == -1) {
|
|
194
|
+
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
|
195
|
+
}
|
|
196
|
+
return true;
|
|
189
197
|
}
|
|
198
|
+
return false;
|
|
199
|
+
}
|
|
190
200
|
#endif
|
|
191
|
-
|
|
201
|
+
|
|
202
|
+
void init_fp(const char * mode) {
|
|
203
|
+
fp = ggml_fopen(fname.c_str(), mode);
|
|
192
204
|
if (fp == NULL) {
|
|
193
|
-
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
|
205
|
+
throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
|
|
194
206
|
}
|
|
195
207
|
seek(0, SEEK_END);
|
|
196
208
|
size = tell();
|
|
@@ -226,7 +238,7 @@ struct llama_file::impl {
|
|
|
226
238
|
}
|
|
227
239
|
}
|
|
228
240
|
|
|
229
|
-
void
|
|
241
|
+
void read_raw_unsafe(void * ptr, size_t len) {
|
|
230
242
|
if (len == 0) {
|
|
231
243
|
return;
|
|
232
244
|
}
|
|
@@ -249,6 +261,17 @@ struct llama_file::impl {
|
|
|
249
261
|
if (errno == EINTR) {
|
|
250
262
|
continue; // Interrupted by signal, retry
|
|
251
263
|
}
|
|
264
|
+
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
|
265
|
+
if (errno == EFAULT) {
|
|
266
|
+
auto curr_off = tell();
|
|
267
|
+
close(fd);
|
|
268
|
+
fd = -1;
|
|
269
|
+
alignment = 1;
|
|
270
|
+
init_fp("rb");
|
|
271
|
+
seek(curr_off, SEEK_SET);
|
|
272
|
+
read_raw_unsafe(ptr, len);
|
|
273
|
+
return;
|
|
274
|
+
}
|
|
252
275
|
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
253
276
|
}
|
|
254
277
|
if (ret == 0) {
|
|
@@ -266,7 +289,8 @@ struct llama_file::impl {
|
|
|
266
289
|
}
|
|
267
290
|
}
|
|
268
291
|
|
|
269
|
-
void read_aligned_chunk(
|
|
292
|
+
void read_aligned_chunk(void * dest, size_t size) {
|
|
293
|
+
size_t offset = tell();
|
|
270
294
|
off_t aligned_offset = offset & ~(alignment - 1);
|
|
271
295
|
off_t offset_from_alignment = offset - aligned_offset;
|
|
272
296
|
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
|
|
@@ -283,13 +307,21 @@ struct llama_file::impl {
|
|
|
283
307
|
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
|
|
284
308
|
|
|
285
309
|
seek(aligned_offset, SEEK_SET);
|
|
286
|
-
|
|
310
|
+
read_raw_unsafe(buffer.get(), bytes_to_read);
|
|
287
311
|
|
|
288
312
|
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
|
|
289
313
|
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
|
|
290
314
|
}
|
|
291
315
|
|
|
292
|
-
|
|
316
|
+
void read_raw(void * ptr, size_t len) {
|
|
317
|
+
if (has_direct_io()) {
|
|
318
|
+
read_aligned_chunk(ptr, len);
|
|
319
|
+
} else {
|
|
320
|
+
read_raw_unsafe(ptr, len);
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
uint32_t read_u32() {
|
|
293
325
|
uint32_t ret;
|
|
294
326
|
read_raw(&ret, sizeof(ret));
|
|
295
327
|
return ret;
|
|
@@ -310,6 +342,10 @@ struct llama_file::impl {
|
|
|
310
342
|
write_raw(&val, sizeof(val));
|
|
311
343
|
}
|
|
312
344
|
|
|
345
|
+
bool has_direct_io() const {
|
|
346
|
+
return fd != -1 && alignment > 1;
|
|
347
|
+
}
|
|
348
|
+
|
|
313
349
|
~impl() {
|
|
314
350
|
if (fd != -1) {
|
|
315
351
|
close(fd);
|
|
@@ -318,17 +354,9 @@ struct llama_file::impl {
|
|
|
318
354
|
}
|
|
319
355
|
}
|
|
320
356
|
int fd = -1;
|
|
357
|
+
std::string fname;
|
|
321
358
|
#endif
|
|
322
359
|
|
|
323
|
-
void read_raw_at(void * ptr, size_t len, size_t offset) const {
|
|
324
|
-
if (alignment != 1) {
|
|
325
|
-
read_aligned_chunk(offset, ptr, len);
|
|
326
|
-
} else {
|
|
327
|
-
seek(offset, SEEK_SET);
|
|
328
|
-
read_raw(ptr, len);
|
|
329
|
-
}
|
|
330
|
-
}
|
|
331
|
-
|
|
332
360
|
size_t read_alignment() const {
|
|
333
361
|
return alignment;
|
|
334
362
|
}
|
|
@@ -347,6 +375,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
|
|
|
347
375
|
size_t llama_file::size() const { return pimpl->size; }
|
|
348
376
|
|
|
349
377
|
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
|
|
378
|
+
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
|
|
350
379
|
|
|
351
380
|
int llama_file::file_id() const {
|
|
352
381
|
#ifdef _WIN32
|
|
@@ -361,10 +390,14 @@ int llama_file::file_id() const {
|
|
|
361
390
|
}
|
|
362
391
|
|
|
363
392
|
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
|
364
|
-
void llama_file::read_raw(void * ptr, size_t len)
|
|
365
|
-
|
|
393
|
+
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
|
394
|
+
#ifdef _WIN32
|
|
395
|
+
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
|
396
|
+
#else
|
|
397
|
+
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
|
|
398
|
+
#endif
|
|
366
399
|
|
|
367
|
-
uint32_t llama_file::read_u32()
|
|
400
|
+
uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
|
|
368
401
|
|
|
369
402
|
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
|
|
370
403
|
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
|
|
@@ -24,15 +24,16 @@ struct llama_file {
|
|
|
24
24
|
|
|
25
25
|
void seek(size_t offset, int whence) const;
|
|
26
26
|
|
|
27
|
-
void read_raw(void * ptr, size_t len)
|
|
28
|
-
void
|
|
29
|
-
void read_aligned_chunk(
|
|
30
|
-
uint32_t read_u32()
|
|
27
|
+
void read_raw(void * ptr, size_t len);
|
|
28
|
+
void read_raw_unsafe(void * ptr, size_t len);
|
|
29
|
+
void read_aligned_chunk(void * dest, size_t size);
|
|
30
|
+
uint32_t read_u32();
|
|
31
31
|
|
|
32
32
|
void write_raw(const void * ptr, size_t len) const;
|
|
33
33
|
void write_u32(uint32_t val) const;
|
|
34
34
|
|
|
35
35
|
size_t read_alignment() const;
|
|
36
|
+
bool has_direct_io() const;
|
|
36
37
|
private:
|
|
37
38
|
struct impl;
|
|
38
39
|
std::unique_ptr<impl> pimpl;
|
|
@@ -495,6 +495,7 @@ llama_model_loader::llama_model_loader(
|
|
|
495
495
|
const std::string & fname,
|
|
496
496
|
std::vector<std::string> & splits,
|
|
497
497
|
bool use_mmap,
|
|
498
|
+
bool use_direct_io,
|
|
498
499
|
bool check_tensors,
|
|
499
500
|
bool no_alloc,
|
|
500
501
|
const llama_model_kv_override * param_overrides_p,
|
|
@@ -527,9 +528,17 @@ llama_model_loader::llama_model_loader(
|
|
|
527
528
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
|
528
529
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
|
529
530
|
|
|
530
|
-
files.emplace_back(new llama_file(fname.c_str(), "rb",
|
|
531
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
|
531
532
|
contexts.emplace_back(ctx);
|
|
532
533
|
|
|
534
|
+
use_direct_io = use_direct_io && files.back()->has_direct_io();
|
|
535
|
+
|
|
536
|
+
// Disable mmap in case Direct I/O is enabled and available
|
|
537
|
+
if (use_direct_io && use_mmap) {
|
|
538
|
+
use_mmap = false;
|
|
539
|
+
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
|
540
|
+
}
|
|
541
|
+
|
|
533
542
|
// Save tensors data offset of the main file.
|
|
534
543
|
// For subsidiary files, `meta` tensor data offset must not be used,
|
|
535
544
|
// so we build a unified tensors index for weights.
|
|
@@ -595,7 +604,7 @@ llama_model_loader::llama_model_loader(
|
|
|
595
604
|
}
|
|
596
605
|
}
|
|
597
606
|
|
|
598
|
-
files.emplace_back(new llama_file(fname_split, "rb",
|
|
607
|
+
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
|
599
608
|
contexts.emplace_back(ctx);
|
|
600
609
|
|
|
601
610
|
// Save tensors data offset info of the shard.
|
|
@@ -739,6 +748,7 @@ llama_model_loader::llama_model_loader(
|
|
|
739
748
|
}
|
|
740
749
|
|
|
741
750
|
this->use_mmap = use_mmap;
|
|
751
|
+
this->use_direct_io = use_direct_io;
|
|
742
752
|
this->check_tensors = check_tensors;
|
|
743
753
|
this->no_alloc = no_alloc;
|
|
744
754
|
}
|
|
@@ -1100,7 +1110,8 @@ bool llama_model_loader::load_all_data(
|
|
|
1100
1110
|
const auto & file = files.at(weight->idx);
|
|
1101
1111
|
|
|
1102
1112
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
|
1103
|
-
file->
|
|
1113
|
+
file->seek(weight->offs, SEEK_SET);
|
|
1114
|
+
file->read_raw(cur->data, n_size);
|
|
1104
1115
|
if (check_tensors) {
|
|
1105
1116
|
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
|
|
1106
1117
|
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
|
|
@@ -1132,7 +1143,7 @@ bool llama_model_loader::load_all_data(
|
|
|
1132
1143
|
ggml_backend_event_synchronize(events[buffer_idx]);
|
|
1133
1144
|
|
|
1134
1145
|
// Read aligned chunk from file
|
|
1135
|
-
file->
|
|
1146
|
+
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
|
|
1136
1147
|
|
|
1137
1148
|
// Calculate actual data portion (excluding alignment padding)
|
|
1138
1149
|
uintptr_t ptr_data = ptr_dest_aligned;
|
|
@@ -1162,7 +1173,8 @@ bool llama_model_loader::load_all_data(
|
|
|
1162
1173
|
}
|
|
1163
1174
|
} else {
|
|
1164
1175
|
read_buf.resize(n_size);
|
|
1165
|
-
file->
|
|
1176
|
+
file->seek(weight->offs, SEEK_SET);
|
|
1177
|
+
file->read_raw(read_buf.data(), n_size);
|
|
1166
1178
|
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
|
1167
1179
|
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
|
|
1168
1180
|
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
|
@@ -70,6 +70,7 @@ struct llama_model_loader {
|
|
|
70
70
|
size_t n_bytes = 0;
|
|
71
71
|
|
|
72
72
|
bool use_mmap = false;
|
|
73
|
+
bool use_direct_io = false;
|
|
73
74
|
bool check_tensors;
|
|
74
75
|
bool no_alloc;
|
|
75
76
|
|
|
@@ -97,6 +98,7 @@ struct llama_model_loader {
|
|
|
97
98
|
const std::string & fname,
|
|
98
99
|
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
|
99
100
|
bool use_mmap,
|
|
101
|
+
bool use_direct_io,
|
|
100
102
|
bool check_tensors,
|
|
101
103
|
bool no_alloc,
|
|
102
104
|
const llama_model_kv_override * param_overrides_p,
|
|
@@ -146,6 +146,9 @@ void llama_model_saver::add_kv_from_model() {
|
|
|
146
146
|
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
|
|
147
147
|
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
|
148
148
|
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
|
149
|
+
if (hparams.n_embd_out > 0) {
|
|
150
|
+
add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
|
|
151
|
+
}
|
|
149
152
|
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
|
150
153
|
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
151
154
|
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
|
|
@@ -507,6 +507,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
507
507
|
|
|
508
508
|
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
|
509
509
|
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
|
510
|
+
ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
|
|
510
511
|
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
|
511
512
|
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
|
512
513
|
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
|
@@ -578,6 +579,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
578
579
|
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
|
|
579
580
|
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
|
|
580
581
|
|
|
582
|
+
// TODO: Handle SWA metadata similarly when models start implementing it
|
|
581
583
|
// rope_freq_scale (inverse of the kv) is optional
|
|
582
584
|
float ropescale = 0.0f;
|
|
583
585
|
if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
|
|
@@ -586,10 +588,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
586
588
|
}
|
|
587
589
|
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
|
|
588
590
|
|
|
589
|
-
// by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
|
|
590
|
-
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
591
|
-
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
|
592
|
-
|
|
593
591
|
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
|
|
594
592
|
|
|
595
593
|
// non-transformer models do not have attention heads
|
|
@@ -677,6 +675,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
677
675
|
hparams.f_attn_temp_scale = 0.1f;
|
|
678
676
|
hparams.f_attn_temp_offset = 1.0f;
|
|
679
677
|
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
|
678
|
+
|
|
679
|
+
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
680
|
+
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
|
681
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
680
682
|
}
|
|
681
683
|
|
|
682
684
|
switch (hparams.n_expert) {
|
|
@@ -722,6 +724,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
722
724
|
if (hparams.n_swa > 0) {
|
|
723
725
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
724
726
|
hparams.set_swa_pattern(4);
|
|
727
|
+
|
|
728
|
+
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
729
|
+
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
|
730
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
725
731
|
} else {
|
|
726
732
|
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
|
727
733
|
}
|
|
@@ -1243,7 +1249,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1243
1249
|
if (found_swa && hparams.n_swa > 0) {
|
|
1244
1250
|
uint32_t swa_period = 8;
|
|
1245
1251
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1246
|
-
hparams.rope_freq_scale_train_swa = 1.0f;
|
|
1247
1252
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
|
1248
1253
|
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
|
1249
1254
|
hparams.set_swa_pattern(swa_period);
|
|
@@ -1309,7 +1314,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1309
1314
|
hparams.n_swa = 4096; // default value of gemma 2
|
|
1310
1315
|
hparams.set_swa_pattern(2);
|
|
1311
1316
|
hparams.attn_soft_cap = true;
|
|
1317
|
+
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
1318
|
+
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
|
1312
1319
|
|
|
1320
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
1313
1321
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
|
1314
1322
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1315
1323
|
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
|
|
@@ -1334,8 +1342,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1334
1342
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1335
1343
|
hparams.set_swa_pattern(6);
|
|
1336
1344
|
|
|
1337
|
-
hparams.rope_freq_base_train_swa
|
|
1338
|
-
hparams.rope_freq_scale_train_swa = 1.0f;
|
|
1345
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
1339
1346
|
} else {
|
|
1340
1347
|
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
|
1341
1348
|
}
|
|
@@ -1365,10 +1372,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1365
1372
|
hparams.set_swa_pattern(5);
|
|
1366
1373
|
|
|
1367
1374
|
hparams.n_layer_kv_from_start = 20;
|
|
1368
|
-
hparams.rope_freq_base_train_swa = 10000.0f;
|
|
1369
|
-
hparams.rope_freq_scale_train_swa = 1.0f;
|
|
1370
1375
|
hparams.f_attention_scale = 1.0f;
|
|
1371
1376
|
|
|
1377
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
1372
1378
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
1373
1379
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1374
1380
|
|
|
@@ -1384,9 +1390,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1384
1390
|
hparams.set_swa_pattern(6);
|
|
1385
1391
|
|
|
1386
1392
|
hparams.causal_attn = false; // embeddings do not use causal attention
|
|
1387
|
-
hparams.rope_freq_base_train_swa = 10000.0f;
|
|
1388
|
-
hparams.rope_freq_scale_train_swa = 1.0f;
|
|
1389
1393
|
|
|
1394
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
1390
1395
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
1391
1396
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1392
1397
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
@@ -1525,7 +1530,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1525
1530
|
{
|
|
1526
1531
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1527
1532
|
hparams.set_swa_pattern(4);
|
|
1533
|
+
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
1534
|
+
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
|
1528
1535
|
|
|
1536
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
1529
1537
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
1530
1538
|
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
|
1531
1539
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -1564,6 +1572,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1564
1572
|
if (found_swa && hparams.n_swa > 0) {
|
|
1565
1573
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1566
1574
|
hparams.set_swa_pattern(4);
|
|
1575
|
+
|
|
1576
|
+
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
1577
|
+
hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
|
|
1578
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
1567
1579
|
} else {
|
|
1568
1580
|
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
|
1569
1581
|
}
|
|
@@ -1906,6 +1918,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1906
1918
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1907
1919
|
hparams.n_swa = 4096;
|
|
1908
1920
|
hparams.set_swa_pattern(4);
|
|
1921
|
+
|
|
1922
|
+
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
1923
|
+
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
|
1924
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
1909
1925
|
}
|
|
1910
1926
|
|
|
1911
1927
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
|
@@ -2208,6 +2224,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
2208
2224
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
2209
2225
|
hparams.set_swa_pattern(2);
|
|
2210
2226
|
|
|
2227
|
+
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
2228
|
+
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
|
2229
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
2230
|
+
|
|
2211
2231
|
switch (hparams.n_layer) {
|
|
2212
2232
|
case 24: type = LLM_TYPE_20B; break;
|
|
2213
2233
|
case 36: type = LLM_TYPE_120B; break;
|
|
@@ -2252,6 +2272,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
2252
2272
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
2253
2273
|
hparams.n_swa = 4096;
|
|
2254
2274
|
hparams.set_swa_pattern(4, true);
|
|
2275
|
+
|
|
2276
|
+
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
|
2277
|
+
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
|
2278
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
|
2255
2279
|
} else {
|
|
2256
2280
|
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
|
2257
2281
|
hparams.n_no_rope_layer_step = hparams.n_layer;
|
|
@@ -2416,7 +2440,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2416
2440
|
|
|
2417
2441
|
const bool use_mmap_buffer = true;
|
|
2418
2442
|
|
|
2419
|
-
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n",
|
|
2443
|
+
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
|
|
2444
|
+
__func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
|
|
2420
2445
|
|
|
2421
2446
|
// build a list of buffer types for the CPU and GPU devices
|
|
2422
2447
|
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
|
|
@@ -2427,6 +2452,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2427
2452
|
pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
|
|
2428
2453
|
}
|
|
2429
2454
|
|
|
2455
|
+
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
2456
|
+
if (cpu_dev == nullptr) {
|
|
2457
|
+
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
2458
|
+
}
|
|
2459
|
+
|
|
2430
2460
|
// calculate the split points
|
|
2431
2461
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
|
|
2432
2462
|
std::vector<float> splits(n_devices());
|
|
@@ -2437,6 +2467,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2437
2467
|
size_t total;
|
|
2438
2468
|
size_t free;
|
|
2439
2469
|
ggml_backend_dev_memory(dev, &free, &total);
|
|
2470
|
+
|
|
2471
|
+
// devices can return 0 bytes for free and total memory if they do not
|
|
2472
|
+
// have any to report. in this case, we will use the host memory as a fallback
|
|
2473
|
+
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
|
2474
|
+
if (free == 0 && total == 0) {
|
|
2475
|
+
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
|
2476
|
+
}
|
|
2440
2477
|
splits[i] = free;
|
|
2441
2478
|
}
|
|
2442
2479
|
} else {
|
|
@@ -2453,10 +2490,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2453
2490
|
splits[i] /= split_sum;
|
|
2454
2491
|
}
|
|
2455
2492
|
|
|
2456
|
-
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
2457
|
-
if (cpu_dev == nullptr) {
|
|
2458
|
-
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
|
2459
|
-
}
|
|
2460
2493
|
const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
|
|
2461
2494
|
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
|
|
2462
2495
|
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
|
@@ -6446,6 +6479,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
6446
6479
|
layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
|
|
6447
6480
|
}
|
|
6448
6481
|
}
|
|
6482
|
+
|
|
6483
|
+
// for LFM2-ColBert-350M
|
|
6484
|
+
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
|
|
6449
6485
|
} break;
|
|
6450
6486
|
case LLM_ARCH_SMALLTHINKER:
|
|
6451
6487
|
{
|
|
@@ -7098,6 +7134,10 @@ void llama_model::print_info() const {
|
|
|
7098
7134
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
|
7099
7135
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
|
7100
7136
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
7137
|
+
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
|
7138
|
+
LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
|
|
7139
|
+
LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
|
|
7140
|
+
}
|
|
7101
7141
|
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
|
7102
7142
|
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
|
7103
7143
|
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
|
@@ -7910,12 +7950,17 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
7910
7950
|
// add on pooling layer
|
|
7911
7951
|
llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
|
|
7912
7952
|
|
|
7953
|
+
// add backend sampling layers (if any)
|
|
7954
|
+
llm->build_sampling();
|
|
7955
|
+
|
|
7913
7956
|
// if the gguf model was converted with --sentence-transformers-dense-modules
|
|
7914
7957
|
// there will be two additional dense projection layers
|
|
7915
7958
|
// dense linear projections are applied after pooling
|
|
7916
7959
|
// TODO: move reranking logic here and generalize
|
|
7917
7960
|
llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
|
|
7918
7961
|
|
|
7962
|
+
llm->res->set_outputs();
|
|
7963
|
+
|
|
7919
7964
|
return llm->res->get_gf();
|
|
7920
7965
|
}
|
|
7921
7966
|
|
|
@@ -7937,6 +7982,7 @@ llama_model_params llama_model_default_params() {
|
|
|
7937
7982
|
/*.kv_overrides =*/ nullptr,
|
|
7938
7983
|
/*.vocab_only =*/ false,
|
|
7939
7984
|
/*.use_mmap =*/ true,
|
|
7985
|
+
/*.use_direct_io =*/ true,
|
|
7940
7986
|
/*.use_mlock =*/ false,
|
|
7941
7987
|
/*.check_tensors =*/ false,
|
|
7942
7988
|
/*.use_extra_bufts =*/ true,
|
|
@@ -7971,6 +8017,10 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
|
|
|
7971
8017
|
return model->hparams.n_embd_inp();
|
|
7972
8018
|
}
|
|
7973
8019
|
|
|
8020
|
+
int32_t llama_model_n_embd_out(const llama_model * model) {
|
|
8021
|
+
return model->hparams.get_n_embd_out();
|
|
8022
|
+
}
|
|
8023
|
+
|
|
7974
8024
|
int32_t llama_model_n_layer(const llama_model * model) {
|
|
7975
8025
|
return model->hparams.n_layer;
|
|
7976
8026
|
}
|
|
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
596
596
|
}
|
|
597
597
|
|
|
598
598
|
std::vector<std::string> splits = {};
|
|
599
|
-
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
|
599
|
+
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
|
600
600
|
ml.init_mappings(false); // no prefetching
|
|
601
601
|
|
|
602
602
|
llama_model model(llama_model_default_params());
|