@fugood/llama.node 1.4.12 → 1.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +11 -1
- package/lib/index.js +2 -1
- package/lib/index.ts +2 -0
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaContext.cpp +5 -2
- package/src/llama.cpp/common/arg.cpp +249 -101
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/chat.cpp +4 -4
- package/src/llama.cpp/common/common.cpp +21 -1
- package/src/llama.cpp/common/common.h +20 -7
- package/src/llama.cpp/common/download.cpp +104 -55
- package/src/llama.cpp/common/download.h +26 -5
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/preset.cpp +76 -1
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +92 -10
- package/src/llama.cpp/src/llama-arch.cpp +2 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +615 -28
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +8 -2
- package/src/llama.cpp/src/llama-mmap.cpp +70 -37
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +66 -16
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1233 -171
- package/src/llama.cpp/src/llama-sampling.h +16 -7
- package/src/llama.cpp/src/llama.cpp +101 -57
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
package/src/llama.cpp/src/llama-sampling.h

@@ -14,7 +14,16 @@ struct llama_grammar;
 struct llama_sampler_chain {
     llama_sampler_chain_params params;
 
-
+    // has .backend_init() been called?
+    bool is_init = false;
+
+    struct info {
+        bool is_backend;
+
+        llama_sampler * ptr;
+    };
+
+    std::vector<info> samplers;
 
     // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
     std::vector<llama_token_data> cur;
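The practical effect of this hunk is that each entry in a sampler chain now records whether the sampler was initialized on the backend. A minimal sketch of walking the new layout; the helper name is hypothetical and only the field names come from the struct above:

```cpp
// Hypothetical helper, not part of the diff: counts chain entries that were
// initialized on the backend, using the new `info` layout shown above.
static size_t count_backend_samplers(const llama_sampler_chain & chain) {
    size_t n = 0;
    for (const auto & entry : chain.samplers) { // entries are `info`, not raw pointers
        if (entry.is_backend && entry.ptr != nullptr) {
            n++;
        }
    }
    return n;
}
```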
@@ -27,9 +36,9 @@ struct llama_sampler_chain {
 };
 
 struct llama_sampler * llama_sampler_init_dry_testing(
-
-
-
-
-
-
+        int32_t context_size,
+        float dry_multiplier,
+        float dry_base,
+        int32_t dry_allowed_length,
+        int32_t dry_penalty_last_n,
+        const std::vector<std::vector<llama_token>> & seq_breakers);
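This declaration is the test-only constructor for the DRY sampler. A hedged usage sketch; the argument values are illustrative, and it assumes the internal `llama-sampling.h` header is on the include path (the function is not part of the public `llama.h` API):

```cpp
#include <vector>

#include "llama.h"           // llama_token, llama_sampler, llama_sampler_free
#include "llama-sampling.h"  // internal header declaring llama_sampler_init_dry_testing

int main() {
    // Illustrative values only; the parameter list follows the declaration above.
    std::vector<std::vector<llama_token>> seq_breakers = { { 13 }, { 198 } };

    llama_sampler * smpl = llama_sampler_init_dry_testing(
        /*context_size       =*/ 4096,
        /*dry_multiplier     =*/ 0.8f,
        /*dry_base           =*/ 1.75f,
        /*dry_allowed_length =*/ 2,
        /*dry_penalty_last_n =*/ -1,
        seq_breakers);

    llama_sampler_free(smpl);
    return 0;
}
```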
package/src/llama.cpp/src/llama.cpp

@@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         }
     }
     for (size_t i = 0; i < ret.size(); i++) {
-        size_t free
+        size_t free;
+        size_t total;
         ggml_backend_dev_memory(model->devices[i], &free, &total);
+
+        // devices can return 0 bytes for free and total memory if they do not
+        // have any to report. in this case, we will use the host memory as a fallback
+        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        if (free == 0 && total == 0) {
+            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (cpu_dev == nullptr) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
+            ggml_backend_dev_memory(cpu_dev, &free, &total);
+        }
         ret[i].free = free;
         ret[i].total = total;
     }
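The added block works around backends that report 0 bytes for both free and total memory: when that happens, host (CPU) memory is queried instead. A self-contained sketch of the same pattern using only the public ggml-backend API (the `format()` call from the hunk is replaced with a plain exception message):

```cpp
#include <stdexcept>

#include "ggml-backend.h"

// Query free/total memory for a device, falling back to the CPU device when
// the backend reports nothing (the 0/0 case handled in the hunk above).
static void device_memory_with_fallback(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_dev_memory(dev, free, total);
    if (*free == 0 && *total == 0) {
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error("no CPU backend found");
        }
        ggml_backend_dev_memory(cpu_dev, free, total);
    }
}
```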
@@ -147,9 +159,8 @@ class llama_params_fit_exception : public std::runtime_error {
 static void llama_params_fit_impl(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t
+        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
     constexpr int64_t MiB = 1024*1024;
-    const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
     typedef std::vector<llama_device_memory_data> dmds_t;
     const llama_model_params default_mparams = llama_model_default_params();
 

@@ -168,6 +179,12 @@ static void llama_params_fit_impl(
         return;
     }
 
+    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
+    margins.reserve(nd);
+    for (size_t id = 0; id < nd; id++) {
+        margins.push_back(margins_s[id]);
+    }
+
     std::vector<std::string> dev_names;
     {
         dev_names.reserve(nd);
@@ -187,9 +204,10 @@ static void llama_params_fit_impl(
 
     int64_t sum_free = 0;
     int64_t sum_projected_free = 0;
-    int64_t min_projected_free = INT64_MAX;
     int64_t sum_projected_used = 0;
     int64_t sum_projected_model = 0;
+    std::vector<int64_t> projected_free_per_device;
+    projected_free_per_device.reserve(nd);
 
     if (nd > 1) {
         LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);

@@ -199,45 +217,63 @@ static void llama_params_fit_impl(
 
         const int64_t projected_used = dmd.mb.total();
         const int64_t projected_free = dmd.free - projected_used;
+        projected_free_per_device.push_back(projected_free);
 
         sum_free += dmd.free;
         sum_projected_used += projected_used;
         sum_projected_free += projected_free;
-        min_projected_free = std::min(min_projected_free, projected_free);
         sum_projected_model += dmd.mb.model;
 
         if (nd > 1) {
-            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB,
-                projected_free >= 0 ? "surplus" : "deficit");
+            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
         }
     }
     assert(sum_free >= 0 && sum_projected_used >= 0);
     LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
         __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (
-        if (
+    if (nd == 1) {
+        if (projected_free_per_device[0] >= margins[0]) {
             LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__,
+                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+            return;
+        }
+    } else {
+        bool changes_needed = false;
+        for (size_t id = 0; id < nd; id++) {
+            if (projected_free_per_device[id] < margins[id]) {
+                changes_needed = true;
+                break;
+            }
+        }
+        if (!changes_needed) {
+            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
             return;
         }
-        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
-            __func__, min_projected_free/MiB, margin/MiB);
-        return;
     }
 
     // step 2: try reducing memory use by reducing the context size
 
     {
-        int64_t global_surplus = sum_projected_free
+        int64_t global_surplus = sum_projected_free;
+        for (size_t id = 0; id < nd; id++) {
+            global_surplus -= margins[id];
+        }
         if (global_surplus < 0) {
-
-            "%s: cannot
-
-
+            if (nd == 1) {
+                LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+                    __func__, margins[0]/MiB, -global_surplus/MiB);
+            } else {
+                LLAMA_LOG_INFO(
+                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
+                    __func__, -global_surplus/MiB);
+            }
             if (cparams->n_ctx == 0) {
                 if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free
+                    int64_t sum_used_target = sum_free;
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_used_target -= margins[id];
+                    }
                     if (nd > 1) {
                         // for multiple devices we need to be more conservative in terms of how much context we think can fit:
                         // - for dense models only whole layers can be assigned to devices
@@ -359,6 +395,11 @@ static void llama_params_fit_impl(
 
         // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
         layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+
+        uint32_t n_full() const {
+            assert(n_layer >= n_part);
+            return n_layer - n_part;
+        }
     };
 
     const size_t ntbo = llama_max_tensor_buft_overrides();

@@ -382,7 +423,7 @@ static void llama_params_fit_impl(
 
         size_t itbo = 0;
         for (size_t id = 0; id < nd; id++) {
-            il0 += ngl_per_device[id].
+            il0 += ngl_per_device[id].n_full();
             for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
                 if (itbo + 1 >= ntbo) {
                     tensor_buft_overrides[itbo].pattern = nullptr;

@@ -393,7 +434,7 @@ static void llama_params_fit_impl(
                         + std::to_string(ntbo) + " is insufficient for model");
                 }
                 tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-                tensor_buft_overrides[itbo].buft = overflow_bufts[id];
+                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
                 itbo++;
             }
             il0 += ngl_per_device[id].n_part;
@@ -443,9 +484,9 @@ static void llama_params_fit_impl(
         const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
             path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
 
-        for (
-            global_surplus_cpu_moe +=
-            global_surplus_cpu_moe -= int64_t(
+        for (size_t id = 0; id < nd; id++) {
+            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
+            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
         }
 
         if (global_surplus_cpu_moe > 0) {

@@ -464,24 +505,18 @@ static void llama_params_fit_impl(
     std::vector<int64_t> targets; // maximum acceptable memory use per device
     targets.reserve(nd);
     for (size_t id = 0; id < nd; id++) {
-        targets.push_back(dmds_full[id].free -
+        targets.push_back(dmds_full[id].free - margins[id]);
         LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
     }
 
-    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial
+    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
     overflow_bufts.reserve(nd);
-    for (size_t id = 0; id < nd
-        overflow_bufts.push_back(
+    for (size_t id = 0; id < nd; id++) {
+        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
     }
-    overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
 
     std::vector<ngl_t> ngl_per_device(nd);
     std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
-    if (hp_nex > 0) {
-        for (size_t id = 0; id < nd; id++) {
-            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
-        }
-    }
 
     // optimize the number of layers per device using the method of false position:
     // - ngl_per_device has 0 layers for each device, lower bound
@@ -512,9 +547,6 @@ static void llama_params_fit_impl(
         if (mem_high[id] > targets[id]) {
             assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
             uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-            if (hp_nex > 0 && size_t(id) == nd - 1) {
-                delta--;
-            }
             LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
             while (delta > 1) {
                 uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
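The `step_size` line above is the interpolation step of the "method of false position" named in the earlier comment: assuming memory grows roughly linearly with the number of layers, it estimates how many of the remaining `delta` layers fit within the device's memory target. A small standalone sketch of the same arithmetic, with all values illustrative:

```cpp
#include <cstdint>
#include <cstdio>

// Regula-falsi style guess used by the layer-fitting loop above: interpolate
// linearly between memory use at the lower bound (mem) and the upper bound
// (mem_high) to pick how many extra layers to try next.
int main() {
    const uint32_t delta    = 20;     // layers between lower and upper bound (illustrative)
    const int64_t  mem      =  2000;  // MiB used at the lower bound (illustrative)
    const int64_t  mem_high = 22000;  // MiB used at the upper bound (illustrative)
    const int64_t  target   = 12000;  // MiB budget for this device (illustrative)

    uint32_t step_size = int64_t(delta) * (target - mem) / (mem_high - mem);
    if (step_size < 1) {
        step_size = 1;
    }
    printf("try adding %u layers\n", step_size); // 20 * 10000 / 20000 = 10
    return 0;
}
```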
@@ -524,7 +556,8 @@ static void llama_params_fit_impl(
                 std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                 ngl_per_device_test[id].n_layer += step_size;
                 if (hp_nex) {
-                    ngl_per_device_test[id].n_part +=
+                    ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
+                        step_size - 1 : step_size; // the first layer is the output layer which must always be full
                 }
                 const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
 

@@ -573,7 +606,7 @@ static void llama_params_fit_impl(
     assert(id_dense_start < nd);
 
     LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-    for (size_t id = 0; id <= id_dense_start; id++) {
+    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         for (size_t jd = id_dense_start; jd < nd; jd++) {
             const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
@@ -585,12 +618,8 @@ static void llama_params_fit_impl(
         std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
 
         if (mem_high[id] > targets[id]) {
-            assert(ngl_per_device_high[id].
-
-            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
-            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
             while (delta > 1) {
                 uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                 step_size = std::max(step_size, uint32_t(1));

@@ -606,7 +635,7 @@ static void llama_params_fit_impl(
                     ngl_per_device_test[id].n_layer += n_convert_jd;
                     n_converted_test += n_convert_jd;
 
-                    if (ngl_per_device_test[id_dense_start_test].
+                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
                         break;
                     }
                 }

@@ -625,8 +654,8 @@ static void llama_params_fit_impl(
                 LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                     __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                 }
-
-
+                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
             }
         } else {
             ngl_per_device = ngl_per_device_high;
@@ -644,14 +673,19 @@ static void llama_params_fit_impl(
             ngl_per_device_test[id_dense_start_test].n_part--;
             ngl_per_device_test[id].n_layer++;
             ngl_per_device_test[id].n_part++;
-            if (ngl_per_device_test[id_dense_start_test].
+            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
                 id_dense_start_test++;
             }
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
+            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
+            if (id < nd - 1) {
+                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
+            }
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test,
+            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
             if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                 ngl_per_device = ngl_per_device_test;
+                overflow_bufts = overflow_bufts_test;
                 mem = mem_test;
                 id_dense_start = id_dense_start_test;
                 LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",

@@ -659,9 +693,10 @@ static void llama_params_fit_impl(
 
                 ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                 LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test,
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                 if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                     ngl_per_device = ngl_per_device_test;
+                    overflow_bufts = overflow_bufts_test;
                     mem = mem_test;
                     id_dense_start = id_dense_start_test;
                     LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",

@@ -670,9 +705,10 @@ static void llama_params_fit_impl(
                 } else {
                     ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                     LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                    mem_test = get_memory_for_layers(__func__, ngl_per_device_test,
+                    mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                     if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                         ngl_per_device = ngl_per_device_test;
+                        overflow_bufts = overflow_bufts_test;
                         mem = mem_test;
                         id_dense_start = id_dense_start_test;
                         LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -687,17 +723,25 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
     }
 
+    // print info for devices that were not changed during the conversion from dense only to full layers:
+    for (size_t id = id_dense_start + 1; id < nd; id++) {
+        const int64_t projected_margin = dmds_full[id].free - mem[id];
+        LLAMA_LOG_INFO(
+            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+    }
+
     set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
 }
 
 enum llama_params_fit_status llama_params_fit(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t
+        size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
     const int64_t t0_us = llama_time_us();
     llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
     try {
-        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides,
+        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
         LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
     } catch (const llama_params_fit_exception & e) {
         LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
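With this change, `llama_params_fit` accepts a per-device `margins` array instead of a single margin value. A hedged call sketch; the array sizes, model path, and margin values are illustrative, and only the signature comes from the hunk above:

```cpp
#include "llama.h"

int main() {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    // One margin per device: ask the fitter to leave roughly 1 GiB free on each (illustrative).
    float  tensor_split[16] = {0};
    size_t margins[16];
    for (int i = 0; i < 16; i++) {
        margins[i] = (size_t) 1024 * 1024 * 1024;
    }
    llama_model_tensor_buft_override tbo[256] = {};

    enum llama_params_fit_status status = llama_params_fit(
        "model.gguf", &mparams, &cparams, tensor_split, tbo,
        margins, /*n_ctx_min =*/ 4096, GGML_LOG_LEVEL_INFO);

    return status == LLAMA_PARAMS_FIT_STATUS_SUCCESS ? 0 : 1;
}
```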
@@ -713,7 +757,7 @@ enum llama_params_fit_status llama_params_fit(
 
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
-        /*.no_perf
+        /*.no_perf =*/ true,
     };
 
     return result;

@@ -786,7 +830,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 
package/src/llama.cpp/src/models/afmoe.cpp

@@ -22,8 +22,15 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
     const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
 
     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         ggml_tensor * inpSA = inpL;
 
+        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                              (il + 1) % hparams.n_no_rope_layer_step != 0;
+
         // dual attention normalization (pre)
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,

@@ -56,19 +63,16 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
         cb(Qcur, "Qcur_normed", il);
         cb(Kcur, "Kcur_normed", il);
 
-        // RoPE only for sliding_attention layers
-        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
-                              ((il + 1) % hparams.n_no_rope_layer_step) != 0;
         if (use_rope) {
             Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                     ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur_rope", il);
 
             Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                     ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur_rope", il);
         }
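The `use_rope` flag above skips RoPE on every `n_no_rope_layer_step`-th layer. A tiny standalone sketch of which layers end up with RoPE under that rule; the layer count and step value are illustrative, not taken from any particular model:

```cpp
#include <cstdio>

// Layers that apply RoPE under the rule used in the afmoe/llama-iswa hunks:
// RoPE is skipped whenever (il + 1) is a multiple of n_no_rope_layer_step.
int main() {
    const int n_layer              = 12; // illustrative
    const int n_no_rope_layer_step = 4;  // illustrative

    for (int il = 0; il < n_layer; ++il) {
        const bool use_rope = n_no_rope_layer_step > 0 &&
                              (il + 1) % n_no_rope_layer_step != 0;
        printf("layer %2d: %s\n", il, use_rope ? "rope" : "no rope");
    }
    return 0; // layers 3, 7 and 11 are the NoPE layers in this example
}
```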
package/src/llama.cpp/src/models/cohere2-iswa.cpp

@@ -21,6 +21,9 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
 
     for (int il = 0; il < n_layer; ++il) {
         const bool is_swa = hparams.is_swa(il);
+        // UNUSED:
+        // const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
         // norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
package/src/llama.cpp/src/models/gemma2-iswa.cpp

@@ -19,6 +19,9 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,

@@ -43,12 +46,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
 
         Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow);
 
         Kcur = ggml_rope_ext(
                 ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow);
 
         cb(Qcur, "Qcur", il);
package/src/llama.cpp/src/models/llama-iswa.cpp

@@ -25,8 +25,12 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         ggml_tensor * inpSA = inpL;
 
+        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
         const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
                               (il + 1) % hparams.n_no_rope_layer_step != 0;
 

@@ -67,13 +71,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
         if (use_rope) {
             Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, rope_factors,
-                    n_rot, rope_type, n_ctx_orig,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
 
             Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, rope_factors,
-                    n_rot, rope_type, n_ctx_orig,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
         } else if (inp_attn_scale) {
package/src/llama.cpp/src/models/modern-bert.cpp

@@ -23,7 +23,8 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
     auto * inp_attn = build_attn_inp_no_cache();
 
     for (int il = 0; il < n_layer; ++il) {
-        float freq_base_l
+        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
         cur = inpL;
 

@@ -48,13 +49,13 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
         // RoPE
         Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
         Kcur = ggml_rope_ext(
                 ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
package/src/llama.cpp/src/models/openai-moe-iswa.cpp

@@ -14,6 +14,9 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         ggml_tensor * inpSA = inpL;
 
         // norm

@@ -49,13 +52,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
 
         Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
         Kcur = ggml_rope_ext(
                 ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
package/src/llama.cpp/src/models/smallthinker.cpp

@@ -26,10 +26,16 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         ggml_tensor * inpSA = inpL;
-        ggml_tensor * probs = nullptr;
 
-
+        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+        const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
+                              il % hparams.n_no_rope_layer_step != 0;
+
+        ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
         cb(probs, "ffn_moe_logits", il);
 
         // norm

@@ -52,11 +58,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
         Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
         Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-        if (
-        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+        if (use_rope) {
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                                  ext_factor, attn_factor, beta_fast, beta_slow);
 
-        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                                  ext_factor, attn_factor, beta_fast, beta_slow);
         }
         cb(Qcur, "Qcur", il);