@fugood/llama.node 1.4.11 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +31 -31
- package/src/llama.cpp/common/arg.cpp +128 -59
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +36 -7
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +42 -23
- package/src/llama.cpp/common/common.h +11 -1
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +100 -12
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +78 -0
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +637 -49
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +12 -5
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +337 -26
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
- package/src/llama.cpp/src/llama-sampling.h +19 -7
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +87 -64
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
- package/src/llama.cpp/src/unicode.cpp +23 -14
@@ -140,6 +140,10 @@ enum layer_fraction_t {
 };
 // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
 
+class llama_params_fit_exception : public std::runtime_error {
+    using std::runtime_error::runtime_error;
+};
+
 static void llama_params_fit_impl(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
@@ -181,12 +185,11 @@ static void llama_params_fit_impl(
         }
     }
 
-    int64_t
+    int64_t sum_free = 0;
     int64_t sum_projected_free = 0;
     int64_t min_projected_free = INT64_MAX;
     int64_t sum_projected_used = 0;
     int64_t sum_projected_model = 0;
-    int64_t sum_projected_ctx = 0;
 
     if (nd > 1) {
         LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -197,12 +200,11 @@ static void llama_params_fit_impl(
         const int64_t projected_used = dmd.mb.total();
         const int64_t projected_free = dmd.free - projected_used;
 
-
+        sum_free += dmd.free;
         sum_projected_used += projected_used;
         sum_projected_free += projected_free;
         min_projected_free = std::min(min_projected_free, projected_free);
         sum_projected_model += dmd.mb.model;
-        sum_projected_ctx += dmd.mb.context;
 
         if (nd > 1) {
             LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
@@ -210,10 +212,9 @@ static void llama_params_fit_impl(
                 projected_free >= 0 ? "surplus" : "deficit");
         }
     }
-    assert(
-    assert(sum_projected_used >= sum_projected_ctx);
+    assert(sum_free >= 0 && sum_projected_used >= 0);
    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB,
+        __func__, sum_projected_used/MiB, sum_free/MiB);
    if (min_projected_free >= margin) {
        if (nd == 1) {
            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
@@ -236,9 +237,7 @@ static void llama_params_fit_impl(
            __func__, margin/MiB, -global_surplus/MiB);
    if (cparams->n_ctx == 0) {
        if (hp_nct > n_ctx_min) {
-
-
-            int64_t memory_reduction = -global_surplus;
+            int64_t sum_used_target = sum_free - nd*margin_s;
            if (nd > 1) {
                // for multiple devices we need to be more conservative in terms of how much context we think can fit:
                // - for dense models only whole layers can be assigned to devices
@@ -246,24 +245,34 @@
                // - on average we expect a waste of 0.5 layers/tensors per device
                // - use slightly more than the expected average for nd devices to be safe
                const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
-
+                sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
            }
 
-
-            cparams->n_ctx =
-
-
-
-
-
-
-
-
+            int64_t sum_projected_used_min_ctx = 0;
+            cparams->n_ctx = n_ctx_min;
+            const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+            for (const auto & dmd : dmds_min_ctx) {
+                sum_projected_used_min_ctx += dmd.mb.total();
+            }
+            if (sum_used_target > sum_projected_used_min_ctx) {
+                // linear interpolation between minimum and maximum context size:
+                cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
+                    / (sum_projected_used - sum_projected_used_min_ctx);
+                cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
+
+                const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
+                const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
+                LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                    __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                if (nd == 1) {
                    LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                    return;
                }
                LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+            } else {
+                const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
+                LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                    __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
            }
        } else {
            LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
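The hunk above now sizes the context by linear interpolation between the caller's minimum context and the model's default context, then rounds down to a multiple of 256. A hedged, self-contained illustration of that arithmetic follows; the numbers are invented for this example and are not taken from the diff.

    // Hedged illustration of the context-size interpolation above (hypothetical numbers).
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_ctx_min = 4096;    // minimum context the caller will accept
        const uint32_t hp_nct    = 131072;  // model's default (maximum) context

        // projected totals across devices, in MiB (all hypothetical):
        const int64_t used_min_ctx = 10000; // stands in for sum_projected_used_min_ctx
        const int64_t used_max_ctx = 26000; // stands in for sum_projected_used
        const int64_t used_target  = 18000; // stands in for sum_used_target

        // linear interpolation between minimum and maximum context size:
        uint32_t n_ctx = uint32_t(n_ctx_min +
            (hp_nct - n_ctx_min) * (used_target - used_min_ctx) / (used_max_ctx - used_min_ctx));
        n_ctx = std::max(n_ctx - n_ctx % 256, n_ctx_min); // round down, as in the diff

        std::printf("fitted n_ctx = %u\n", n_ctx); // prints 67584 for these numbers
        return 0;
    }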
@@ -276,28 +285,28 @@ static void llama_params_fit_impl(
    }
 
    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
-        throw
+        throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
    }
    if (nd > 1) {
        if (!tensor_split) {
-            throw
+            throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
        }
        if (mparams->tensor_split) {
            for (size_t id = 0; id < nd; id++) {
                if (mparams->tensor_split[id] != 0.0f) {
-                    throw
+                    throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
                }
            }
        }
        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            throw
+            throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
        }
    }
    if (!tensor_buft_overrides) {
-        throw
+        throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
    }
    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
-        throw
+        throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
    }
 
    // step 3: iteratively fill the back to front with "dense" layers
@@ -350,6 +359,11 @@
 
        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+
+        uint32_t n_full() const {
+            assert(n_layer >= n_part);
+            return n_layer - n_part;
+        }
    };
 
    const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -373,18 +387,18 @@
 
        size_t itbo = 0;
        for (size_t id = 0; id < nd; id++) {
-            il0 += ngl_per_device[id].
+            il0 += ngl_per_device[id].n_full();
            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
                if (itbo + 1 >= ntbo) {
                    tensor_buft_overrides[itbo].pattern = nullptr;
                    tensor_buft_overrides[itbo].buft = nullptr;
                    itbo++;
                    mparams.tensor_buft_overrides = tensor_buft_overrides;
-                    throw
-                        + std::to_string(ntbo) + " is insufficient for model
+                    throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
+                        + std::to_string(ntbo) + " is insufficient for model");
                }
                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-                tensor_buft_overrides[itbo].buft = overflow_bufts[id];
+                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
                itbo++;
            }
            il0 += ngl_per_device[id].n_part;
@@ -459,20 +473,14 @@
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }
 
-    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial
+    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
    overflow_bufts.reserve(nd);
-    for (size_t id = 0; id < nd
-        overflow_bufts.push_back(
+    for (size_t id = 0; id < nd; id++) {
+        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
    }
-    overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
 
    std::vector<ngl_t> ngl_per_device(nd);
    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
-    if (hp_nex > 0) {
-        for (size_t id = 0; id < nd; id++) {
-            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
-        }
-    }
 
    // optimize the number of layers per device using the method of false position:
    // - ngl_per_device has 0 layers for each device, lower bound
@@ -512,7 +520,8 @@
            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
            ngl_per_device_test[id].n_layer += step_size;
            if (hp_nex) {
-                ngl_per_device_test[id].n_part +=
+                ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
+                    step_size - 1 : step_size; // the first layer is the output layer which must always be full
            }
            const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
 
@@ -561,7 +570,7 @@
    assert(id_dense_start < nd);
 
    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-    for (size_t id = 0; id <= id_dense_start; id++) {
+    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
@@ -573,12 +582,8 @@
        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
 
        if (mem_high[id] > targets[id]) {
-            assert(ngl_per_device_high[id].
-
-            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
-            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                step_size = std::max(step_size, uint32_t(1));
|
|
|
594
599
|
ngl_per_device_test[id].n_layer += n_convert_jd;
|
|
595
600
|
n_converted_test += n_convert_jd;
|
|
596
601
|
|
|
597
|
-
if (ngl_per_device_test[id_dense_start_test].
|
|
602
|
+
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
|
|
598
603
|
break;
|
|
599
604
|
}
|
|
600
605
|
}
|
|
@@ -613,8 +618,8 @@ static void llama_params_fit_impl(
|
|
|
613
618
|
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
|
614
619
|
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
|
615
620
|
}
|
|
616
|
-
|
|
617
|
-
|
|
621
|
+
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
|
622
|
+
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
|
618
623
|
}
|
|
619
624
|
} else {
|
|
620
625
|
ngl_per_device = ngl_per_device_high;
|
|
@@ -632,14 +637,19 @@
        ngl_per_device_test[id_dense_start_test].n_part--;
        ngl_per_device_test[id].n_layer++;
        ngl_per_device_test[id].n_part++;
-        if (ngl_per_device_test[id_dense_start_test].
+        if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
            id_dense_start_test++;
        }
        ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
+        std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
+        if (id < nd - 1) {
+            overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
+        }
        LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-        std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test,
-        if (mem_test[id] < targets[id]) {
+        std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
+        if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
            ngl_per_device = ngl_per_device_test;
+            overflow_bufts = overflow_bufts_test;
            mem = mem_test;
            id_dense_start = id_dense_start_test;
            LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -647,9 +657,10 @@
 
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-            mem_test = get_memory_for_layers(__func__, ngl_per_device_test,
-            if (mem_test[id] < targets[id]) {
+            mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
+            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                ngl_per_device = ngl_per_device_test;
+                overflow_bufts = overflow_bufts_test;
                mem = mem_test;
                id_dense_start = id_dense_start_test;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -658,9 +669,10 @@
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test,
-                if (mem_test[id] < targets[id]) {
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
+                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
+                    overflow_bufts = overflow_bufts_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -675,30 +687,41 @@
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }
 
+    // print info for devices that were not changed during the conversion from dense only to full layers:
+    for (size_t id = id_dense_start + 1; id < nd; id++) {
+        const int64_t projected_margin = dmds_full[id].free - mem[id];
+        LLAMA_LOG_INFO(
+            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+    }
+
    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
 }
 
-
+enum llama_params_fit_status llama_params_fit(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    const int64_t t0_us = llama_time_us();
-
+    llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
    try {
        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
-    } catch (const
+    } catch (const llama_params_fit_exception & e) {
        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
-
+        status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
+    } catch (const std::runtime_error & e) {
+        LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
+        status = LLAMA_PARAMS_FIT_STATUS_ERROR;
    }
    const int64_t t1_us = llama_time_us();
    LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
-    return
+    return status;
 }
 
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
-        /*.no_perf
+        /*.no_perf =*/ true,
    };
 
    return result;
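Taken together, these hunks add a `llama_params_fit()` entry point that probes free device memory, shrinks the context, and distributes layers before the model is loaded, reporting SUCCESS, FAILURE, or ERROR. Below is a hedged usage sketch only; it is not part of the diff, and the buffer sizing via `llama_max_devices()`/`llama_max_tensor_buft_overrides()` as well as the byte unit of `margin_s` are assumptions read off the code above.

    // Hedged usage sketch for the new llama_params_fit() shown above.
    // Helper names and units here are assumptions, not confirmed by this diff.
    #include "llama.h"

    #include <vector>

    bool fit_params_before_load(const char * path_model) {
        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();

        // output buffers the fitter may fill in (sizes assumed from the helpers used above)
        std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());

        const enum llama_params_fit_status status = llama_params_fit(
            path_model, &mparams, &cparams,
            tensor_split.data(), tbo.data(),
            /*margin_s  =*/ 1024u * 1024u * 1024u, // leave ~1 GiB free per device (unit assumed to be bytes)
            /*n_ctx_min =*/ 4096,
            GGML_LOG_LEVEL_INFO);

        // FAILURE leaves the params untouched but possibly too large; ERROR means probing itself failed.
        return status != LLAMA_PARAMS_FIT_STATUS_ERROR;
    }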
@@ -22,8 +22,15 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
 
    for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
        ggml_tensor * inpSA = inpL;
 
+        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+            (il + 1) % hparams.n_no_rope_layer_step != 0;
+
        // dual attention normalization (pre)
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
@@ -56,19 +63,16 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
        cb(Qcur, "Qcur_normed", il);
        cb(Kcur, "Kcur_normed", il);
 
-        // RoPE only for sliding_attention layers
-        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
-            ((il + 1) % hparams.n_no_rope_layer_step) != 0;
        if (use_rope) {
            Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);
            cb(Qcur, "Qcur_rope", il);
 
            Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);
            cb(Kcur, "Kcur_rope", il);
        }
@@ -142,11 +142,13 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);
        } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+            const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
+            auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
            cur = build_ffn(cur,
-                    model.layers[il].ffn_up,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-
+                    type_op, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            cur = build_ffn(cur,
@@ -3,12 +3,14 @@
 llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
-    float
+    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
 
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-    ggml_tensor *inpL
+    ggml_tensor * inpL;
+    ggml_tensor * cur;
+
    inpL = build_inp_embd(model.tok_embd);
 
    ggml_tensor * inp_pos = build_inp_pos();
@@ -44,7 +46,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
        }
 
        ggml_tensor * inpSA = inpL;
-        cur
+        cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
 
        // build self attention
        {
@@ -21,6 +21,9 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
 
    for (int il = 0; il < n_layer; ++il) {
        const bool is_swa = hparams.is_swa(il);
+        // UNUSED:
+        // const float freq_base_l = model.get_rope_freq_base (cparams, il);
+        // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
        // norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                    model.layers[il].ffn_exp_probs_b,
                    n_expert, n_expert_used,
                    LLM_FFN_SILU, hparams.expert_weights_norm,
-
+                    hparams.expert_weights_scale, hparams.expert_weights_scale,
                    (llama_expert_gating_func_type) hparams.expert_gating_func,
                    il);
            cb(moe_out, "ffn_moe_out", il);
@@ -1,7 +1,5 @@
 #include "models.h"
 
-
-
 llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -12,10 +10,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
    inpL = build_inp_embd(model.tok_embd);
 
    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-
-
-        cb(inpL, "inp_scaled", -1);
-    }
+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    cb(inpL, "inp_scaled", -1);
 
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();
@@ -19,6 +19,9 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
    ggml_tensor * inp_out_ids = build_inp_out_ids();
 
    for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
@@ -43,12 +46,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
 
        Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);
 
        Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);
 
        cb(Qcur, "Qcur", il);
@@ -10,10 +10,9 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
    inpL = build_inp_embd(model.tok_embd);
 
    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-
-
-
-    }
+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    cb(inpL, "inp_scaled", -1);
+
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();
 
@@ -1,7 +1,5 @@
 #include "models.h"
 
-
-
 llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params),
    model(model),
@@ -15,10 +13,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
    inpL = build_inp_embd(model.tok_embd);
 
    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-
-
-
-    }
+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    cb(inpL, "inp_scaled", -1);
+
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();
 
@@ -248,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
 // equivalent to get_per_layer_inputs() in python code
 // output shape: [n_embd_altup, n_layer, n_tokens]
 ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
-    auto
+    auto inp = std::make_unique<llm_graph_input_embd>();
    ggml_tensor * inp_per_layer;
    if (ubatch.token) {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
@@ -25,8 +25,12 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
    ggml_tensor * inp_out_ids = build_inp_out_ids();
 
    for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
        ggml_tensor * inpSA = inpL;
 
+        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
            (il + 1) % hparams.n_no_rope_layer_step != 0;
 
@@ -67,13 +71,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
        if (use_rope) {
            Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, rope_factors,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );
 
            Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, rope_factors,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );
        } else if (inp_attn_scale) {
@@ -1,6 +1,7 @@
 #include "models.h"
 
-
+template <bool embed>
+llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
 
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14,7 +15,14 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();
 
-
+    using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
+
+    inp_attn_type * inp_attn = nullptr;
+    if constexpr (embed) {
+        inp_attn = build_attn_inp_no_cache();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }
 
    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -145,11 +153,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
    cb(cur, "result_norm", -1);
    res->t_embd = cur;
 
-
-
+    if constexpr (!embed) {
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
 
-
-
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+    }
 
    ggml_build_forward_expand(gf, cur);
 }
+
+template struct llm_build_llama<false>;
+template struct llm_build_llama<true>;
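The final hunks turn `llm_build_llama` into a template over an `embed` flag: `std::conditional_t` selects the attention input type, `if constexpr` drops the lm_head for the embedding variant, and both variants are explicitly instantiated at the end of the file. A minimal, hedged sketch of that C++ pattern in isolation follows; the types and names are illustrative placeholders, not the graph classes from the diff.

    // Hedged illustration of the compile-time pattern used above: a bool template
    // parameter selects a member type, prunes a branch, and both variants are
    // explicitly instantiated at the end of the translation unit.
    #include <cstdio>
    #include <type_traits>

    struct attn_no_cache { const char * name = "no-cache attention"; };
    struct attn_kv       { const char * name = "kv-cache attention"; };

    template <bool embed>
    struct graph_builder {
        // pick the input type at compile time, like inp_attn_type in the diff
        using inp_attn_type = std::conditional_t<embed, attn_no_cache, attn_kv>;

        void build() const {
            inp_attn_type inp;
            std::printf("building with %s\n", inp.name);
            if constexpr (!embed) {
                std::printf("adding lm_head / logits output\n"); // skipped for the embedding variant
            }
        }
    };

    // explicit instantiation, mirroring `template struct llm_build_llama<false>;` etc.
    template struct graph_builder<false>;
    template struct graph_builder<true>;

    int main() {
        graph_builder<false>{}.build(); // generation graph: kv cache + lm_head
        graph_builder<true>{}.build();  // embedding graph: no cache, no lm_head
        return 0;
    }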