llama_cpp 0.10.2 → 0.10.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-backend.c +6 -10
- data/ext/llama_cpp/src/ggml-cuda.cu +510 -372
- data/ext/llama_cpp/src/ggml-quants.c +25 -344
- data/ext/llama_cpp/src/ggml.c +7 -8
- data/ext/llama_cpp/src/ggml.h +2 -0
- data/ext/llama_cpp/src/llama.cpp +432 -39
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -198,6 +198,7 @@ enum llm_arch {
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
     LLM_ARCH_PHI2,
+    LLM_ARCH_PLAMO,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -216,6 +217,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
     { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PLAMO, "plamo" },
 };
 
 enum llm_kv {
@@ -352,6 +354,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
+    LLM_TENSOR_FFN_ACT,
     LLM_TENSOR_FFN_DOWN_EXP,
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
@@ -420,6 +423,15 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         LLM_ARCH_GPT2,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
     {
@@ -471,6 +483,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
         },
     },
     {
@@ -567,6 +580,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PLAMO,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -778,7 +809,7 @@ struct llama_file {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::runtime_error(
+            throw std::runtime_error("unexpectedly reached end of file");
         }
     }
 
@@ -931,22 +962,22 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file,
-        (
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
+        GGML_UNUSED(numa);
 
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
 
         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();
 
         if (hMapping == NULL) {
+            DWORD error = GetLastError();
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }
 
         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
+        DWORD error = GetLastError();
         CloseHandle(hMapping);
 
         if (addr == NULL) {
@@ -954,7 +985,7 @@ struct llama_mmap {
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
             WIN32_MEMORY_RANGE_ENTRY range;
             range.VirtualAddress = addr;
@@ -977,26 +1008,26 @@ struct llama_mmap {
 
     ~llama_mmap() {
         if (!UnmapViewOfFile(addr)) {
-
+            LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
     }
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file * file,
-        (
-        (
-        (
+    llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
+        GGML_UNUSED(file);
+        GGML_UNUSED(prefetch);
+        GGML_UNUSED(numa);
 
-        throw std::runtime_error(
+        throw std::runtime_error("mmap not supported");
     }
 
-    void
-        (
-        (
+    void unmap_fragment(size_t first, size_t last) {
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);
 
-        throw std::runtime_error(
+        throw std::runtime_error("mmap not supported");
     }
 #endif
 };
@@ -1172,21 +1203,27 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 }
 
 static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-
+        buft = ggml_backend_metal_buffer_type();
     }
 #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
     if (n_gpu_layers > 0) {
-
+        buft = ggml_backend_cuda_buffer_type(0);
     }
 #elif defined(GGML_USE_CUBLAS)
-
+    buft = ggml_backend_cuda_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
-
+    buft = ggml_backend_cpu_hbm_buffer_type();
 #endif
 
-
+    if (buft == nullptr) {
+        buft = ggml_backend_cpu_buffer_type();
+    }
+
+    return buft;
 
     GGML_UNUSED(n_gpu_layers);
 }
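The rewritten `llama_default_buffer_type` above collects the backend choice into a single `buft` variable and falls back to the CPU buffer type whenever no GPU or host backend was compiled in or selected. A minimal standalone sketch of that control flow, not llama.cpp code: `USE_FAKE_GPU` and the string values are hypothetical stand-ins for real ggml buffer types.

```cpp
// Sketch of the "preferred backend, else CPU fallback" pattern (assumptions labeled).
#include <cstdio>

typedef const char * buffer_type_t;    // stand-in for ggml_backend_buffer_type_t

static buffer_type_t pick_buffer_type(int n_gpu_layers) {
    buffer_type_t buft = nullptr;

#ifdef USE_FAKE_GPU                     // hypothetical build flag
    if (n_gpu_layers > 0) {
        buft = "gpu";                   // preferred backend when available
    }
#else
    (void) n_gpu_layers;                // unused without the flag
#endif

    if (buft == nullptr) {
        buft = "cpu";                   // guaranteed fallback, like ggml_backend_cpu_buffer_type()
    }
    return buft;
}

int main() {
    std::printf("0 layers  -> %s\n", pick_buffer_type(0));
    std::printf("33 layers -> %s\n", pick_buffer_type(33));
    return 0;
}
```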
@@ -1223,6 +1260,10 @@ enum e_model {
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
+    MODEL_SMALL,
+    MODEL_MEDIUM,
+    MODEL_LARGE,
+    MODEL_XL,
 };
 
 static const size_t kiB = 1024;
@@ -1254,6 +1295,7 @@ struct llama_hparams {
     float f_clamp_kqv;
     float f_max_alibi_bias;
 
+
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
         if (this->n_vocab != other.n_vocab) return true;
@@ -1270,7 +1312,7 @@ struct llama_hparams {
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
-        const float EPSILON = 1e-
+        const float EPSILON = 1e-9f;
 
         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
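The hparams inequality check now pins `EPSILON` to `1e-9f` and compares the float fields through `is_float_close`. Below is a small self-contained sketch of an absolute-tolerance comparison of that kind; it is not copied from llama.cpp and the sample values are made up.

```cpp
// Absolute-tolerance float comparison sketch (assumption: this mirrors the intent
// of is_float_close, not its exact implementation).
#include <cmath>
#include <cstdio>

static bool float_close(float a, float b, float abs_tol) {
    if (a == b) {
        return true;               // exact matches, including identical infinities
    }
    if (std::isinf(a) || std::isinf(b)) {
        return false;              // one finite and one infinite value never match
    }
    return std::fabs(a - b) <= abs_tol;
}

int main() {
    const float EPSILON = 1e-9f;
    std::printf("%d\n", float_close(1e-5f, 1.0000001e-5f, EPSILON)); // 1: difference ~1e-12
    std::printf("%d\n", float_close(1e-5f, 1.1e-5f,       EPSILON)); // 0: difference ~1e-6
    return 0;
}
```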
@@ -1357,6 +1399,7 @@ struct llama_layer {
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3
+    struct ggml_tensor * ffn_act;
 };
 
 struct llama_kv_cell {
@@ -2517,18 +2560,22 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_1B:
-        case MODEL_3B:
-        case MODEL_7B:
-        case MODEL_8B:
-        case MODEL_13B:
-        case MODEL_15B:
-        case MODEL_30B:
-        case MODEL_34B:
-        case MODEL_40B:
-        case MODEL_65B:
-        case MODEL_70B:
-
+        case MODEL_1B: return "1B";
+        case MODEL_3B: return "3B";
+        case MODEL_7B: return "7B";
+        case MODEL_8B: return "8B";
+        case MODEL_13B: return "13B";
+        case MODEL_15B: return "15B";
+        case MODEL_30B: return "30B";
+        case MODEL_34B: return "34B";
+        case MODEL_40B: return "40B";
+        case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
+        case MODEL_SMALL: return "0.1B";
+        case MODEL_MEDIUM: return "0.4B";
+        case MODEL_LARGE: return "0.8B";
+        case MODEL_XL: return "1.5B";
+        default: return "?B";
     }
 }
 
@@ -2738,6 +2785,26 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_PLAMO:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GPT2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 12: model.type = e_model::MODEL_SMALL; break;
+                    case 24: model.type = e_model::MODEL_MEDIUM; break;
+                    case 36: model.type = e_model::MODEL_LARGE; break;
+                    case 48: model.type = e_model::MODEL_XL; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
 
         default: (void)0;
     }
@@ -3431,7 +3498,6 @@ static bool llm_load_tensors(
         case LLM_ARCH_MPT:
             {
                 model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-
                 // output
                 {
                     ggml_backend_type backend_norm;
@@ -3469,6 +3535,9 @@ static bool llm_load_tensors(
 
                     layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    // AWQ ScaleActivation layer
+                    layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
                 }
             } break;
         case LLM_ARCH_STABLELM:
@@ -3619,6 +3688,105 @@ static bool llm_load_tensors(
                     layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
                 }
             } break;
+        case LLM_ARCH_PLAMO:
+            {
+                model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        backend_norm = llama_backend_offload;
+                        backend_output = llama_backend_offload_split;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                }
+            } break;
+        case LLM_ARCH_GPT2:
+            {
+                model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        backend_norm = llama_backend_offload;
+                        backend_output = llama_backend_offload_split;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+
+                    layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+                }
+            } break;
 
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -3954,6 +4122,7 @@ static struct ggml_tensor * llm_build_ffn(
         struct ggml_tensor * gate_b,
         struct ggml_tensor * down,
         struct ggml_tensor * down_b,
+        struct ggml_tensor * act_scales,
         llm_ffn_op_type type_op,
         llm_ffn_gate_type type_gate,
         const llm_build_cb & cb,
@@ -3998,6 +4167,10 @@ static struct ggml_tensor * llm_build_ffn(
             {
                 cur = ggml_gelu(ctx, cur);
                 cb(cur, "ffn_gelu", il);
+                if (act_scales != NULL) {
+                    cur = ggml_div(ctx, cur, act_scales);
+                    cb(cur, "ffn_act", il);
+                }
            } break;
         case LLM_FFN_RELU:
             {
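For MPT checkpoints converted with AWQ, the loader above picks up an optional per-layer `ffn_act` scales tensor (the trailing `false` appears to mark it as optional), and `llm_build_ffn` now divides the GELU output element-wise by those scales. Here is a standalone sketch of the same arithmetic on plain vectors, not ggml code; the values and the tanh-based GELU approximation are only for illustration.

```cpp
// gelu(x) followed by element-wise division by per-channel scales, mirroring
// what ggml_div(ctx, cur, act_scales) adds to the graph when ffn_act is loaded.
#include <cmath>
#include <cstdio>
#include <vector>

static float gelu(float x) {
    // common tanh approximation of GELU (assumed close to what ggml_gelu computes)
    const float c = 0.79788456080286535588f; // sqrt(2/pi)
    return 0.5f * x * (1.0f + std::tanh(c * (x + 0.044715f * x * x * x)));
}

int main() {
    std::vector<float> cur        = { -1.0f, 0.5f, 2.0f };  // FFN hidden activations (made up)
    std::vector<float> act_scales = {  1.2f, 0.8f, 1.0f };  // AWQ ScaleActivation ("blk.%d.ffn.act")

    for (size_t i = 0; i < cur.size(); ++i) {
        cur[i] = gelu(cur[i]) / act_scales[i];               // scaled activation
    }
    for (float v : cur) std::printf("%f\n", v);
    return 0;
}
```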
@@ -4316,6 +4489,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             } else {
@@ -4495,6 +4669,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -4609,6 +4784,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         NULL, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -4713,6 +4889,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -4917,6 +5094,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                         LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5003,6 +5181,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5098,6 +5277,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5183,11 +5363,11 @@ struct llm_build_context {
                         NULL,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
-
                 cur = llm_build_ffn(ctx0, cur,
                         model.layers[il].ffn_up, NULL,
                         NULL, NULL,
                         model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5296,6 +5476,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5408,6 +5589,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5515,6 +5697,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(ffn_output, "ffn_out", il);
             }
@@ -5544,6 +5727,206 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_plamo() {
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            struct ggml_tensor * attention_norm = cur;
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                        n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                        n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+            struct ggml_tensor * sa_out = cur;
+
+            cur = attention_norm;
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cb(cur, "l_out", il);
+
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_gpt2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * pos;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+        cb(pos, "pos_embd", -1);
+
+        inpL = ggml_add(ctx0, inpL, pos);
+        cb(inpL, "inpL", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+                        NULL, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 //
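`build_gpt2` runs a single fused QKV projection and then carves Q, K and V out of it with `ggml_view_2d`, using offsets of 0, `n_embd`, and `n_embd + n_embd_gqa` within each row. A standalone illustration of that slicing on a plain buffer follows; the sizes are made up and plain pointers stand in for ggml views.

```cpp
// One row of length n_embd + 2*n_embd_gqa holds Q, then K, then V; the three
// views above just select those column ranges for every token.
#include <cstdio>
#include <vector>

int main() {
    const int n_embd     = 4;               // query width (illustrative)
    const int n_embd_gqa = 4;               // key/value width; equals n_embd for GPT-2 (no GQA)
    const int row        = n_embd + 2 * n_embd_gqa;

    std::vector<float> qkv(row);
    for (int i = 0; i < row; ++i) qkv[i] = float(i);   // pretend this is wqkv*x + bqkv for one token

    const float * q = qkv.data();                       // offset 0
    const float * k = qkv.data() + n_embd;              // offset n_embd
    const float * v = qkv.data() + n_embd + n_embd_gqa; // offset n_embd + n_embd_gqa

    std::printf("Q starts at %g, K at %g, V at %g\n", q[0], k[0], v[0]); // 0, 4, 8
    return 0;
}
```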
@@ -5699,6 +6082,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "ffn_gate", OFFLOAD_FUNC },
     { "ffn_gate_b", OFFLOAD_FUNC },
     { "ffn_gate_par", OFFLOAD_FUNC },
+    { "ffn_act", OFFLOAD_FUNC },
     { "ffn_down", OFFLOAD_FUNC },
     { "ffn_down_b", OFFLOAD_FUNC },
     { "ffn_out", OFFLOAD_FUNC },
@@ -6054,6 +6438,14 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_phi2();
             } break;
+        case LLM_ARCH_PLAMO:
+            {
+                result = llm.build_plamo();
+            } break;
+        case LLM_ARCH_GPT2:
+            {
+                result = llm.build_gpt2();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -9327,7 +9719,8 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
 #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         if (model->n_gpu_layers > 0) {
-
+            // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
+            ggml_cuda_set_scratch_size(alloc_size + 64);
             LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
             // calculate total VRAM usage
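The new comment spells out why the CUDA scratch buffer gets 64 extra bytes: the CPU-side allocation is padded so its base can be shifted up to an aligned address, and the GPU buffer must be at least as large for the same tensor offsets to stay in bounds. A small sketch of that round-up-to-alignment idea is below; the alignment value is illustrative, not a ggml constant.

```cpp
// If malloc returns an unaligned pointer, the usable base is bumped up to the next
// aligned address, so the allocation must carry up to (alignment - 1) spare bytes
// for the original offsets to remain valid.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
    const size_t alignment = 64;                       // illustrative alignment
    const size_t data_size = 1000;

    void * raw = std::malloc(data_size + alignment);   // over-allocate, like alloc_size + 64
    uintptr_t p = reinterpret_cast<uintptr_t>(raw);
    uintptr_t aligned = (p + alignment - 1) & ~(uintptr_t)(alignment - 1);

    std::printf("raw=%p aligned=%p shift=%zu\n",
                raw, reinterpret_cast<void *>(aligned), (size_t)(aligned - p));
    std::free(raw);
    return 0;
}
```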
@@ -10289,7 +10682,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
             std::string result = model->vocab.id_to_token[token].text;
             llama_unescape_whitespace(result);
             if (length < (int) result.length()) {
-                return -result.length();
+                return -(int) result.length();
             }
             memcpy(buf, result.c_str(), result.length());
             return result.length();
@@ -10319,7 +10712,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
             std::string result = model->vocab.id_to_token[token].text;
             result = llama_decode_text(result);
             if (length < (int) result.length()) {
-                return -result.length();
+                return -(int) result.length();
             }
             memcpy(buf, result.c_str(), result.length());
             return result.length();
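The `llama_token_to_piece` fix adds a cast because `std::string::length()` returns `size_t`: negating it directly is performed in unsigned arithmetic and yields a huge value, and the old code relied on the implicit narrowing conversion back to `int` (which also draws compiler warnings). The cast makes the negation signed and explicit, as this tiny demonstration shows.

```cpp
// -result.length() negates an unsigned size_t; -(int) result.length() computes
// the intended negative "buffer too small" return value directly.
#include <cstdio>
#include <string>

int main() {
    std::string result = "hello";

    size_t unsigned_neg = -result.length();        // 2^64 - 5 on a 64-bit platform
    int    signed_neg   = -(int) result.length();  // -5, what the caller expects

    std::printf("-result.length()       as size_t: %zu\n", unsigned_neg);
    std::printf("-(int) result.length() as int:    %d\n", signed_neg);
    return 0;
}
```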