llama_cpp 0.12.3 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +22 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -2
- data/vendor/tmp/llama.cpp/Makefile +23 -4
- data/vendor/tmp/llama.cpp/ggml-alloc.c +85 -25
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +115 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +121 -86
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +706 -15
- data/vendor/tmp/llama.cpp/ggml-quants.h +17 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +350 -57
- data/vendor/tmp/llama.cpp/ggml.h +7 -1
- data/vendor/tmp/llama.cpp/llama.cpp +574 -39
- data/vendor/tmp/llama.cpp/llama.h +11 -15
- metadata +9 -2
The hunks below are from data/vendor/tmp/llama.cpp/llama.cpp:

@@ -11,6 +11,12 @@
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #  include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+#  include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+#  include "ggml-sycl.h"
+#elif defined(GGML_USE_KOMPUTE)
+#  include "ggml-kompute.h"
 #endif
 
 #ifdef GGML_USE_METAL
@@ -52,6 +58,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cfloat>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
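Note: the new <cfloat> include supplies FLT_MAX, which the reworked llama_sample_min_p further down uses to seed its running maximum over unsorted logits.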
@@ -196,6 +203,8 @@ enum llm_arch {
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
+    LLM_ARCH_ORION,
+    LLM_ARCH_INTERNLM2,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -217,6 +226,8 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_PHI2,      "phi2"      },
     { LLM_ARCH_PLAMO,     "plamo"     },
     { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION,     "orion"     },
+    { LLM_ARCH_INTERNLM2, "internlm2" },
 };
 
 enum llm_kv {
@@ -269,6 +280,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_PAD_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };
@@ -323,6 +335,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
    { LLM_KV_TOKENIZER_PAD_ID,     "tokenizer.ggml.padding_token_id" },
    { LLM_KV_TOKENIZER_ADD_BOS,    "tokenizer.ggml.add_bos_token"    },
    { LLM_KV_TOKENIZER_ADD_EOS,    "tokenizer.ggml.add_eos_token"    },
+   { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
    { LLM_KV_TOKENIZER_HF_JSON,    "tokenizer.huggingface.json"      },
    { LLM_KV_TOKENIZER_RWKV,       "tokenizer.rwkv.world"            },
 };
@@ -641,7 +654,42 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
-
+    {
+        LLM_ARCH_ORION,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_INTERNLM2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1132,10 +1180,10 @@ struct llama_mlock {
 #ifdef __APPLE__
 #define MLOCK_SUGGESTION \
     "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
 #else
 #define MLOCK_SUGGESTION \
-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+    "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
 #endif
 
     bool raw_lock(const void * addr, size_t size) const {
@@ -1256,8 +1304,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     if (host_buffer) {
         buft = ggml_backend_cuda_host_buffer_type();
     }
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+    if (host_buffer) {
+        buft = ggml_backend_vk_host_buffer_type();
+    }
 #endif
 
     if (buft == nullptr) {
@@ -1275,8 +1329,17 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUBLAS)
     buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type();
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
 #endif
 
     if (buft == nullptr) {
@@ -1332,7 +1395,9 @@ enum e_model {
     MODEL_7B,
     MODEL_8B,
     MODEL_13B,
+    MODEL_14B,
     MODEL_15B,
+    MODEL_20B,
     MODEL_30B,
     MODEL_34B,
     MODEL_40B,
@@ -1574,6 +1639,8 @@ struct llama_vocab {
     id special_suffix_id = 32008;
     id special_eot_id    = 32010;
 
+    bool add_space_prefix = true;
+
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
         GGML_ASSERT(token_left.find(' ') == std::string::npos);
         GGML_ASSERT(token_left.find('\n') == std::string::npos);
@@ -2323,6 +2390,7 @@ struct llama_model_loader {
            case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
            case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
+           case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
            default:
                {
                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2668,9 +2736,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
-       case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+       case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
+       case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
 
        default: return "unknown, may not work";
    }
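The fractional bits-per-weight labels are plain block arithmetic (assuming the standard QK_K = 256 super-block): IQ3_XXS packs 256 weights into 98 bytes, i.e. 98 * 8 / 256 = 3.0625 bpw, just as IQ2_XXS (66 bytes -> 2.0625 bpw) and IQ2_XS (74 bytes -> 2.3125 bpw) before it.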
@@ -2683,7 +2752,9 @@ static const char * llama_model_type_name(e_model type) {
        case MODEL_7B:  return "7B";
        case MODEL_8B:  return "8B";
        case MODEL_13B: return "13B";
+       case MODEL_14B: return "14B";
        case MODEL_15B: return "15B";
+       case MODEL_20B: return "20B";
        case MODEL_30B: return "30B";
        case MODEL_34B: return "34B";
        case MODEL_40B: return "40B";
@@ -2696,6 +2767,14 @@ static const char * llama_model_type_name(e_model type) {
        default: return "?B";
    }
 }
+static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        default:                   return "unknown";
+    }
+}
+
 
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
@@ -2950,7 +3029,24 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_ORION:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_14B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_INTERNLM2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 48: model.type = e_model::MODEL_20B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
        default: (void)0;
    }
 
@@ -3002,6 +3098,11 @@ static void llm_load_vocab(
            vocab.special_unk_id = 0;
            vocab.special_sep_id = -1;
            vocab.special_pad_id = -1;
+
+           const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+           if (add_space_prefix_keyidx != -1) {
+               vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+           } // The default value of add_space_prefix is true.
        } else if (tokenizer_name == "gpt2") {
            vocab.type = LLAMA_VOCAB_TYPE_BPE;
 
|
|
3214
3315
|
// hparams
|
3215
3316
|
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
3216
3317
|
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
|
3217
|
-
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type
|
3318
|
+
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
|
3218
3319
|
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
3219
3320
|
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
3220
3321
|
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
@@ -3933,6 +4034,65 @@ static bool llm_load_tensors(
|
|
3933
4034
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
3934
4035
|
}
|
3935
4036
|
} break;
|
4037
|
+
case LLM_ARCH_ORION:
|
4038
|
+
{
|
4039
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4040
|
+
{
|
4041
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4042
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
4043
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
4044
|
+
}
|
4045
|
+
for (int i = 0; i < n_layer; ++i) {
|
4046
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4047
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4048
|
+
|
4049
|
+
auto & layer = model.layers[i];
|
4050
|
+
|
4051
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4052
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
4053
|
+
|
4054
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
4055
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
4056
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
4057
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4058
|
+
|
4059
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4060
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
4061
|
+
|
4062
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4063
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4064
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4065
|
+
}
|
4066
|
+
} break;
|
4067
|
+
case LLM_ARCH_INTERNLM2:
|
4068
|
+
{
|
4069
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4070
|
+
|
4071
|
+
// output
|
4072
|
+
{
|
4073
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4074
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
4075
|
+
}
|
4076
|
+
|
4077
|
+
for (int i = 0; i < n_layer; ++i) {
|
4078
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4079
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4080
|
+
|
4081
|
+
auto & layer = model.layers[i];
|
4082
|
+
|
4083
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4084
|
+
// layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
4085
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
4086
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
4087
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
4088
|
+
|
4089
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4090
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4091
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4092
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4093
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4094
|
+
}
|
4095
|
+
} break;
|
3936
4096
|
default:
|
3937
4097
|
throw std::runtime_error("unknown architecture");
|
3938
4098
|
}
|
@@ -4029,7 +4189,7 @@ static bool llm_load_tensors(
|
|
4029
4189
|
}
|
4030
4190
|
|
4031
4191
|
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
4032
|
-
static int llama_model_load(const std::string & fname, llama_model & model,
|
4192
|
+
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
4033
4193
|
try {
|
4034
4194
|
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
4035
4195
|
|
@@ -4050,6 +4210,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
        return 0;
    }
 
+#ifdef GGML_USE_KOMPUTE
+    if (params.n_gpu_layers > 0 && (
+        !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
+        || !(
+            model.ftype == LLAMA_FTYPE_ALL_F32 ||
+            model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+            model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+        )
+    )) {
+        // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
+        LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
+        params.n_gpu_layers = 0;
+    }
+#endif
+
    if (!llm_load_tensors(
        ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
        params.progress_callback, params.progress_callback_user_data
@@ -6366,6 +6542,245 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_orion() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                // if (model.layers[il].bq) {
+                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                //     cb(Qcur, "Qcur", il);
+                // }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                // if (model.layers[il].bk) {
+                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                //     cb(Kcur, "Kcur", il);
+                // }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                // if (model.layers[il].bv) {
+                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                //     cb(Vcur, "Vcur", il);
+                // }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_internlm2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
 };
 
 static struct ggml_cgraph * llama_build_graph(
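Side by side, the two new graph builders differ exactly where the architectures do: build_orion() uses classic layernorm (LLM_NORM) with bias tensors and rope mode 2, and leaves the Q/K/V bias additions commented out, while build_internlm2() uses RMS norm (LLM_NORM_RMS) and rope mode 0, and applies the optional bq/bk/bv biases plus the attention-output bias bo when present.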
@@ -6520,6 +6935,14 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_codeshell();
            } break;
+        case LLM_ARCH_ORION:
+            {
+                result = llm.build_orion();
+            } break;
+        case LLM_ARCH_INTERNLM2:
+            {
+                result = llm.build_internlm2();
+            } break;
        default:
            GGML_ASSERT(false);
    }
@@ -6651,11 +7074,6 @@ static int llama_decode_internal(
        n_threads = std::min(4, n_threads);
    }
 
-    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
-        n_threads = 1;
-    }
-
 #ifdef GGML_USE_MPI
    const int64_t n_layer = hparams.n_layer;
    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -7467,7 +7885,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
                //
                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
                if (&fragment == &fragment_buffer.front()) {
-                    raw_text = " " + raw_text; // prefix with space if the first token is not special
+                    if (vocab.add_space_prefix) {
+                        raw_text = " " + raw_text; // prefix with space if the first token is not special
+                    }
                }
 
 #ifdef PRETOKENIZERDEBUG
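A hedged sketch of how the new flag surfaces through the public API (illustrative only, not part of the diff; it assumes the llama.h shipped with this vendored revision, where llama_tokenize takes model, text, text_len, tokens, n_max_tokens, add_bos, special):

// Illustration (assumptions noted above): for a SentencePiece model whose GGUF
// metadata sets tokenizer.ggml.add_space_prefix = false, the first fragment is no
// longer rewritten as " " + text before tokenization; with the key absent the
// default stays true and output is unchanged.
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>

static void dump_tokens(const llama_model * model, const char * text) {
    std::vector<llama_token> tokens(32);
    const int32_t n = llama_tokenize(model, text, (int32_t) strlen(text),
                                     tokens.data(), (int32_t) tokens.size(),
                                     /*add_bos*/ false, /*special*/ false);
    printf("\"%s\" ->", text);
    for (int32_t i = 0; i < n; ++i) {
        printf(" %d", tokens[i]);
    }
    printf("\n");
}

int main(int argc, char ** argv) {
    if (argc < 2) return 1;
    llama_backend_init(false); // this revision still takes the numa flag
    llama_model * model = llama_load_model_from_file(argv[1], llama_model_default_params());
    if (model == nullptr) return 1;
    dump_tokens(model, "Hello"); // token ids differ depending on add_space_prefix
    llama_free_model(model);
    llama_backend_free();
    return 0;
}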
@@ -7946,6 +8366,11 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
 }
 
 void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+    // if (k >= (int32_t)candidates->size) {
+    //     return;
+    // }
+
    const int64_t t_start_sample_us = ggml_time_us();
 
    k = std::max(k, (int) min_keep);
@@ -8054,21 +8479,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
        return;
    }
 
-    llama_sample_softmax(ctx, candidates);
-
    const int64_t t_start_sample_us = ggml_time_us();
 
-    float scale = candidates->data[0].p; // scale by max prob
-    size_t i = 1; // first token always matches
+    bool min_p_applied = false;
+
+    // if the candidates aren't sorted, try the unsorted implementation first
+    if (!candidates->sorted) {
+        std::vector<llama_token_data> filtered_tokens;
+
+        float max_logit = -FLT_MAX;
+        for (size_t i = 0; i < candidates->size; ++i) {
+            max_logit = std::max(max_logit, candidates->data[i].logit);
+        }
+        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
+
+        for (size_t i = 0; i < candidates->size; ++i) {
+            if (candidates->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(candidates->data[i]);
+            }
+        }
 
-    for (; i < candidates->size; ++i) {
-        if (candidates->data[i].p < p * scale && i >= min_keep) {
-            break; // prob too small
+        // if we have enough values the operation was a success
+        if (filtered_tokens.size() >= min_keep) {
+            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            candidates->size = filtered_tokens.size();
+            min_p_applied = true;
        }
    }
 
-    // Resize the output vector to keep only the matching tokens
-    candidates->size = i;
+    // if the candidates are sorted or the unsorted implementation failed, use this implementation
+    if (!min_p_applied) {
+        // Sort the logits in descending order
+        if (!candidates->sorted) {
+            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+            candidates->sorted = true;
+        }
+
+        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+        size_t i = 1; // first token always matches
+
+        for (; i < candidates->size; ++i) {
+            if (candidates->data[i].logit < min_logit && i >= min_keep) {
+                break; // prob too small
+            }
+        }
+
+        // Resize the output vector to keep only the matching tokens
+        candidates->size = i;
+    }
 
    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
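The unsorted fast path above works because softmax is order-preserving and its normalizer cancels: p_i >= p * p_max is equivalent to exp(l_i)/Z >= p * exp(l_max)/Z, i.e. l_i >= l_max + log p, so no softmax call is needed before filtering. A minimal standalone check of that identity (illustration only, not from the diff):

#include <cmath>
#include <cstdio>

int main() {
    const float logits[] = {2.0f, 0.5f, -1.0f, -3.0f};
    const float p = 0.1f; // min-p threshold
    float lmax = logits[0], z = 0.0f;
    for (float l : logits) lmax = l > lmax ? l : lmax;
    for (float l : logits) z += expf(l);          // softmax normalizer
    const float min_logit = lmax + logf(p);       // same bound as the patch uses
    for (float l : logits) {
        const bool keep_prob  = (expf(l) / z) >= p * (expf(lmax) / z); // probability form
        const bool keep_logit = l >= min_logit;                        // logit form
        printf("l=%6.2f  keep_prob=%d  keep_logit=%d\n", l, keep_prob, keep_logit);
    }
    return 0;
}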
@@ -8972,6 +9432,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
        else if (new_type != GGML_TYPE_Q8_0) {
            new_type = GGML_TYPE_Q6_K;
        }
+    } else if (name == "token_embd.weight") {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_Q4_K;
+        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
@@ -8982,7 +9449,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
            ++qs.i_ffn_down;
        }
-        else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -8990,6 +9456,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
@@ -9027,6 +9496,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
        }
+        //else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+        //    if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
+        //}
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
@@ -9058,13 +9530,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
            if (qs.model.hparams.n_expert == 8) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                    new_type = GGML_TYPE_Q5_K;
                }
            } else {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
            }
|
|
9107
9580
|
bool convert_incompatible_tensor = false;
|
9108
9581
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
9109
9582
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
9110
|
-
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS
|
9583
|
+
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
9584
|
+
new_type == GGML_TYPE_IQ3_XXS) {
|
9111
9585
|
int nx = tensor->ne[0];
|
9112
9586
|
int ny = tensor->ne[1];
|
9113
9587
|
if (nx % QK_K != 0) {
|
@@ -9121,6 +9595,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
        switch (new_type) {
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -9162,6 +9637,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
 
        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@@ -9812,18 +10288,45 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
    return result;
 }
 
-int32_t llama_max_devices(void) {
-    return LLAMA_MAX_DEVICES;
+size_t llama_max_devices(void) {
+#if defined(GGML_USE_METAL)
+    return 1;
+#elif defined(GGML_USE_CUBLAS)
+    return GGML_CUDA_MAX_DEVICES;
+#elif defined(GGML_USE_SYCL)
+    return GGML_SYCL_MAX_DEVICES;
+#else
+    return 1;
+#endif
 }
 
-bool llama_mmap_supported(void) {
+bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
 }
 
-bool llama_mlock_supported(void) {
+bool llama_supports_mlock(void) {
    return llama_mlock::SUPPORTED;
 }
 
+bool llama_supports_gpu_offload(void) {
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+    return true;
+#else
+    return false;
+#endif
+}
+
+// deprecated:
+bool llama_mmap_supported(void) {
+    return llama_supports_mmap();
+}
+
+bool llama_mlock_supported(void) {
+    return llama_supports_mlock();
+}
+
 void llama_backend_init(bool numa) {
    ggml_time_init();
 
@@ -9855,8 +10358,8 @@ int64_t llama_time_us(void) {
 }
 
 struct llama_model * llama_load_model_from_file(
-                             const char * path_model,
-            struct llama_model_params   params) {
+        const char * path_model,
+        struct llama_model_params params) {
    ggml_time_init();
 
    llama_model * model = new llama_model;
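A short usage sketch of the renamed capability probes introduced above (illustrative, not from the diff; the old names remain as deprecated wrappers):

#include "llama.h"
#include <cstdio>

int main() {
    printf("max devices : %zu\n", llama_max_devices());          // now returns size_t
    printf("mmap        : %d\n",  llama_supports_mmap());        // was llama_mmap_supported()
    printf("mlock       : %d\n",  llama_supports_mlock());       // was llama_mlock_supported()
    printf("gpu offload : %d\n",  llama_supports_gpu_offload()); // new runtime probe
    return 0;
}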
@@ -9997,6 +10500,36 @@ struct llama_context * llama_new_context_with_model(
            }
        }
    }
+#elif defined(GGML_USE_VULKAN)
+    if (model->n_gpu_layers > 0) {
+        ggml_backend_t backend = ggml_backend_vk_init();
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
+#elif defined(GGML_USE_SYCL)
+    if (model->n_gpu_layers > 0) {
+        ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
+#elif defined(GGML_USE_KOMPUTE)
+    if (model->n_gpu_layers > 0) {
+        auto * backend = ggml_backend_kompute_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
 #endif
    ctx->backend_cpu = ggml_backend_cpu_init();
    if (ctx->backend_cpu == nullptr) {
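All three new context-level backend initializations follow the existing CUDA path's pattern: only one GGML_USE_* backend is compiled in at a time (hence the #elif chain), each is attempted only when n_gpu_layers > 0, and a failed init tears down the context and returns nullptr rather than silently falling back.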
@@ -10844,22 +11377,24 @@ struct llama_batch llama_batch_get_one(
    };
 }
 
-struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
+struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
 
    if (embd) {
-        batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
    } else {
-        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
    }
 
-    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens);
-    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens);
-    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
-    for (int i = 0; i < n_tokens; ++i) {
+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens_alloc);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens_alloc);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
+    for (int i = 0; i < n_tokens_alloc; ++i) {
        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
    }
-    batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
+    batch.seq_id[n_tokens_alloc] = nullptr;
+
+    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens_alloc);
 
    return batch;
 }
@@ -10870,7 +11405,7 @@ void llama_batch_free(struct llama_batch batch) {
    if (batch.pos)      free(batch.pos);
    if (batch.n_seq_id) free(batch.n_seq_id);
    if (batch.seq_id) {
-        for (int i = 0; i < batch.n_tokens; ++i) {
+        for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
            free(batch.seq_id[i]);
        }
        free(batch.seq_id);
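Taken together, the last two hunks close a leak: llama_batch_init now over-allocates seq_id by one slot and null-terminates it, and llama_batch_free walks to that sentinel instead of trusting a token count that callers routinely overwrite. A hedged usage sketch (illustrative, not from the diff):

#include "llama.h"

int main() {
    // room for up to 512 tokens, token ids rather than embeddings (embd == 0),
    // and one sequence id per token
    llama_batch batch = llama_batch_init(/*n_tokens_alloc*/ 512, /*embd*/ 0, /*n_seq_max*/ 1);

    // callers typically fill token/pos/seq_id/logits and shrink n_tokens per decode;
    // with the sentinel, doing so no longer strands the remaining seq_id rows
    batch.n_tokens = 1;

    llama_batch_free(batch); // frees every row up to the nullptr sentinel
    return 0;
}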