llama_cpp 0.15.0 → 0.15.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -7
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +303 -23
- data/vendor/tmp/llama.cpp/ggml-impl.h +84 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +137 -133
- data/vendor/tmp/llama.cpp/ggml-metal.metal +87 -110
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +2220 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +35 -152
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +953 -268
- data/vendor/tmp/llama.cpp/ggml.c +1762 -681
- data/vendor/tmp/llama.cpp/ggml.h +43 -24
- data/vendor/tmp/llama.cpp/llama.cpp +533 -296
- data/vendor/tmp/llama.cpp/llama.h +10 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -1637
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +286 -176
- data/vendor/tmp/llama.cpp/unicode.h +44 -10
- metadata +4 -2
data/vendor/tmp/llama.cpp/llama.cpp:

@@ -7,6 +7,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#ifdef GGML_USE_RPC
+#  include "ggml-rpc.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -205,6 +209,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -228,39 +233,40 @@ enum llm_arch {
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,           "llama"      },
-    { LLM_ARCH_FALCON,          "falcon"     },
-    { LLM_ARCH_GROK,            "grok"       },
-    { LLM_ARCH_GPT2,            "gpt2"       },
-    { LLM_ARCH_GPTJ,            "gptj"       },
-    { LLM_ARCH_GPTNEOX,         "gptneox"    },
-    { LLM_ARCH_MPT,             "mpt"        },
-    { LLM_ARCH_BAICHUAN,        "baichuan"   },
-    { LLM_ARCH_STARCODER,       "starcoder"  },
-    { LLM_ARCH_PERSIMMON,       "persimmon"  },
-    { LLM_ARCH_REFACT,          "refact"     },
-    { LLM_ARCH_BERT,            "bert"       },
-    { LLM_ARCH_NOMIC_BERT,      "nomic-bert" },
-    { LLM_ARCH_BLOOM,           "bloom"      },
-    { LLM_ARCH_STABLELM,        "stablelm"   },
-    { LLM_ARCH_QWEN,            "qwen"       },
-    { LLM_ARCH_QWEN2,           "qwen2"      },
-    { LLM_ARCH_QWEN2MOE,        "qwen2moe"   },
-    { LLM_ARCH_PHI2,            "phi2"       },
-    { LLM_ARCH_PHI3,            "phi3"       },
-    { LLM_ARCH_PLAMO,           "plamo"      },
-    { LLM_ARCH_CODESHELL,       "codeshell"  },
-    { LLM_ARCH_ORION,           "orion"      },
-    { LLM_ARCH_INTERNLM2,       "internlm2"  },
-    { LLM_ARCH_MINICPM,         "minicpm"    },
-    { LLM_ARCH_GEMMA,           "gemma"      },
-    { LLM_ARCH_STARCODER2,      "starcoder2" },
-    { LLM_ARCH_MAMBA,           "mamba"      },
-    { LLM_ARCH_XVERSE,          "xverse"     },
-    { LLM_ARCH_COMMAND_R,       "command-r"  },
-    { LLM_ARCH_DBRX,            "dbrx"       },
-    { LLM_ARCH_OLMO,            "olmo"       },
-    { LLM_ARCH_UNKNOWN,         "(unknown)"  },
+    { LLM_ARCH_LLAMA,           "llama"        },
+    { LLM_ARCH_FALCON,          "falcon"       },
+    { LLM_ARCH_GROK,            "grok"         },
+    { LLM_ARCH_GPT2,            "gpt2"         },
+    { LLM_ARCH_GPTJ,            "gptj"         },
+    { LLM_ARCH_GPTNEOX,         "gptneox"      },
+    { LLM_ARCH_MPT,             "mpt"          },
+    { LLM_ARCH_BAICHUAN,        "baichuan"     },
+    { LLM_ARCH_STARCODER,       "starcoder"    },
+    { LLM_ARCH_PERSIMMON,       "persimmon"    },
+    { LLM_ARCH_REFACT,          "refact"       },
+    { LLM_ARCH_BERT,            "bert"         },
+    { LLM_ARCH_NOMIC_BERT,      "nomic-bert"   },
+    { LLM_ARCH_JINA_BERT_V2,    "jina-bert-v2" },
+    { LLM_ARCH_BLOOM,           "bloom"        },
+    { LLM_ARCH_STABLELM,        "stablelm"     },
+    { LLM_ARCH_QWEN,            "qwen"         },
+    { LLM_ARCH_QWEN2,           "qwen2"        },
+    { LLM_ARCH_QWEN2MOE,        "qwen2moe"     },
+    { LLM_ARCH_PHI2,            "phi2"         },
+    { LLM_ARCH_PHI3,            "phi3"         },
+    { LLM_ARCH_PLAMO,           "plamo"        },
+    { LLM_ARCH_CODESHELL,       "codeshell"    },
+    { LLM_ARCH_ORION,           "orion"        },
+    { LLM_ARCH_INTERNLM2,       "internlm2"    },
+    { LLM_ARCH_MINICPM,         "minicpm"      },
+    { LLM_ARCH_GEMMA,           "gemma"        },
+    { LLM_ARCH_STARCODER2,      "starcoder2"   },
+    { LLM_ARCH_MAMBA,           "mamba"        },
+    { LLM_ARCH_XVERSE,          "xverse"       },
+    { LLM_ARCH_COMMAND_R,       "command-r"    },
+    { LLM_ARCH_DBRX,            "dbrx"         },
+    { LLM_ARCH_OLMO,            "olmo"         },
+    { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
 enum llm_kv {
@@ -691,6 +697,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT_V2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -1664,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     GGML_UNUSED(host_buffer);
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
-    }
-    return buft;
-
-    GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUDA
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-    }
-#endif
-
-#ifdef GGML_USE_SYCL
-    if (ggml_backend_sycl_get_device_count() > 1) {
-        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_offload(fallback_gpu);
-    }
-    return buft;
-
-    GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUDA)
-    size_t total;
-    size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
-#else
-    return 1;
-    GGML_UNUSED(device);
-#endif
-}
-
 //
 // globals
 //
@@ -1845,7 +1785,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool use_alibi   = false;
+    bool use_alibi   = false;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -2189,6 +2129,8 @@ struct llama_model {
     int main_gpu;
     int n_gpu_layers;
 
+    std::vector<std::string> rpc_servers;
+
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 
@@ -2317,7 +2259,6 @@ struct llama_context {
     struct ggml_tensor * inp_pos;      // I32 [n_batch]
     struct ggml_tensor * inp_out_ids;  // I32 [n_outputs]
     struct ggml_tensor * inp_KQ_mask;  // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_KQ_pos;   // F32 [n_kv]
     struct ggml_tensor * inp_K_shift;  // I32 [kv_size]
     struct ggml_tensor * inp_mean;     // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls;      // I32 [n_batch]
@@ -2333,6 +2274,104 @@ struct llama_context {
 #endif
 };
 
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_RPC
+    std::string endpoint = model.rpc_servers[gpu];
+    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
+    buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUDA)
+    buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+    buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_cpu(true);
+    }
+    return buft;
+    GGML_UNUSED(model);
+    GGML_UNUSED(gpu);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUDA
+    if (ggml_backend_cuda_get_device_count() > 1) {
+        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+    }
+#endif
+
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_offload(model, fallback_gpu);
+    }
+    return buft;
+
+    GGML_UNUSED(tensor_split);
+}
+
+static size_t llama_get_device_count(const llama_model & model) {
+#if defined(GGML_USE_RPC)
+    return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+    size_t total;
+    size_t free;
+    std::string endpoint = model.rpc_servers[device];
+    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+    return free;
+#elif defined(GGML_USE_CUDA)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &free, &total);
+    return free;
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+    GGML_UNUSED(device);
+}
+
 //
 // kv cache helpers
 //
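When the RPC backend is enabled, each entry in model.rpc_servers acts as one "device": llama_get_device_count returns the number of endpoints and llama_get_device_memory queries each endpoint for its free memory. A minimal sketch of how a comma-separated endpoint string could be turned into such a list; the helper name and the "host:port" values are illustrative assumptions, not part of the vendored API:

    #include <sstream>
    #include <string>
    #include <vector>

    // Hypothetical helper: split "192.168.1.10:50052,192.168.1.11:50052" into endpoints.
    static std::vector<std::string> split_rpc_servers(const std::string & csv) {
        std::vector<std::string> endpoints;
        std::stringstream ss(csv);
        std::string item;
        while (std::getline(ss, item, ',')) {
            if (!item.empty()) {
                endpoints.push_back(item); // each entry behaves like one offload device
            }
        }
        return endpoints;
    }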
@@ -2785,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
     cache.do_defrag = true;
 }
 
+static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+    // the FA kernels require padding to avoid extra runtime boundary checks
+    return cparams.flash_attn ? 256u : 32u;
+}
+
 //
 // model loading and saving
 //
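Later in the decode path this padding is applied as kv_self.n = min(size, max(pad, GGML_PAD(cell_max, pad))). A small sketch of the same rounding, assuming GGML_PAD is the usual "round x up to a multiple of n" macro; the numeric inputs are only an example:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Assumed to mirror ggml's macro: round x up to the next multiple of n.
    #define PAD_UP(x, n) (((x) + (n) - 1) / (n) * (n))

    int main() {
        const uint32_t pad      = 256;   // flash-attention padding from llama_kv_cache_get_padding
        const uint32_t cell_max = 1000;  // highest occupied KV cell + 1 (example value)
        const uint32_t kv_n     = std::max(pad, (uint32_t) PAD_UP(cell_max, pad));
        std::printf("attend to %u KV cells\n", kv_n); // prints 1024
        return 0;
    }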
@@ -3175,6 +3219,7 @@ struct llama_model_loader {
         switch (type_max) {
             case GGML_TYPE_F32:   ftype = LLAMA_FTYPE_ALL_F32;     break;
             case GGML_TYPE_F16:   ftype = LLAMA_FTYPE_MOSTLY_F16;  break;
+            case GGML_TYPE_BF16:  ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
             case GGML_TYPE_Q4_0:  ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
             case GGML_TYPE_Q4_1:  ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
             case GGML_TYPE_Q5_0:  ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3666,6 +3711,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -3777,6 +3823,12 @@ static void llm_load_hparams(
 
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+    // everything past this point is not vocab-related
+    if (hparams.vocab_only) {
+        return;
+    }
+
     ml.get_key(LLM_KV_CONTEXT_LENGTH,      hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH,    hparams.n_embd);
     ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3858,7 +3910,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.
+                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
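The new line distinguishes 32-layer LLaMA variants by vocabulary size rather than by layer count alone. A worked illustration of the threshold, using publicly known vocabulary sizes as example inputs:

    #include <cstdint>
    #include <cstdio>

    // Same shape as the check in llm_load_hparams: below 40000 vocab entries a
    // 32-layer model is treated as a 7B (e.g. Llama 2, n_vocab = 32000),
    // otherwise as an 8B (e.g. Llama 3, n_vocab = 128256).
    static const char * classify_32_layer_model(uint32_t n_vocab) {
        return n_vocab < 40000 ? "MODEL_7B" : "MODEL_8B";
    }

    int main() {
        std::printf("n_vocab=32000  -> %s\n", classify_32_layer_model(32000));   // MODEL_7B
        std::printf("n_vocab=128256 -> %s\n", classify_32_layer_model(128256));  // MODEL_8B
        return 0;
    }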
@@ -3960,6 +4012,19 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                hparams.f_max_alibi_bias = 8.0f;
+
+                switch (hparams.n_layer) {
+                    case 4:  model.type = e_model::MODEL_33M;  break; // jina-embeddings-small
+                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4381,8 +4446,27 @@ static void llm_load_vocab(
                 tokenizer_pre == "starcoder") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
         } else if (
-                tokenizer_pre == "gpt-2") {
+                tokenizer_pre == "gpt-2"   ||
+                tokenizer_pre == "jina-es" ||
+                tokenizer_pre == "jina-de" ||
+                tokenizer_pre == "jina-v2-es" ||
+                tokenizer_pre == "jina-v2-de") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+        } else if (
+                tokenizer_pre == "refact") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
+        } else if (
+            tokenizer_pre == "command-r") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+        } else if (
+            tokenizer_pre == "qwen2") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+        } else if (
+            tokenizer_pre == "olmo") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+        } else if (
+            tokenizer_pre == "dbrx") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -4726,13 +4810,13 @@ static bool llm_load_tensors(
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
-        int device_count = llama_get_device_count();
+        int device_count = llama_get_device_count(model);
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
         std::vector<float> splits(device_count);
         if (all_zero) {
             // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(i);
+                splits[i] = llama_get_device_memory(model, i);
             }
         } else {
             std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4752,35 +4836,35 @@ static bool llm_load_tensors(
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
         } else {
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(main_gpu);
+            split_buft = llama_default_buffer_type_offload(model, main_gpu);
         }
         // assign the repeating layers
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             model.buft_layer[i] = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             model.buft_output = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
@@ -5225,6 +5309,50 @@ static bool llm_load_tensors(
                     layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                model.tok_embd  = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}); // word_embeddings
+                model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
+                model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
+                model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}); //LayerNorm bias
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i]; // JinaBertLayer
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd});
+
+                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),   {n_embd}, false);
+
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa});
+
+                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),   {n_embd}, false);
+
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
+                    layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}); //output_dens
+
+                    layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
+                    layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd});
+
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+
+                    layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+
+                    layer.layer_out_norm   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+                    layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd});
+                }
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6120,6 +6248,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
             || !(
                 model.ftype == LLAMA_FTYPE_ALL_F32 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
             )
@@ -6300,7 +6429,7 @@ static struct ggml_tensor * llm_build_ffn(
         llm_ffn_gate_type   type_gate,
         const llm_build_cb & cb,
         int   il) {
-    struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+    struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
     cb(tmp, "ffn_up", il);
 
     if (up_b) {
@@ -6482,7 +6611,6 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
-        struct ggml_tensor * kq_pos,
         int32_t   n_tokens,
         int32_t   n_kv,
         float     kq_scale,
@@ -6494,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
     const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
 
     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);
@@ -6512,26 +6641,22 @@ static struct ggml_tensor * llm_build_kqv(
         GGML_UNUSED(model);
         GGML_UNUSED(n_ctx);
 
-        // note: if this assert triggers, then some check has failed earlier
-        // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
-        GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
-
         // split cached v into n_head heads (not transposed)
         struct ggml_tensor * v =
             ggml_view_3d(ctx, kv.v_l[il],
                     n_embd_head_v, n_kv, n_head_kv,
-                    ggml_row_size(kv.v_l[il]->type,
-                    ggml_row_size(kv.v_l[il]->type,
+                    ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
                     0);
         cb(v, "v", il);
 
-        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
 
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
 
-        cur = ggml_reshape_2d(ctx, cur,
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
@@ -6556,28 +6681,8 @@ static struct ggml_tensor * llm_build_kqv(
             kq = ggml_scale(ctx, kq, 30);
         }
 
-
-
-#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
-        if (hparams.use_alibi) {
-            kq = ggml_scale(ctx, kq, kq_scale);
-            cb(kq, "kq_scaled", il);
-
-            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
-            cb(kq, "kq_scaled_alibi", il);
-
-            kq = ggml_add(ctx, kq, kq_mask);
-            cb(kq, "kq_masked", il);
-
-            kq = ggml_soft_max(ctx, kq);
-            cb(kq, "kq_soft_max", il);
-        } else
-#endif
-        {
-            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
-            cb(kq, "kq_soft_max_ext", il);
-        }
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+        cb(kq, "kq_soft_max_ext", il);
 
         GGML_ASSERT(kv.size == n_ctx);
 
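With ggml_alibi gone, the ALiBi bias is folded entirely into ggml_soft_max_ext: the mask carries negative relative distances and the kernel scales them by a per-head slope derived from f_max_alibi_bias. A small standalone sketch of that computation for one head, assuming the common ALiBi slope schedule of 2^(-(max_bias/n_head)*(h+1)); it is illustrative, not the ggml kernel itself:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Illustrative scalar version of "softmax(scale*score + slope*mask)" for one head.
    int main() {
        const int   n_head   = 8;
        const int   h        = 0;      // head index
        const float max_bias = 8.0f;   // hparams.f_max_alibi_bias
        const float slope    = std::pow(2.0f, -(max_bias / n_head) * (h + 1));

        const float scale = 0.125f;                        // example 1/sqrt(n_embd_head)
        std::vector<float> scores = {1.0f, 0.5f, 0.25f};   // raw KQ scores for one query row
        std::vector<float> mask   = {0.0f, -1.0f, -2.0f};  // -|pos_q - pos_k| as filled in llama_set_inputs

        float sum = 0.0f;
        std::vector<float> probs(scores.size());
        for (size_t i = 0; i < scores.size(); ++i) {
            probs[i] = std::exp(scale * scores[i] + slope * mask[i]);
            sum += probs[i];
        }
        for (size_t i = 0; i < probs.size(); ++i) {
            std::printf("p[%zu] = %.4f\n", i, probs[i] / sum);
        }
        return 0;
    }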
@@ -6596,7 +6701,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
         cb(kqv_merged, "kqv_merged", il);
 
-        cur = ggml_cont_2d(ctx, kqv_merged,
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
         cb(cur, "kqv_merged_cont", il);
     }
 
@@ -6627,7 +6732,6 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_tensor * v_cur,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
-        struct ggml_tensor * kq_pos,
         int32_t   n_tokens,
         int32_t   kv_head,
         int32_t   n_kv,
@@ -6646,7 +6750,7 @@ static struct ggml_tensor * llm_build_kv(
     struct ggml_tensor * cur;
 
     cur  = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
+            q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
 
     return cur;
@@ -6753,18 +6857,17 @@ struct llm_build_context {
 
         ctx0 = ggml_init(params);
 
-        lctx.inp_tokens  = nullptr;
-        lctx.inp_embd    = nullptr;
-        lctx.inp_pos     = nullptr;
+        lctx.inp_tokens  = nullptr;
+        lctx.inp_embd    = nullptr;
+        lctx.inp_pos     = nullptr;
         lctx.inp_out_ids = nullptr;
         lctx.inp_KQ_mask = nullptr;
-        lctx.inp_KQ_pos  = nullptr;
         lctx.inp_K_shift = nullptr;
-        lctx.inp_mean    = nullptr;
-        lctx.inp_cls     = nullptr;
-        lctx.inp_s_copy  = nullptr;
-        lctx.inp_s_mask  = nullptr;
-        lctx.inp_s_seq   = nullptr;
+        lctx.inp_mean    = nullptr;
+        lctx.inp_cls     = nullptr;
+        lctx.inp_s_copy  = nullptr;
+        lctx.inp_s_mask  = nullptr;
+        lctx.inp_s_seq   = nullptr;
     }
 
     void free() {
@@ -6914,19 +7017,6 @@ struct llm_build_context {
         return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
     }
 
-    struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
-        if (causal) {
-            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
-        } else {
-            // TODO: this will be needed for ALiBi-based BERT models
-            //       https://github.com/ggerganov/llama.cpp/pull/6826
-            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
-        }
-        cb(lctx.inp_KQ_pos, "KQ_pos", -1);
-        ggml_set_input(lctx.inp_KQ_pos);
-        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
-    }
-
     struct ggml_tensor * build_inp_mean() {
         lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
         cb(lctx.inp_mean, "inp_mean", -1);
@@ -7032,7 +7122,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7125,9 +7215,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -7172,7 +7259,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7242,9 +7329,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -7279,7 +7363,7 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7399,7 +7483,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7524,7 +7608,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7676,7 +7760,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7788,7 +7872,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7992,7 +8076,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask,
+                        Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8058,9 +8142,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -8088,7 +8169,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8150,8 +8231,11 @@ struct llm_build_context {
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
+        struct ggml_tensor * inp_pos = nullptr;
 
-        struct ggml_tensor * inp_pos  = build_inp_pos();
+        if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+            inp_pos = build_inp_pos();
+        }
         struct ggml_tensor * inp_mean = build_inp_mean();
         struct ggml_tensor * inp_cls  = build_inp_cls();
 
@@ -8182,13 +8266,26 @@ struct llm_build_context {
             struct ggml_tensor * Vcur;
 
             // self-attention
-            if (model.arch == LLM_ARCH_BERT) {
+            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
                 Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
 
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            model.layers[il].attn_q_norm_b,
+                            LLM_NORM, cb, il);
+                }
+
                 Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
 
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            model.layers[il].attn_k_norm_b,
+                            LLM_NORM, cb, il);
+                }
                 Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
@@ -8228,7 +8325,7 @@ struct llm_build_context {
                 struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
                 cb(kq, "kq", il);
 
-                kq = ggml_soft_max_ext(ctx0, kq, KQ_mask,
+                kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
                 cb(kq, "kq_soft_max_ext", il);
 
                 struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8279,6 +8376,13 @@ struct llm_build_context {
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
             } else {
                 cur = llm_build_ffn(ctx0, cur,
                         model.layers[il].ffn_up,   NULL,
@@ -8345,9 +8449,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         inpL = llm_build_norm(ctx0, inpL, hparams,
                 model.tok_norm,
                 model.tok_norm_b,
@@ -8381,7 +8482,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8446,9 +8547,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         if (model.pos_embd) {
             // inp_pos - contains the positions
             struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8512,13 +8610,13 @@ struct llm_build_context {
 
                     cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                             model.layers[il].wo, model.layers[il].bo,
-                            Kcur, Vcur, Qcur, KQ_mask,
+                            Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 } else {
                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
                     cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                             model.layers[il].wo, model.layers[il].bo,
-                            Kcur, Vcur, Qcur, KQ_mask,
+                            Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 }
             }
 
@@ -8662,7 +8760,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8780,7 +8878,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8893,7 +8991,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9007,7 +9105,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9162,7 +9260,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9279,7 +9377,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9392,7 +9490,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
             struct ggml_tensor * sa_out = cur;
 
@@ -9495,7 +9593,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9602,7 +9700,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9718,7 +9816,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9835,7 +9933,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9965,7 +10063,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10086,7 +10184,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10205,7 +10303,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10495,7 +10593,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10626,7 +10724,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, nullptr,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10807,6 +10905,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_refact();
             } break;
         case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
             {
                 result = llm.build_bert();
@@ -11014,11 +11113,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                     if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
                         f = -INFINITY;
                     } else {
-                        f = 0.0f;
+                        if (hparams.use_alibi) {
+                            f = -fabs(lctx.kv_self.cells[i].pos - pos);
+                        } else {
+                            f = 0.0f;
+                        }
                     }
                     data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                 }
            }
+
+            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                for (int j = 0; j < n_kv; ++j) {
+                    data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                }
+            }
         }
     } else {
         // when using kv cache, the mask needs to match the kv cache size
@@ -11037,7 +11146,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                 float f = -INFINITY;
                 for (int s = 0; s < batch.n_seq_id[i]; ++s) {
                     if (batch.seq_id[i][s] == seq_id) {
-                        f = 0.0f;
+                        if (hparams.use_alibi) {
+                            f = -fabs(batch.pos[i] - batch.pos[j]);
+                        } else {
+                            f = 0.0f;
+                        }
                         break;
                     }
                 }
@@ -11053,21 +11166,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
-    // this allows to process multiple sequences in parallel with ALiBi-based models
-    if (hparams.use_alibi) {
-        const int64_t n_kv = kv_self.n;
-
-        GGML_ASSERT(lctx.inp_KQ_pos);
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
-
-        float * data = (float *) lctx.inp_KQ_pos->data;
-
-        for (int i = 0; i < n_kv; ++i) {
-            data[i] = float(lctx.kv_self.cells[i].pos);
-        }
-    }
-
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
@@ -11437,7 +11535,8 @@ static int llama_decode_internal(
             // a heuristic, to avoid attending the full cache if it is not yet utilized
             // after enough generations, the benefit from this heuristic disappears
             // if we start defragmenting the cache, the benefit from this will be more important
-            kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+            const uint32_t pad = llama_kv_cache_get_padding(cparams);
+            kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
@@ -11952,7 +12051,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto& token_data = vocab.id_to_token.at(id);
+    const auto & token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
             auto buf = token_data.text.substr(3, 2);
@@ -12182,12 +12281,14 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
+        bool ignore_merges = false;
 
         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
                 switch (vocab.type_pre) {
                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        ignore_merges = true;
                        word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
                             //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
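The ignore_merges flag short-circuits BPE for words that already exist as whole tokens in the vocabulary, which the Llama 3 tokenizer relies on so such words are emitted as a single id instead of being re-built from merges. A minimal sketch of the idea with a toy vocabulary; the map contents are made up for illustration:

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
        // Toy vocabulary: word -> token id (illustrative values only).
        std::map<std::string, int> token_to_id = {{"hello", 15339}, {" world", 1917}};

        const bool ignore_merges = true; // as set for LLAMA_VOCAB_PRE_TYPE_LLAMA3
        std::vector<std::string> words = {"hello", " world", "!"};

        for (const auto & word : words) {
            auto it = token_to_id.find(word);
            if (ignore_merges && it != token_to_id.end()) {
                // whole word is already a token: emit it directly, skip byte-pair merges
                std::printf("'%s' -> %d (whole-word hit)\n", word.c_str(), it->second);
            } else {
                std::printf("'%s' -> run byte-pair merges\n", word.c_str());
            }
        }
        return 0;
    }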
@@ -12196,6 +12297,12 @@ struct llm_tokenizer_bpe {
                             "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        word_collection = unicode_regex_split(text, {
+                            // same as llama3
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                     case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                         word_collection = unicode_regex_split(text, {
                             "[\r\n]",
@@ -12212,14 +12319,13 @@ struct llm_tokenizer_bpe {
                             "\\s?\\p{L}+",
                             "\\s?\\p{P}+",
                             "[一-龥ࠀ-一가-]+",
-                            "\\p{N}+",
+                            "\\p{N}",
                         });
                         break;
                     case LLAMA_VOCAB_PRE_TYPE_FALCON:
                         word_collection = unicode_regex_split(text, {
                             "[\\p{P}\\$\\+<=>\\^~\\|]+",
                             "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                            "\\p{N}+",
                             "[0-9][0-9][0-9]",
                         });
                         break;
@@ -12235,11 +12341,26 @@ struct llm_tokenizer_bpe {
                         });
                         break;
                     case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+                    case LLAMA_VOCAB_PRE_TYPE_REFACT:
+                    case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                        word_collection = unicode_regex_split(text, {
+                            "\\p{N}",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
                     case LLAMA_VOCAB_PRE_TYPE_GPT2:
+                    case LLAMA_VOCAB_PRE_TYPE_OLMO:
                         word_collection = unicode_regex_split(text, {
                             "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                     default:
                         // default regex for BPE tokenization pre-processing
                         word_collection = unicode_regex_split(text, {
@@ -12265,6 +12386,11 @@ struct llm_tokenizer_bpe {
|
|
12265
12386
|
int index = 0;
|
12266
12387
|
size_t offset = 0;
|
12267
12388
|
|
12389
|
+
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
12390
|
+
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
12391
|
+
offset = word.size();
|
12392
|
+
}
|
12393
|
+
|
12268
12394
|
while (offset < word.size()) {
|
12269
12395
|
llm_symbol sym;
|
12270
12396
|
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
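The ignore_merges flag added for the LLAMA3 pre-tokenizer short-circuits BPE: when a pre-tokenized word is already a vocabulary entry, it is emitted as one symbol instead of being reassembled from byte-level merges. A minimal standalone sketch of that shortcut, with made-up tokens and ids (the real tokenizer uses vocab.token_to_id and llm_symbol as in the hunk above):

    // Standalone sketch of the ignore_merges shortcut; the map contents are hypothetical.
    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    int main() {
        std::unordered_map<std::string, int> token_to_id = {{"hello", 15339}, {" world", 1917}};

        const std::string word = "hello";
        std::vector<std::string> symbols;

        const bool ignore_merges = true;  // what LLAMA3 now enables
        if (ignore_merges && token_to_id.find(word) != token_to_id.end()) {
            symbols.push_back(word);                   // whole word is already a token: one symbol, no merges
        } else {
            for (char c : word) {
                symbols.push_back(std::string(1, c));  // otherwise start from single bytes and merge up
            }
        }

        std::printf("symbols: %zu\n", symbols.size()); // prints 1 for "hello"
        return 0;
    }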
@@ -12450,16 +12576,16 @@ struct llm_tokenizer_wpm {
         // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
         for (uint32_t code : cpts_nfd) {
-
-            if (
+            const codepoint_flags flags = unicode_cpt_flags(code);
+            if (flags.is_accent_mark || flags.is_control) {
                 continue;
             }
             code = unicode_tolower(code);
-            if (
+            if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);
-            if (
+            if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
                 new_str += s;
                 new_str += " ";
@@ -12693,6 +12819,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
 
+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
                 if (add_special && vocab.special_add_eos == 1) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
@@ -12719,7 +12852,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
 
-
+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && vocab.special_add_eos == 1) {
+                    GGML_ASSERT(vocab.special_add_eos != -1);
+                    output.push_back(vocab.special_eos_id);
+                }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
             {
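The new warning fires when add_special prepends a BOS token and the prompt text itself already begins with the BOS token, for example when a chat template hard-codes "<s>". A hedged sketch of how this situation typically surfaces through the public API, assuming `model` was loaded with llama_load_model_from_file() and its tokenizer metadata enables automatic BOS insertion:

    // Hedged sketch: requesting special-token handling while the prompt already spells out BOS.
    #include <string>
    #include <vector>
    #include "llama.h"

    std::vector<llama_token> tokenize_with_bos(const llama_model * model, const std::string & prompt) {
        std::vector<llama_token> tokens(prompt.size() + 8);
        const int32_t n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                                         tokens.data(), (int32_t) tokens.size(),
                                         /*add_special  =*/ true,   // BOS added per the model's metadata
                                         /*parse_special =*/ true); // "<s>" in the text becomes a second BOS
        tokens.resize(n < 0 ? 0 : n);
        return tokens;
    }

Calling tokenize_with_bos(model, "<s>Once upon a time") on such a model would then produce two leading BOS tokens and trigger the warning added above.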
@@ -13073,6 +13216,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
     return rejects;
 }
 
+static bool llama_grammar_detect_left_recursion(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        size_t rule_index,
+        std::vector<bool> * rules_visited,
+        std::vector<bool> * rules_in_progress,
+        std::vector<bool> * rules_may_be_empty) {
+    if ((*rules_in_progress)[rule_index]) {
+        return true;
+    }
+
+    (*rules_in_progress)[rule_index] = true;
+
+    const std::vector<llama_grammar_element> & rule = rules[rule_index];
+
+    // First check if the rule might produce the empty string. This could be done combined with the second
+    // step but it's more readable as two steps.
+    bool at_rule_start = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            if (at_rule_start) {
+                (*rules_may_be_empty)[rule_index] = true;
+                break;
+            }
+            at_rule_start = true;
+        } else {
+            at_rule_start = false;
+        }
+    }
+
+    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+    // be empty)
+    bool recurse_into_nonterminal = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+                return true;
+            }
+            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+                recurse_into_nonterminal = false;
+            }
+        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            recurse_into_nonterminal = true;
+        } else {
+            recurse_into_nonterminal = false;
+        }
+    }
+
+    (*rules_in_progress)[rule_index] = false;
+    (*rules_visited)[rule_index] = true;
+    return false;
+}
+
 //
 // grammar - external
 //
@@ -13092,6 +13287,19 @@ struct llama_grammar * llama_grammar_init(
         vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
     }
 
+    // Check for left recursion
+    std::vector<bool> rules_visited(n_rules);
+    std::vector<bool> rules_in_progress(n_rules);
+    std::vector<bool> rules_may_be_empty(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        if (rules_visited[i]) {
+            continue;
+        }
+        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+            throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+        }
+    }
+
     // loop over alternates of start rule to build initial stacks
     std::vector<std::vector<const llama_grammar_element *>> stacks;
     pos = vec_rules[start_rule_index].data();
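A left-recursive rule such as `root ::= root "a" | "a"` previously made sampling recurse without bound; with this change llama_grammar_init() rejects it up front. A hedged sketch of a grammar that the new check refuses, built directly from llama_grammar_element values declared in llama.h (in this version the rejection surfaces as a std::runtime_error thrown out of llama_grammar_init()):

    // Hedged sketch: a directly left-recursive grammar, root ::= root "a" | "a".
    #include <cstdio>
    #include <stdexcept>
    #include "llama.h"

    int main() {
        const llama_grammar_element rule0[] = {
            { LLAMA_GRETYPE_RULE_REF, 0 },   // root (refers to itself, leftmost position)
            { LLAMA_GRETYPE_CHAR,   'a' },   // "a"
            { LLAMA_GRETYPE_ALT,      0 },   // |
            { LLAMA_GRETYPE_CHAR,   'a' },   // "a"
            { LLAMA_GRETYPE_END,      0 },
        };
        const llama_grammar_element * rules[] = { rule0 };

        try {
            llama_grammar * grammar = llama_grammar_init(rules, 1, 0);
            llama_grammar_free(grammar);
        } catch (const std::runtime_error & err) {
            std::printf("rejected: %s\n", err.what()); // "unsupported grammar, left recursion detected ..."
        }
        return 0;
    }

Rewriting the rule right-recursively, for example `root ::= "a" root | "a"`, avoids the error while accepting the same strings.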
@@ -13114,6 +13322,9 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);
 
+    // Important: vec_rules has to be moved here, not copied, because stacks contains
+    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+    // then the pointers would be invalidated when the local vec_rules goes out of scope.
     return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }
 
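The comment documents why vec_rules must be moved rather than copied. A small generic C++ illustration of the underlying guarantee (not llama.cpp code): moving a std::vector transfers the element buffer, so pointers into the elements stay valid, whereas a copy allocates fresh storage:

    // Pointers into a std::vector stay valid across a move, but not into a copy.
    #include <cassert>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<int> source = {1, 2, 3};
        const int * p = source.data();               // pointer into the element buffer

        std::vector<int> moved = std::move(source);  // buffer ownership transfers
        assert(p == moved.data());                   // same storage, pointer still valid

        std::vector<int> copied = moved;             // separate buffer
        assert(copied.data() != moved.data());       // a pointer taken before the copy
                                                     // never refers to `copied`
        return 0;
    }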
@@ -13708,9 +13919,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     // Sample the next word X using top-k sampling
     llama_sample_top_k(nullptr, candidates, int(k), 1);
-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
@@ -13724,9 +13933,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     // Update mu using the learning rate and error
     *mu = *mu - eta * e;
 
-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     return X;
 }
 
@@ -14142,13 +14349,16 @@ static void llama_tensor_dequantize_internal(
         if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
-    } else if (tensor->type != GGML_TYPE_F16
+    } else if (tensor->type != GGML_TYPE_F16 &&
+               tensor->type != GGML_TYPE_BF16) {
         throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
     }
 
     if (nthread < 2) {
         if (tensor->type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (tensor->type == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
             qtype.to_float(tensor->data, f32_output, nelements);
         } else {
@@ -14157,7 +14367,14 @@ static void llama_tensor_dequantize_internal(
         return;
     }
 
-    size_t block_size
+    size_t block_size;
+    if (tensor->type == GGML_TYPE_F16 ||
+        tensor->type == GGML_TYPE_BF16) {
+        block_size = 1;
+    } else {
+        block_size = (size_t)ggml_blck_size(tensor->type);
+    }
+
     size_t block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);
@@ -14176,6 +14393,8 @@ static void llama_tensor_dequantize_internal(
     auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+        } else if (typ == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
         } else {
             qtype.to_float(inbuf, outbuf, nels);
        }
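Dequantization now routes BF16 tensors through ggml_bf16_to_fp32_row(). A hedged sketch of round-tripping a small buffer through the BF16 row helpers, assuming the ggml.h bundled with this gem declares ggml_fp32_to_bf16_row()/ggml_bf16_to_fp32_row() taking (src, dst, n); BF16 keeps FP32's exponent range but only 8 bits of mantissa, so the low bits are lost:

    // Hedged sketch: FP32 -> BF16 -> FP32 round trip with the ggml row helpers.
    #include <cstdio>
    #include "ggml.h"

    int main() {
        const int64_t n = 4;
        float src[n] = { 1.0f, 3.14159265f, -2.5e20f, 1e-8f };

        ggml_bf16_t packed[n];
        float back[n];

        ggml_fp32_to_bf16_row(src, packed, n);   // truncate mantissa to 8 bits
        ggml_bf16_to_fp32_row(packed, back, n);  // widen back to FP32

        for (int64_t i = 0; i < n; ++i) {
            std::printf("%.9g -> %.9g\n", src[i], back[i]);
        }
        return 0;
    }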
@@ -14536,6 +14755,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
         // K-quants
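With LLAMA_FTYPE_MOSTLY_BF16 wired into the ftype switch, a GGUF file can be rewritten as BF16 through the regular quantization entry point. A hedged sketch using the C API (file paths are placeholders, error handling reduced to the return code):

    // Hedged sketch: converting a GGUF to BF16 via llama_model_quantize().
    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype   = LLAMA_FTYPE_MOSTLY_BF16;  // target type added in this release
        qparams.nthread = 4;

        const uint32_t rc = llama_model_quantize("model-f32.gguf", "model-bf16.gguf", &qparams);
        if (rc != 0) {
            std::fprintf(stderr, "quantization failed (%u)\n", rc);
            return 1;
        }
        return 0;
    }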
@@ -15200,6 +15420,7 @@ struct llama_model_params llama_model_default_params() {
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
+        /*.rpc_servers                 =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
@@ -15270,7 +15491,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }
 
 size_t llama_max_devices(void) {
-#if defined(
+#if defined(GGML_USE_RPC)
+    return GGML_RPC_MAX_SERVERS;
+#elif defined(GGML_USE_METAL)
     return 1;
 #elif defined(GGML_USE_CUDA)
     return GGML_CUDA_MAX_DEVICES;
@@ -15293,7 +15516,7 @@ bool llama_supports_mlock(void) {
 
 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -15356,7 +15579,17 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-
+    if (params.rpc_servers != nullptr) {
+        // split the servers set them into model->rpc_servers
+        std::string servers(params.rpc_servers);
+        size_t pos = 0;
+        while ((pos = servers.find(",")) != std::string::npos) {
+            std::string server = servers.substr(0, pos);
+            model->rpc_servers.push_back(server);
+            servers.erase(0, pos + 1);
+        }
+        model->rpc_servers.push_back(servers);
+    }
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
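The new rpc_servers field takes a comma-separated list of host:port endpoints, which llama_load_model_from_file() splits into model->rpc_servers as shown above; each endpoint then becomes an offload device. A hedged sketch of loading a model against two RPC servers (endpoints are placeholders, and this only does something useful in a build compiled with GGML_USE_RPC and with rpc-server processes listening):

    // Hedged sketch: pointing a model at two RPC servers via llama_model_params.rpc_servers.
    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052"; // comma-separated host:port list
        mparams.n_gpu_layers = 99;                                      // layers spread across the RPC devices

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) {
            std::fprintf(stderr, "failed to load model over RPC\n");
            return 1;
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }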
@@ -15395,6 +15628,11 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
+    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     llama_context * ctx = new llama_context(*model);
 
     const auto & hparams = model->hparams;
@@ -15418,7 +15656,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx = GGML_PAD(cparams.n_ctx,
+    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -15463,23 +15701,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-    if (cparams.flash_attn && hparams.use_alibi) {
-        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
-    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
-#ifdef GGML_USE_HIPBLAS
-    if (cparams.flash_attn) {
-        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-#endif
-
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
@@ -15515,7 +15736,17 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#
+#if defined(GGML_USE_RPC)
+        for (auto & server : model->rpc_servers) {
+            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -15671,7 +15902,11 @@ struct llama_context * llama_new_context_with_model(
     ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-    bool pipeline_parallel =
+    bool pipeline_parallel =
+        llama_get_device_count(*model) > 1 &&
+        model->n_gpu_layers > (int)model->hparams.n_layer &&
+        model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+        params.offload_kqv;
 #ifndef GGML_USE_CUDA
     // pipeline parallelism requires support for async compute and events
     // currently this is only implemented in the CUDA backend
@@ -15769,6 +16004,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -16790,13 +17026,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
             }
             else {
                 if (cell_range_begin != kv_self.size) {
-                    cell_ranges.
+                    cell_ranges.emplace_back(cell_range_begin, i);
                     cell_range_begin = kv_self.size;
                 }
             }
         }
         if (cell_range_begin != kv_self.size) {
-            cell_ranges.
+            cell_ranges.emplace_back(cell_range_begin, kv_self.size);
        }
 
         // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
@@ -17466,9 +17702,10 @@ int32_t llama_tokenize(
 
 static std::string llama_decode_text(const std::string & text) {
     std::string decoded_text;
-
-
-
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+    for (const auto cpt : cpts) {
+        decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
     }
 
     return decoded_text;
@@ -17832,7 +18069,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
         /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
 
         /*.n_sample =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval =*/ std::max(
+        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
         /*.n_eval   =*/ std::max(1, ctx->n_eval),
     };
 