llama_cpp 0.15.0 → 0.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -7
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +303 -23
- data/vendor/tmp/llama.cpp/ggml-impl.h +84 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +137 -133
- data/vendor/tmp/llama.cpp/ggml-metal.metal +87 -110
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +2220 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +35 -152
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +953 -268
- data/vendor/tmp/llama.cpp/ggml.c +1762 -681
- data/vendor/tmp/llama.cpp/ggml.h +43 -24
- data/vendor/tmp/llama.cpp/llama.cpp +533 -296
- data/vendor/tmp/llama.cpp/llama.h +10 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -1637
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +286 -176
- data/vendor/tmp/llama.cpp/unicode.h +44 -10
- metadata +4 -2
@@ -7,6 +7,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#ifdef GGML_USE_RPC
+#  include "ggml-rpc.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -205,6 +209,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -228,39 +233,40 @@ enum llm_arch {
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama"
-    { LLM_ARCH_FALCON, "falcon"
-    { LLM_ARCH_GROK, "grok"
-    { LLM_ARCH_GPT2, "gpt2"
-    { LLM_ARCH_GPTJ, "gptj"
-    { LLM_ARCH_GPTNEOX, "gptneox"
-    { LLM_ARCH_MPT, "mpt"
-    { LLM_ARCH_BAICHUAN, "baichuan"
-    { LLM_ARCH_STARCODER, "starcoder"
-    { LLM_ARCH_PERSIMMON, "persimmon"
-    { LLM_ARCH_REFACT, "refact"
-    { LLM_ARCH_BERT, "bert"
-    { LLM_ARCH_NOMIC_BERT, "nomic-bert"
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GROK, "grok" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_PERSIMMON, "persimmon" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+    { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+    { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PLAMO, "plamo" },
+    { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION, "orion" },
+    { LLM_ARCH_INTERNLM2, "internlm2" },
+    { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_GEMMA, "gemma" },
+    { LLM_ARCH_STARCODER2, "starcoder2" },
+    { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_XVERSE, "xverse" },
+    { LLM_ARCH_COMMAND_R, "command-r" },
+    { LLM_ARCH_DBRX, "dbrx" },
+    { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
 enum llm_kv {
@@ -691,6 +697,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT_V2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -1664,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
     GGML_UNUSED(host_buffer);
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
-    }
-    return buft;
-
-    GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUDA
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-    }
-#endif
-
-#ifdef GGML_USE_SYCL
-    if (ggml_backend_sycl_get_device_count() > 1) {
-        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_offload(fallback_gpu);
-    }
-    return buft;
-
-    GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUDA)
-    size_t total;
-    size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
-#else
-    return 1;
-    GGML_UNUSED(device);
-#endif
-}
-
 //
 // globals
 //
@@ -1845,7 +1785,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool use_alibi = false;
+    bool use_alibi = false;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -2189,6 +2129,8 @@ struct llama_model {
     int main_gpu;
     int n_gpu_layers;
 
+    std::vector<std::string> rpc_servers;
+
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 
@@ -2317,7 +2259,6 @@ struct llama_context {
     struct ggml_tensor * inp_pos;      // I32 [n_batch]
     struct ggml_tensor * inp_out_ids;  // I32 [n_outputs]
    struct ggml_tensor * inp_KQ_mask;  // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_KQ_pos;   // F32 [n_kv]
     struct ggml_tensor * inp_K_shift;  // I32 [kv_size]
     struct ggml_tensor * inp_mean;     // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls;      // I32 [n_batch]
@@ -2333,6 +2274,104 @@ struct llama_context {
 #endif
 };
 
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_RPC
+    std::string endpoint = model.rpc_servers[gpu];
+    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
+    buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUDA)
+    buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+    buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_cpu(true);
+    }
+    return buft;
+    GGML_UNUSED(model);
+    GGML_UNUSED(gpu);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUDA
+    if (ggml_backend_cuda_get_device_count() > 1) {
+        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+    }
+#endif
+
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_offload(model, fallback_gpu);
+    }
+    return buft;
+
+    GGML_UNUSED(tensor_split);
+}
+
+static size_t llama_get_device_count(const llama_model & model) {
+#if defined(GGML_USE_RPC)
+    return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+    size_t total;
+    size_t free;
+    std::string endpoint = model.rpc_servers[device];
+    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+    return free;
+#elif defined(GGML_USE_CUDA)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &free, &total);
+    return free;
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+    GGML_UNUSED(device);
+}
+
 //
 // kv cache helpers
 //
@@ -2785,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
     cache.do_defrag = true;
 }
 
+static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+    // the FA kernels require padding to avoid extra runtime boundary checks
+    return cparams.flash_attn ? 256u : 32u;
+}
+
 //
 // model loading and saving
 //
@@ -3175,6 +3219,7 @@ struct llama_model_loader {
         switch (type_max) {
             case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
             case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+            case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
             case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
             case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
             case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3666,6 +3711,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -3777,6 +3823,12 @@ static void llm_load_hparams(
 
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+    // everything past this point is not vocab-related
+    if (hparams.vocab_only) {
+        return;
+    }
+
     ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
     ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3858,7 +3910,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.
+                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -3960,6 +4012,19 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                hparams.f_max_alibi_bias = 8.0f;
+
+                switch (hparams.n_layer) {
+                    case 4:  model.type = e_model::MODEL_33M;  break; // jina-embeddings-small
+                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4381,8 +4446,27 @@ static void llm_load_vocab(
                 tokenizer_pre == "starcoder") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
             } else if (
-                tokenizer_pre == "gpt-2"
+                tokenizer_pre == "gpt-2" ||
+                tokenizer_pre == "jina-es" ||
+                tokenizer_pre == "jina-de" ||
+                tokenizer_pre == "jina-v2-es" ||
+                tokenizer_pre == "jina-v2-de") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                tokenizer_pre == "refact") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
+            } else if (
+                tokenizer_pre == "command-r") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+            } else if (
+                tokenizer_pre == "qwen2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                tokenizer_pre == "olmo") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                tokenizer_pre == "dbrx") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -4726,13 +4810,13 @@ static bool llm_load_tensors(
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
-        int device_count = llama_get_device_count();
+        int device_count = llama_get_device_count(model);
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
         std::vector<float> splits(device_count);
         if (all_zero) {
             // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(i);
+                splits[i] = llama_get_device_memory(model, i);
             }
         } else {
             std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4752,35 +4836,35 @@ static bool llm_load_tensors(
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
         } else {
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(main_gpu);
+            split_buft = llama_default_buffer_type_offload(model, main_gpu);
         }
         // assign the repeating layers
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             model.buft_layer[i] = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             model.buft_output = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
@@ -5225,6 +5309,50 @@ static bool llm_load_tensors(
                     layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
+                model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
+                model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
+                model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i]; // JinaBertLayer
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
+                    layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
+
+                    layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
+                    layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
+
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+                    layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+                    layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
+                }
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6120,6 +6248,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
             || !(
                 model.ftype == LLAMA_FTYPE_ALL_F32 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
             )
@@ -6300,7 +6429,7 @@ static struct ggml_tensor * llm_build_ffn(
         llm_ffn_gate_type type_gate,
         const llm_build_cb & cb,
         int il) {
-    struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+    struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
     cb(tmp, "ffn_up", il);
 
     if (up_b) {
@@ -6482,7 +6611,6 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
-        struct ggml_tensor * kq_pos,
         int32_t n_tokens,
         int32_t n_kv,
         float kq_scale,
@@ -6494,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);
@@ -6512,26 +6641,22 @@ static struct ggml_tensor * llm_build_kqv(
         GGML_UNUSED(model);
         GGML_UNUSED(n_ctx);
 
-        // note: if this assert triggers, then some check has failed earlier
-        // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
-        GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
-
         // split cached v into n_head heads (not transposed)
         struct ggml_tensor * v =
             ggml_view_3d(ctx, kv.v_l[il],
                     n_embd_head_v, n_kv, n_head_kv,
-                    ggml_row_size(kv.v_l[il]->type,
-                    ggml_row_size(kv.v_l[il]->type,
+                    ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
                    0);
         cb(v, "v", il);
 
-        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
 
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
 
-        cur = ggml_reshape_2d(ctx, cur,
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
@@ -6556,28 +6681,8 @@ static struct ggml_tensor * llm_build_kqv(
             kq = ggml_scale(ctx, kq, 30);
         }
 
-
-
-#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
-        if (hparams.use_alibi) {
-            kq = ggml_scale(ctx, kq, kq_scale);
-            cb(kq, "kq_scaled", il);
-
-            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
-            cb(kq, "kq_scaled_alibi", il);
-
-            kq = ggml_add(ctx, kq, kq_mask);
-            cb(kq, "kq_masked", il);
-
-            kq = ggml_soft_max(ctx, kq);
-            cb(kq, "kq_soft_max", il);
-        } else
-#endif
-        {
-            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
-            cb(kq, "kq_soft_max_ext", il);
-        }
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+        cb(kq, "kq_soft_max_ext", il);
 
         GGML_ASSERT(kv.size == n_ctx);
 
@@ -6596,7 +6701,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
         cb(kqv_merged, "kqv_merged", il);
 
-        cur = ggml_cont_2d(ctx, kqv_merged,
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
         cb(cur, "kqv_merged_cont", il);
     }
 
@@ -6627,7 +6732,6 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_tensor * v_cur,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
-        struct ggml_tensor * kq_pos,
         int32_t n_tokens,
         int32_t kv_head,
         int32_t n_kv,
@@ -6646,7 +6750,7 @@ static struct ggml_tensor * llm_build_kv(
     struct ggml_tensor * cur;
 
     cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask,
+            q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
 
     return cur;
@@ -6753,18 +6857,17 @@ struct llm_build_context {
 
         ctx0 = ggml_init(params);
 
-        lctx.inp_tokens
-        lctx.inp_embd
-        lctx.inp_pos
+        lctx.inp_tokens = nullptr;
+        lctx.inp_embd = nullptr;
+        lctx.inp_pos = nullptr;
         lctx.inp_out_ids = nullptr;
         lctx.inp_KQ_mask = nullptr;
-        lctx.inp_KQ_pos = nullptr;
         lctx.inp_K_shift = nullptr;
-        lctx.inp_mean
-        lctx.inp_cls
-        lctx.inp_s_copy
-        lctx.inp_s_mask
-        lctx.inp_s_seq
+        lctx.inp_mean = nullptr;
+        lctx.inp_cls = nullptr;
+        lctx.inp_s_copy = nullptr;
+        lctx.inp_s_mask = nullptr;
+        lctx.inp_s_seq = nullptr;
     }
 
     void free() {
@@ -6914,19 +7017,6 @@ struct llm_build_context {
         return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
     }
 
-    struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
-        if (causal) {
-            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
-        } else {
-            // TODO: this will be needed for ALiBi-based BERT models
-            // https://github.com/ggerganov/llama.cpp/pull/6826
-            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
-        }
-        cb(lctx.inp_KQ_pos, "KQ_pos", -1);
-        ggml_set_input(lctx.inp_KQ_pos);
-        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
-    }
-
     struct ggml_tensor * build_inp_mean() {
         lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
         cb(lctx.inp_mean, "inp_mean", -1);
@@ -7032,7 +7122,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7125,9 +7215,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -7172,7 +7259,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7242,9 +7329,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -7279,7 +7363,7 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7399,7 +7483,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7524,7 +7608,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7676,7 +7760,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7788,7 +7872,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7992,7 +8076,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask,
+                        Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8058,9 +8142,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -8088,7 +8169,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8150,8 +8231,11 @@ struct llm_build_context {
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
+        struct ggml_tensor * inp_pos = nullptr;
 
-
+        if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+            inp_pos = build_inp_pos();
+        }
         struct ggml_tensor * inp_mean = build_inp_mean();
         struct ggml_tensor * inp_cls = build_inp_cls();
 
@@ -8182,13 +8266,26 @@ struct llm_build_context {
             struct ggml_tensor * Vcur;
 
             // self-attention
-            if (model.arch == LLM_ARCH_BERT) {
+            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
                 Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
 
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            model.layers[il].attn_q_norm_b,
+                            LLM_NORM, cb, il);
+                }
+
                 Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
 
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            model.layers[il].attn_k_norm_b,
+                            LLM_NORM, cb, il);
+                }
                 Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
@@ -8228,7 +8325,7 @@ struct llm_build_context {
                 struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
                 cb(kq, "kq", il);
 
-                kq = ggml_soft_max_ext(ctx0, kq, KQ_mask,
+                kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
                 cb(kq, "kq_soft_max_ext", il);
 
                 struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8279,6 +8376,13 @@ struct llm_build_context {
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
             } else {
                 cur = llm_build_ffn(ctx0, cur,
                         model.layers[il].ffn_up, NULL,
@@ -8345,9 +8449,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         inpL = llm_build_norm(ctx0, inpL, hparams,
                 model.tok_norm,
                 model.tok_norm_b,
@@ -8381,7 +8482,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8446,9 +8547,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         if (model.pos_embd) {
             // inp_pos - contains the positions
             struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8512,13 +8610,13 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             } else {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
         }
 
@@ -8662,7 +8760,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8780,7 +8878,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8893,7 +8991,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9007,7 +9105,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9162,7 +9260,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9279,7 +9377,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9392,7 +9490,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
             struct ggml_tensor * sa_out = cur;
 
@@ -9495,7 +9593,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9602,7 +9700,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9718,7 +9816,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9835,7 +9933,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9965,7 +10063,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10086,7 +10184,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10205,7 +10303,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10495,7 +10593,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10626,7 +10724,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, nullptr,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10807,6 +10905,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_refact();
             } break;
         case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
             {
                 result = llm.build_bert();
@@ -11014,11 +11113,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                     if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
                         f = -INFINITY;
                     } else {
-
+                        if (hparams.use_alibi) {
+                            f = -fabs(lctx.kv_self.cells[i].pos - pos);
+                        } else {
+                            f = 0.0f;
+                        }
                     }
                     data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                 }
             }
+
+            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                for (int j = 0; j < n_kv; ++j) {
+                    data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                }
+            }
         }
     } else {
         // when using kv cache, the mask needs to match the kv cache size
@@ -11037,7 +11146,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                 float f = -INFINITY;
                 for (int s = 0; s < batch.n_seq_id[i]; ++s) {
                     if (batch.seq_id[i][s] == seq_id) {
-
+                        if (hparams.use_alibi) {
+                            f = -fabs(batch.pos[i] - batch.pos[j]);
+                        } else {
+                            f = 0.0f;
+                        }
                         break;
                     }
                 }
@@ -11053,21 +11166,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
-    // this allows to process multiple sequences in parallel with ALiBi-based models
-    if (hparams.use_alibi) {
-        const int64_t n_kv = kv_self.n;
-
-        GGML_ASSERT(lctx.inp_KQ_pos);
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
-
-        float * data = (float *) lctx.inp_KQ_pos->data;
-
-        for (int i = 0; i < n_kv; ++i) {
-            data[i] = float(lctx.kv_self.cells[i].pos);
-        }
-    }
-
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
@@ -11437,7 +11535,8 @@ static int llama_decode_internal(
             // a heuristic, to avoid attending the full cache if it is not yet utilized
             // after enough generations, the benefit from this heuristic disappears
             // if we start defragmenting the cache, the benefit from this will be more important
-
+            const uint32_t pad = llama_kv_cache_get_padding(cparams);
+            kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
@@ -11952,7 +12051,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto& token_data = vocab.id_to_token.at(id);
+    const auto & token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
             auto buf = token_data.text.substr(3, 2);
@@ -12182,12 +12281,14 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
+        bool ignore_merges = false;
 
        std::vector<std::string> word_collection;
        switch (vocab.type) {
            case LLAMA_VOCAB_TYPE_BPE:
                switch (vocab.type_pre) {
                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        ignore_merges = true;
                        word_collection = unicode_regex_split(text, {
                            // original regex from tokenizer.json
                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12196,6 +12297,12 @@ struct llm_tokenizer_bpe {
                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                        });
                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        word_collection = unicode_regex_split(text, {
+                            // same as llama3
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                        word_collection = unicode_regex_split(text, {
                            "[\r\n]",
@@ -12212,14 +12319,13 @@ struct llm_tokenizer_bpe {
                            "\\s?\\p{L}+",
                            "\\s?\\p{P}+",
                            "[一-龥ࠀ-一가-]+",
-                            "\\p{N}
+                            "\\p{N}",
                        });
                        break;
                    case LLAMA_VOCAB_PRE_TYPE_FALCON:
                        word_collection = unicode_regex_split(text, {
                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                            "\\p{N}+",
                            "[0-9][0-9][0-9]",
                        });
                        break;
@@ -12235,11 +12341,26 @@ struct llm_tokenizer_bpe {
                        });
                        break;
                    case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+                    case LLAMA_VOCAB_PRE_TYPE_REFACT:
+                    case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                        word_collection = unicode_regex_split(text, {
+                            "\\p{N}",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
                    case LLAMA_VOCAB_PRE_TYPE_GPT2:
+                    case LLAMA_VOCAB_PRE_TYPE_OLMO:
                        word_collection = unicode_regex_split(text, {
                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                        });
                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                    default:
                        // default regex for BPE tokenization pre-processing
                        word_collection = unicode_regex_split(text, {
@@ -12265,6 +12386,11 @@ struct llm_tokenizer_bpe {
|
|
12265
12386
|
int index = 0;
|
12266
12387
|
size_t offset = 0;
|
12267
12388
|
|
12389
|
+
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
12390
|
+
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
12391
|
+
offset = word.size();
|
12392
|
+
}
|
12393
|
+
|
12268
12394
|
while (offset < word.size()) {
|
12269
12395
|
llm_symbol sym;
|
12270
12396
|
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
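With ignore_merges set (LLaMA-3-style vocabularies), a word that is already a vocabulary token is emitted as a single symbol and the BPE merge loop is skipped. A rough standalone sketch of that shortcut, using a plain unordered_map as a stand-in for the real vocab structure:

#include <string>
#include <unordered_map>
#include <vector>

// If the whole pre-tokenized word is already a vocabulary token, emit it as a
// single id and skip the merge loop; otherwise the caller falls back to
// splitting the word into UTF-8 characters and merging as usual.
static bool try_whole_word(const std::unordered_map<std::string, int> & token_to_id,
                           const std::string & word,
                           std::vector<int> & out) {
    auto it = token_to_id.find(word);
    if (it == token_to_id.end()) {
        return false;
    }
    out.push_back(it->second);
    return true;
}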
@@ -12450,16 +12576,16 @@ struct llm_tokenizer_wpm {
|
|
12450
12576
|
// to lowercase, pad chinese characters, pad punctuation
|
12451
12577
|
std::string new_str = "";
|
12452
12578
|
for (uint32_t code : cpts_nfd) {
|
12453
|
-
|
12454
|
-
if (
|
12579
|
+
const codepoint_flags flags = unicode_cpt_flags(code);
|
12580
|
+
if (flags.is_accent_mark || flags.is_control) {
|
12455
12581
|
continue;
|
12456
12582
|
}
|
12457
12583
|
code = unicode_tolower(code);
|
12458
|
-
if (
|
12584
|
+
if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
|
12459
12585
|
code = ' ';
|
12460
12586
|
}
|
12461
12587
|
std::string s = unicode_cpt_to_utf8(code);
|
12462
|
-
if (
|
12588
|
+
if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
|
12463
12589
|
new_str += " ";
|
12464
12590
|
new_str += s;
|
12465
12591
|
new_str += " ";
|
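The WPM path now derives its normalization from codepoint_flags: accents and control characters are dropped, separators and whitespace become spaces, and punctuation or CJK characters get padded with spaces. An ASCII-only toy that mirrors the shaping without the unicode tables (the function name here is made up):

#include <cctype>
#include <string>

// ASCII-only illustration of the shaping: lowercase, drop control characters,
// map whitespace to single spaces, and surround punctuation with spaces so it
// becomes its own word.
static std::string wpm_like_normalize(const std::string & text) {
    std::string out;
    for (unsigned char c : text) {
        if (std::iscntrl(c)) {
            continue;
        }
        c = (unsigned char) std::tolower(c);
        if (std::isspace(c)) {
            out += ' ';
        } else if (std::ispunct(c)) {
            out += ' ';
            out += (char) c;
            out += ' ';
        } else {
            out += (char) c;
        }
    }
    return out;
}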
@@ -12693,6 +12819,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12693
12819
|
}
|
12694
12820
|
}
|
12695
12821
|
|
12822
|
+
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
12823
|
+
LLAMA_LOG_WARN(
|
12824
|
+
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
12825
|
+
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
12826
|
+
"Are you sure this is what you want?\n", __FUNCTION__);
|
12827
|
+
}
|
12828
|
+
|
12696
12829
|
if (add_special && vocab.special_add_eos == 1) {
|
12697
12830
|
GGML_ASSERT(vocab.special_eos_id != -1);
|
12698
12831
|
output.push_back(vocab.special_eos_id);
|
@@ -12719,7 +12852,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12719
12852
|
}
|
12720
12853
|
}
|
12721
12854
|
|
12722
|
-
|
12855
|
+
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
12856
|
+
LLAMA_LOG_WARN(
|
12857
|
+
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
12858
|
+
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
12859
|
+
"Are you sure this is what you want?\n", __FUNCTION__);
|
12860
|
+
}
|
12861
|
+
|
12862
|
+
if (add_special && vocab.special_add_eos == 1) {
|
12863
|
+
GGML_ASSERT(vocab.special_add_eos != -1);
|
12864
|
+
output.push_back(vocab.special_eos_id);
|
12865
|
+
}
|
12723
12866
|
} break;
|
12724
12867
|
case LLAMA_VOCAB_TYPE_WPM:
|
12725
12868
|
{
|
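Both vocab branches now warn when the model asks for a BOS token and the prompt already begins with one, so the final sequence starts with two BOS tokens. The check reduces to a prefix test on the token vector, sketched here with an illustrative bos_id parameter:

#include <vector>

// After tokenization with add_special == true, output[0] is the BOS the model
// requested; if output[1] is also BOS, the caller's prompt already began with
// one and the warning above fires.
static bool has_double_bos(const std::vector<int> & output, int bos_id) {
    return output.size() >= 2 && output[0] == bos_id && output[1] == bos_id;
}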
@@ -13073,6 +13216,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
|
13073
13216
|
return rejects;
|
13074
13217
|
}
|
13075
13218
|
|
13219
|
+
static bool llama_grammar_detect_left_recursion(
|
13220
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
13221
|
+
size_t rule_index,
|
13222
|
+
std::vector<bool> * rules_visited,
|
13223
|
+
std::vector<bool> * rules_in_progress,
|
13224
|
+
std::vector<bool> * rules_may_be_empty) {
|
13225
|
+
if ((*rules_in_progress)[rule_index]) {
|
13226
|
+
return true;
|
13227
|
+
}
|
13228
|
+
|
13229
|
+
(*rules_in_progress)[rule_index] = true;
|
13230
|
+
|
13231
|
+
const std::vector<llama_grammar_element> & rule = rules[rule_index];
|
13232
|
+
|
13233
|
+
// First check if the rule might produce the empty string. This could be done combined with the second
|
13234
|
+
// step but it's more readable as two steps.
|
13235
|
+
bool at_rule_start = true;
|
13236
|
+
for (size_t i = 0; i < rule.size(); i++) {
|
13237
|
+
if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
13238
|
+
if (at_rule_start) {
|
13239
|
+
(*rules_may_be_empty)[rule_index] = true;
|
13240
|
+
break;
|
13241
|
+
}
|
13242
|
+
at_rule_start = true;
|
13243
|
+
} else {
|
13244
|
+
at_rule_start = false;
|
13245
|
+
}
|
13246
|
+
}
|
13247
|
+
|
13248
|
+
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
|
13249
|
+
// be empty)
|
13250
|
+
bool recurse_into_nonterminal = true;
|
13251
|
+
for (size_t i = 0; i < rule.size(); i++) {
|
13252
|
+
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
|
13253
|
+
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
|
13254
|
+
return true;
|
13255
|
+
}
|
13256
|
+
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
|
13257
|
+
recurse_into_nonterminal = false;
|
13258
|
+
}
|
13259
|
+
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
13260
|
+
recurse_into_nonterminal = true;
|
13261
|
+
} else {
|
13262
|
+
recurse_into_nonterminal = false;
|
13263
|
+
}
|
13264
|
+
}
|
13265
|
+
|
13266
|
+
(*rules_in_progress)[rule_index] = false;
|
13267
|
+
(*rules_visited)[rule_index] = true;
|
13268
|
+
return false;
|
13269
|
+
}
|
13270
|
+
|
13076
13271
|
//
|
13077
13272
|
// grammar - external
|
13078
13273
|
//
|
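llama_grammar_detect_left_recursion is a depth-first walk with an "in progress" set, plus bookkeeping for rules that may derive the empty string, so that a rule can still be left-recursive through an empty leading nonterminal. A self-contained toy version on a name-based grammar, which deliberately omits the empty-rule handling:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Toy grammar: rule name -> alternatives; a symbol that names another rule is a
// nonterminal reference, anything else is treated as a terminal.
using toy_grammar = std::map<std::string, std::vector<std::vector<std::string>>>;

// Depth-first search for a cycle through leftmost positions. The real detector
// additionally tracks rules that may derive the empty string and keeps walking
// past them; that part is omitted here for brevity.
static bool is_left_recursive(const toy_grammar & g, const std::string & rule,
                              std::map<std::string, int> & state) { // 0 = new, 1 = in progress, 2 = done
    if (state[rule] == 1) return true;
    if (state[rule] == 2) return false;
    state[rule] = 1;
    for (const auto & alt : g.at(rule)) {
        if (!alt.empty() && g.count(alt[0]) && is_left_recursive(g, alt[0], state)) {
            return true;
        }
    }
    state[rule] = 2;
    return false;
}

int main() {
    // root ::= root "a" | "a"  -- left-recursive through the first alternative
    toy_grammar g = { { "root", { { "root", "a" }, { "a" } } } };
    std::map<std::string, int> state;
    std::printf("left recursive: %s\n", is_left_recursive(g, "root", state) ? "yes" : "no");
    return 0;
}

With this check in place, llama_grammar_init rejects such grammars up front with a runtime_error instead of looping forever while building the initial stacks.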
@@ -13092,6 +13287,19 @@ struct llama_grammar * llama_grammar_init(
|
|
13092
13287
|
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
13093
13288
|
}
|
13094
13289
|
|
13290
|
+
// Check for left recursion
|
13291
|
+
std::vector<bool> rules_visited(n_rules);
|
13292
|
+
std::vector<bool> rules_in_progress(n_rules);
|
13293
|
+
std::vector<bool> rules_may_be_empty(n_rules);
|
13294
|
+
for (size_t i = 0; i < n_rules; i++) {
|
13295
|
+
if (rules_visited[i]) {
|
13296
|
+
continue;
|
13297
|
+
}
|
13298
|
+
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
|
13299
|
+
throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
|
13300
|
+
}
|
13301
|
+
}
|
13302
|
+
|
13095
13303
|
// loop over alternates of start rule to build initial stacks
|
13096
13304
|
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
13097
13305
|
pos = vec_rules[start_rule_index].data();
|
@@ -13114,6 +13322,9 @@ struct llama_grammar * llama_grammar_init(
|
|
13114
13322
|
}
|
13115
13323
|
} while (true);
|
13116
13324
|
|
13325
|
+
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
13326
|
+
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
13327
|
+
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
13117
13328
|
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
|
13118
13329
|
}
|
13119
13330
|
|
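The new comment above is about pointer stability: stacks holds pointers into vec_rules, and moving a std::vector hands over its heap buffer (element addresses survive), while copying would allocate a new buffer and leave those pointers dangling. A tiny demonstration of the difference:

#include <cstdio>
#include <utility>
#include <vector>

int main() {
    std::vector<int> rules = { 1, 2, 3 };
    const int * p = rules.data();               // pointer into the original heap buffer

    std::vector<int> copied = rules;            // copy: new buffer, p does not point into it
    std::vector<int> moved  = std::move(rules); // move: same buffer, p still points at moved[0]

    std::printf("points into copy: %d\n", p == copied.data()); // 0
    std::printf("points into move: %d\n", p == moved.data());  // 1
    return 0;
}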
@@ -13708,9 +13919,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
|
13708
13919
|
|
13709
13920
|
// Sample the next word X using top-k sampling
|
13710
13921
|
llama_sample_top_k(nullptr, candidates, int(k), 1);
|
13711
|
-
|
13712
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
13713
|
-
}
|
13922
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
13714
13923
|
llama_token X = llama_sample_token(ctx, candidates);
|
13715
13924
|
t_start_sample_us = ggml_time_us();
|
13716
13925
|
|
@@ -13724,9 +13933,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
|
13724
13933
|
// Update mu using the learning rate and error
|
13725
13934
|
*mu = *mu - eta * e;
|
13726
13935
|
|
13727
|
-
|
13728
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
13729
|
-
}
|
13936
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
13730
13937
|
return X;
|
13731
13938
|
}
|
13732
13939
|
|
@@ -14142,13 +14349,16 @@ static void llama_tensor_dequantize_internal(
|
|
14142
14349
|
if (qtype.to_float == NULL) {
|
14143
14350
|
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
14144
14351
|
}
|
14145
|
-
} else if (tensor->type != GGML_TYPE_F16) {
|
14352
|
+
} else if (tensor->type != GGML_TYPE_F16 &&
|
14353
|
+
tensor->type != GGML_TYPE_BF16) {
|
14146
14354
|
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
|
14147
14355
|
}
|
14148
14356
|
|
14149
14357
|
if (nthread < 2) {
|
14150
14358
|
if (tensor->type == GGML_TYPE_F16) {
|
14151
14359
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
|
14360
|
+
} else if (tensor->type == GGML_TYPE_BF16) {
|
14361
|
+
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
|
14152
14362
|
} else if (ggml_is_quantized(tensor->type)) {
|
14153
14363
|
qtype.to_float(tensor->data, f32_output, nelements);
|
14154
14364
|
} else {
|
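The new GGML_TYPE_BF16 branches convert bfloat16 rows to float before requantizing. bfloat16 is simply the upper 16 bits of an IEEE-754 binary32, so the scalar conversion is a 16-bit shift; a sketch of what a row conversion does (the real ggml_bf16_to_fp32_row has its own types and may be vectorized):

#include <cstdint>
#include <cstring>

// Widen one bfloat16 value to float: place the stored 16 bits in the upper half
// of a 32-bit pattern and reinterpret it as an IEEE-754 float.
static float bf16_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Row helper in the spirit of the call above (sketch only, not the real signature).
static void bf16_row_to_f32(const uint16_t * src, float * dst, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        dst[i] = bf16_to_f32(src[i]);
    }
}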
@@ -14157,7 +14367,14 @@ static void llama_tensor_dequantize_internal(
|
|
14157
14367
|
return;
|
14158
14368
|
}
|
14159
14369
|
|
14160
|
-
size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
|
14370
|
+
size_t block_size;
|
14371
|
+
if (tensor->type == GGML_TYPE_F16 ||
|
14372
|
+
tensor->type == GGML_TYPE_BF16) {
|
14373
|
+
block_size = 1;
|
14374
|
+
} else {
|
14375
|
+
block_size = (size_t)ggml_blck_size(tensor->type);
|
14376
|
+
}
|
14377
|
+
|
14161
14378
|
size_t block_size_bytes = ggml_type_size(tensor->type);
|
14162
14379
|
|
14163
14380
|
GGML_ASSERT(nelements % block_size == 0);
|
@@ -14176,6 +14393,8 @@ static void llama_tensor_dequantize_internal(
|
|
14176
14393
|
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
14177
14394
|
if (typ == GGML_TYPE_F16) {
|
14178
14395
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
14396
|
+
} else if (typ == GGML_TYPE_BF16) {
|
14397
|
+
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
|
14179
14398
|
} else {
|
14180
14399
|
qtype.to_float(inbuf, outbuf, nels);
|
14181
14400
|
}
|
@@ -14536,6 +14755,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14536
14755
|
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
|
14537
14756
|
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
|
14538
14757
|
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
|
14758
|
+
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
|
14539
14759
|
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
|
14540
14760
|
|
14541
14761
|
// K-quants
|
@@ -15200,6 +15420,7 @@ struct llama_model_params llama_model_default_params() {
|
|
15200
15420
|
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
15201
15421
|
/*.main_gpu =*/ 0,
|
15202
15422
|
/*.tensor_split =*/ nullptr,
|
15423
|
+
/*.rpc_servers =*/ nullptr,
|
15203
15424
|
/*.progress_callback =*/ nullptr,
|
15204
15425
|
/*.progress_callback_user_data =*/ nullptr,
|
15205
15426
|
/*.kv_overrides =*/ nullptr,
|
@@ -15270,7 +15491,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
15270
15491
|
}
|
15271
15492
|
|
15272
15493
|
size_t llama_max_devices(void) {
|
15273
|
-
#if defined(GGML_USE_METAL)
|
15494
|
+
#if defined(GGML_USE_RPC)
|
15495
|
+
return GGML_RPC_MAX_SERVERS;
|
15496
|
+
#elif defined(GGML_USE_METAL)
|
15274
15497
|
return 1;
|
15275
15498
|
#elif defined(GGML_USE_CUDA)
|
15276
15499
|
return GGML_CUDA_MAX_DEVICES;
|
@@ -15293,7 +15516,7 @@ bool llama_supports_mlock(void) {
|
|
15293
15516
|
|
15294
15517
|
bool llama_supports_gpu_offload(void) {
|
15295
15518
|
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
15296
|
-
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
15519
|
+
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
15297
15520
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
15298
15521
|
return true;
|
15299
15522
|
#else
|
@@ -15356,7 +15579,17 @@ struct llama_model * llama_load_model_from_file(
|
|
15356
15579
|
return true;
|
15357
15580
|
};
|
15358
15581
|
}
|
15359
|
-
|
15582
|
+
if (params.rpc_servers != nullptr) {
|
15583
|
+
// split the servers set them into model->rpc_servers
|
15584
|
+
std::string servers(params.rpc_servers);
|
15585
|
+
size_t pos = 0;
|
15586
|
+
while ((pos = servers.find(",")) != std::string::npos) {
|
15587
|
+
std::string server = servers.substr(0, pos);
|
15588
|
+
model->rpc_servers.push_back(server);
|
15589
|
+
servers.erase(0, pos + 1);
|
15590
|
+
}
|
15591
|
+
model->rpc_servers.push_back(servers);
|
15592
|
+
}
|
15360
15593
|
int status = llama_model_load(path_model, *model, params);
|
15361
15594
|
GGML_ASSERT(status <= 0);
|
15362
15595
|
if (status < 0) {
|
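params.rpc_servers is parsed above as a comma-separated host:port list and stored on the model, one entry per RPC backend. A hedged usage sketch, assuming a build with GGML_USE_RPC enabled; the addresses and model path are placeholders:

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    // Offload layers to two hypothetical ggml RPC servers; the addresses and the
    // model path below are examples only.
    mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052";
    mparams.n_gpu_layers = 99;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}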
@@ -15395,6 +15628,11 @@ struct llama_context * llama_new_context_with_model(
|
|
15395
15628
|
return nullptr;
|
15396
15629
|
}
|
15397
15630
|
|
15631
|
+
if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
|
15632
|
+
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
15633
|
+
params.flash_attn = false;
|
15634
|
+
}
|
15635
|
+
|
15398
15636
|
llama_context * ctx = new llama_context(*model);
|
15399
15637
|
|
15400
15638
|
const auto & hparams = model->hparams;
|
@@ -15418,7 +15656,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15418
15656
|
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
15419
15657
|
|
15420
15658
|
// this is necessary due to kv_self.n being padded later during inference
|
15421
|
-
cparams.n_ctx = GGML_PAD(cparams.n_ctx,
|
15659
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
|
15422
15660
|
|
15423
15661
|
// with causal attention, the batch size is limited by the context size
|
15424
15662
|
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
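The context size is still padded, but the padding value now comes from llama_kv_cache_get_padding(cparams) instead of a hard-coded constant. The rounding itself is ceil-to-a-multiple; a small sketch of the arithmetic (not ggml's actual GGML_PAD macro):

#include <cstdio>

// Round x up to the next multiple of n (n > 0).
static unsigned pad_to_multiple(unsigned x, unsigned n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    std::printf("%u\n", pad_to_multiple(1000, 256)); // prints 1024
    return 0;
}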
@@ -15463,23 +15701,6 @@ struct llama_context * llama_new_context_with_model(
|
|
15463
15701
|
}
|
15464
15702
|
}
|
15465
15703
|
|
15466
|
-
if (cparams.flash_attn && hparams.use_alibi) {
|
15467
|
-
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
|
15468
|
-
cparams.flash_attn = false;
|
15469
|
-
}
|
15470
|
-
|
15471
|
-
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
|
15472
|
-
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
15473
|
-
cparams.flash_attn = false;
|
15474
|
-
}
|
15475
|
-
|
15476
|
-
#ifdef GGML_USE_HIPBLAS
|
15477
|
-
if (cparams.flash_attn) {
|
15478
|
-
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
|
15479
|
-
cparams.flash_attn = false;
|
15480
|
-
}
|
15481
|
-
#endif
|
15482
|
-
|
15483
15704
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
15484
15705
|
params.seed = time(NULL);
|
15485
15706
|
}
|
@@ -15515,7 +15736,17 @@ struct llama_context * llama_new_context_with_model(
|
|
15515
15736
|
|
15516
15737
|
if (!hparams.vocab_only) {
|
15517
15738
|
// initialize backends
|
15518
|
-
#if defined(GGML_USE_METAL)
|
15739
|
+
#if defined(GGML_USE_RPC)
|
15740
|
+
for (auto & server : model->rpc_servers) {
|
15741
|
+
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
15742
|
+
if (backend == nullptr) {
|
15743
|
+
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
15744
|
+
llama_free(ctx);
|
15745
|
+
return nullptr;
|
15746
|
+
}
|
15747
|
+
ctx->backends.push_back(backend);
|
15748
|
+
}
|
15749
|
+
#elif defined(GGML_USE_METAL)
|
15519
15750
|
if (model->n_gpu_layers > 0) {
|
15520
15751
|
ctx->backend_metal = ggml_backend_metal_init();
|
15521
15752
|
if (ctx->backend_metal == nullptr) {
|
@@ -15671,7 +15902,11 @@ struct llama_context * llama_new_context_with_model(
|
|
15671
15902
|
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
15672
15903
|
|
15673
15904
|
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
15674
|
-
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER && params.offload_kqv;
|
15905
|
+
bool pipeline_parallel =
|
15906
|
+
llama_get_device_count(*model) > 1 &&
|
15907
|
+
model->n_gpu_layers > (int)model->hparams.n_layer &&
|
15908
|
+
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
15909
|
+
params.offload_kqv;
|
15675
15910
|
#ifndef GGML_USE_CUDA
|
15676
15911
|
// pipeline parallelism requires support for async compute and events
|
15677
15912
|
// currently this is only implemented in the CUDA backend
|
@@ -15769,6 +16004,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15769
16004
|
case LLM_ARCH_REFACT:
|
15770
16005
|
case LLM_ARCH_BLOOM:
|
15771
16006
|
case LLM_ARCH_MAMBA:
|
16007
|
+
case LLM_ARCH_JINA_BERT_V2:
|
15772
16008
|
return LLAMA_ROPE_TYPE_NONE;
|
15773
16009
|
|
15774
16010
|
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
@@ -16790,13 +17026,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
|
|
16790
17026
|
}
|
16791
17027
|
else {
|
16792
17028
|
if (cell_range_begin != kv_self.size) {
|
16793
|
-
cell_ranges.push_back({ cell_range_begin, i });
|
17029
|
+
cell_ranges.emplace_back(cell_range_begin, i);
|
16794
17030
|
cell_range_begin = kv_self.size;
|
16795
17031
|
}
|
16796
17032
|
}
|
16797
17033
|
}
|
16798
17034
|
if (cell_range_begin != kv_self.size) {
|
16799
|
-
cell_ranges.push_back({ cell_range_begin, kv_self.size });
|
17035
|
+
cell_ranges.emplace_back(cell_range_begin, kv_self.size);
|
16800
17036
|
}
|
16801
17037
|
|
16802
17038
|
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
@@ -17466,9 +17702,10 @@ int32_t llama_tokenize(
|
|
17466
17702
|
|
17467
17703
|
static std::string llama_decode_text(const std::string & text) {
|
17468
17704
|
std::string decoded_text;
|
17469
|
-
|
17470
|
-
|
17471
|
-
|
17705
|
+
|
17706
|
+
const auto cpts = unicode_cpts_from_utf8(text);
|
17707
|
+
for (const auto cpt : cpts) {
|
17708
|
+
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
|
17472
17709
|
}
|
17473
17710
|
|
17474
17711
|
return decoded_text;
|
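llama_decode_text now iterates codepoints via unicode_cpts_from_utf8 and maps each one back to the raw byte it represents, which is how byte-level BPE vocabularies round-trip arbitrary bytes. A minimal sketch of the codepoint walk, using the same leading-byte length rule as the utf8_len helper seen earlier in this file (names here are illustrative):

#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

// Length of a UTF-8 sequence from its first byte (1 for continuation/invalid bytes).
static size_t utf8_seq_len(unsigned char c) {
    if ((c & 0x80) == 0x00) return 1;
    if ((c & 0xE0) == 0xC0) return 2;
    if ((c & 0xF0) == 0xE0) return 3;
    if ((c & 0xF8) == 0xF0) return 4;
    return 1;
}

// Split a UTF-8 string into per-codepoint chunks (no validation, sketch only).
static std::vector<std::string> split_codepoints(const std::string & text) {
    std::vector<std::string> out;
    for (size_t i = 0; i < text.size(); ) {
        const size_t len = std::min(text.size() - i, utf8_seq_len((unsigned char) text[i]));
        out.push_back(text.substr(i, len));
        i += len;
    }
    return out;
}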
@@ -17832,7 +18069,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
|
17832
18069
|
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
|
17833
18070
|
|
17834
18071
|
/*.n_sample =*/ std::max(1, ctx->n_sample),
|
17835
|
-
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
|
18072
|
+
/*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
|
17836
18073
|
/*.n_eval =*/ std::max(1, ctx->n_eval),
|
17837
18074
|
};
|
17838
18075
|
|