@fugood/llama.node 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +14 -12
- package/src/llama.cpp/common/common.cpp +19 -5
- package/src/llama.cpp/common/common.h +2 -0
- package/src/llama.cpp/common/grammar-parser.cpp +9 -0
- package/src/llama.cpp/common/sampling.cpp +3 -3
- package/src/llama.cpp/common/sampling.h +1 -1
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
- package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
- package/src/llama.cpp/examples/main/main.cpp +5 -1
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
- package/src/llama.cpp/examples/server/server.cpp +12 -16
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/ggml-backend.c +2 -2
- package/src/llama.cpp/ggml-kompute.cpp +9 -3
- package/src/llama.cpp/ggml-quants.c +6 -0
- package/src/llama.cpp/ggml-rpc.cpp +1023 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +20 -143
- package/src/llama.cpp/ggml-vulkan.cpp +4 -2
- package/src/llama.cpp/ggml.c +116 -271
- package/src/llama.cpp/ggml.h +12 -15
- package/src/llama.cpp/llama.cpp +451 -265
- package/src/llama.cpp/llama.h +3 -0
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
package/src/llama.cpp/llama.cpp
CHANGED
|
@@ -7,6 +7,10 @@
|
|
|
7
7
|
#include "ggml-alloc.h"
|
|
8
8
|
#include "ggml-backend.h"
|
|
9
9
|
|
|
10
|
+
#ifdef GGML_USE_RPC
|
|
11
|
+
# include "ggml-rpc.h"
|
|
12
|
+
#endif
|
|
13
|
+
|
|
10
14
|
#ifdef GGML_USE_CUDA
|
|
11
15
|
# include "ggml-cuda.h"
|
|
12
16
|
#elif defined(GGML_USE_CLBLAST)
|
|
@@ -205,6 +209,7 @@ enum llm_arch {
|
|
|
205
209
|
LLM_ARCH_REFACT,
|
|
206
210
|
LLM_ARCH_BERT,
|
|
207
211
|
LLM_ARCH_NOMIC_BERT,
|
|
212
|
+
LLM_ARCH_JINA_BERT_V2,
|
|
208
213
|
LLM_ARCH_BLOOM,
|
|
209
214
|
LLM_ARCH_STABLELM,
|
|
210
215
|
LLM_ARCH_QWEN,
|
|
@@ -228,39 +233,40 @@ enum llm_arch {
|
|
|
228
233
|
};
|
|
229
234
|
|
|
230
235
|
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
231
|
-
{ LLM_ARCH_LLAMA, "llama"
|
|
232
|
-
{ LLM_ARCH_FALCON, "falcon"
|
|
233
|
-
{ LLM_ARCH_GROK, "grok"
|
|
234
|
-
{ LLM_ARCH_GPT2, "gpt2"
|
|
235
|
-
{ LLM_ARCH_GPTJ, "gptj"
|
|
236
|
-
{ LLM_ARCH_GPTNEOX, "gptneox"
|
|
237
|
-
{ LLM_ARCH_MPT, "mpt"
|
|
238
|
-
{ LLM_ARCH_BAICHUAN, "baichuan"
|
|
239
|
-
{ LLM_ARCH_STARCODER, "starcoder"
|
|
240
|
-
{ LLM_ARCH_PERSIMMON, "persimmon"
|
|
241
|
-
{ LLM_ARCH_REFACT, "refact"
|
|
242
|
-
{ LLM_ARCH_BERT, "bert"
|
|
243
|
-
{ LLM_ARCH_NOMIC_BERT, "nomic-bert"
|
|
244
|
-
{
|
|
245
|
-
{
|
|
246
|
-
{
|
|
247
|
-
{
|
|
248
|
-
{
|
|
249
|
-
{
|
|
250
|
-
{
|
|
251
|
-
{
|
|
252
|
-
{
|
|
253
|
-
{
|
|
254
|
-
{
|
|
255
|
-
{
|
|
256
|
-
{
|
|
257
|
-
{
|
|
258
|
-
{
|
|
259
|
-
{
|
|
260
|
-
{
|
|
261
|
-
{
|
|
262
|
-
{
|
|
263
|
-
{
|
|
236
|
+
{ LLM_ARCH_LLAMA, "llama" },
|
|
237
|
+
{ LLM_ARCH_FALCON, "falcon" },
|
|
238
|
+
{ LLM_ARCH_GROK, "grok" },
|
|
239
|
+
{ LLM_ARCH_GPT2, "gpt2" },
|
|
240
|
+
{ LLM_ARCH_GPTJ, "gptj" },
|
|
241
|
+
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
|
242
|
+
{ LLM_ARCH_MPT, "mpt" },
|
|
243
|
+
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
|
244
|
+
{ LLM_ARCH_STARCODER, "starcoder" },
|
|
245
|
+
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
|
246
|
+
{ LLM_ARCH_REFACT, "refact" },
|
|
247
|
+
{ LLM_ARCH_BERT, "bert" },
|
|
248
|
+
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
|
249
|
+
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
|
250
|
+
{ LLM_ARCH_BLOOM, "bloom" },
|
|
251
|
+
{ LLM_ARCH_STABLELM, "stablelm" },
|
|
252
|
+
{ LLM_ARCH_QWEN, "qwen" },
|
|
253
|
+
{ LLM_ARCH_QWEN2, "qwen2" },
|
|
254
|
+
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
|
|
255
|
+
{ LLM_ARCH_PHI2, "phi2" },
|
|
256
|
+
{ LLM_ARCH_PHI3, "phi3" },
|
|
257
|
+
{ LLM_ARCH_PLAMO, "plamo" },
|
|
258
|
+
{ LLM_ARCH_CODESHELL, "codeshell" },
|
|
259
|
+
{ LLM_ARCH_ORION, "orion" },
|
|
260
|
+
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
|
261
|
+
{ LLM_ARCH_MINICPM, "minicpm" },
|
|
262
|
+
{ LLM_ARCH_GEMMA, "gemma" },
|
|
263
|
+
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
|
264
|
+
{ LLM_ARCH_MAMBA, "mamba" },
|
|
265
|
+
{ LLM_ARCH_XVERSE, "xverse" },
|
|
266
|
+
{ LLM_ARCH_COMMAND_R, "command-r" },
|
|
267
|
+
{ LLM_ARCH_DBRX, "dbrx" },
|
|
268
|
+
{ LLM_ARCH_OLMO, "olmo" },
|
|
269
|
+
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
|
264
270
|
};
|
|
265
271
|
|
|
266
272
|
enum llm_kv {
|
|
@@ -691,6 +697,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
|
691
697
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
692
698
|
},
|
|
693
699
|
},
|
|
700
|
+
{
|
|
701
|
+
LLM_ARCH_JINA_BERT_V2,
|
|
702
|
+
{
|
|
703
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
704
|
+
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
|
705
|
+
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
|
706
|
+
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
|
707
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
708
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
|
709
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
710
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
|
711
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
712
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
713
|
+
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
|
714
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
715
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
716
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
717
|
+
},
|
|
718
|
+
},
|
|
694
719
|
{
|
|
695
720
|
LLM_ARCH_BLOOM,
|
|
696
721
|
{
|
|
@@ -1664,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
|
|
1664
1689
|
GGML_UNUSED(host_buffer);
|
|
1665
1690
|
}
|
|
1666
1691
|
|
|
1667
|
-
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
|
1668
|
-
ggml_backend_buffer_type_t buft = nullptr;
|
|
1669
|
-
|
|
1670
|
-
#ifdef GGML_USE_METAL
|
|
1671
|
-
buft = ggml_backend_metal_buffer_type();
|
|
1672
|
-
#elif defined(GGML_USE_CUDA)
|
|
1673
|
-
buft = ggml_backend_cuda_buffer_type(gpu);
|
|
1674
|
-
#elif defined(GGML_USE_VULKAN)
|
|
1675
|
-
buft = ggml_backend_vk_buffer_type(gpu);
|
|
1676
|
-
#elif defined(GGML_USE_SYCL)
|
|
1677
|
-
buft = ggml_backend_sycl_buffer_type(gpu);
|
|
1678
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
1679
|
-
buft = ggml_backend_opencl_buffer_type();
|
|
1680
|
-
#elif defined(GGML_USE_KOMPUTE)
|
|
1681
|
-
buft = ggml_backend_kompute_buffer_type(gpu);
|
|
1682
|
-
if (buft == nullptr) {
|
|
1683
|
-
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
|
1684
|
-
}
|
|
1685
|
-
#endif
|
|
1686
|
-
|
|
1687
|
-
if (buft == nullptr) {
|
|
1688
|
-
buft = llama_default_buffer_type_cpu(true);
|
|
1689
|
-
}
|
|
1690
|
-
return buft;
|
|
1691
|
-
|
|
1692
|
-
GGML_UNUSED(gpu);
|
|
1693
|
-
}
|
|
1694
|
-
|
|
1695
|
-
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
|
|
1696
|
-
ggml_backend_buffer_type_t buft = nullptr;
|
|
1697
|
-
|
|
1698
|
-
#ifdef GGML_USE_CUDA
|
|
1699
|
-
if (ggml_backend_cuda_get_device_count() > 1) {
|
|
1700
|
-
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
|
1701
|
-
}
|
|
1702
|
-
#endif
|
|
1703
|
-
|
|
1704
|
-
#ifdef GGML_USE_SYCL
|
|
1705
|
-
if (ggml_backend_sycl_get_device_count() > 1) {
|
|
1706
|
-
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
|
|
1707
|
-
}
|
|
1708
|
-
#endif
|
|
1709
|
-
|
|
1710
|
-
if (buft == nullptr) {
|
|
1711
|
-
buft = llama_default_buffer_type_offload(fallback_gpu);
|
|
1712
|
-
}
|
|
1713
|
-
return buft;
|
|
1714
|
-
|
|
1715
|
-
GGML_UNUSED(tensor_split);
|
|
1716
|
-
}
|
|
1717
|
-
|
|
1718
|
-
static size_t llama_get_device_count() {
|
|
1719
|
-
#if defined(GGML_USE_CUDA)
|
|
1720
|
-
return ggml_backend_cuda_get_device_count();
|
|
1721
|
-
#elif defined(GGML_USE_SYCL)
|
|
1722
|
-
return ggml_backend_sycl_get_device_count();
|
|
1723
|
-
#elif defined(GGML_USE_VULKAN)
|
|
1724
|
-
return ggml_backend_vk_get_device_count();
|
|
1725
|
-
#else
|
|
1726
|
-
return 1;
|
|
1727
|
-
#endif
|
|
1728
|
-
}
|
|
1729
|
-
|
|
1730
|
-
static size_t llama_get_device_memory(int device) {
|
|
1731
|
-
#if defined(GGML_USE_CUDA)
|
|
1732
|
-
size_t total;
|
|
1733
|
-
size_t free;
|
|
1734
|
-
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
|
1735
|
-
return free;
|
|
1736
|
-
#elif defined(GGML_USE_SYCL)
|
|
1737
|
-
size_t total;
|
|
1738
|
-
size_t free;
|
|
1739
|
-
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
|
1740
|
-
return free;
|
|
1741
|
-
#elif defined(GGML_USE_VULKAN)
|
|
1742
|
-
size_t total;
|
|
1743
|
-
size_t free;
|
|
1744
|
-
ggml_backend_vk_get_device_memory(device, &free, &total);
|
|
1745
|
-
return free;
|
|
1746
|
-
#else
|
|
1747
|
-
return 1;
|
|
1748
|
-
GGML_UNUSED(device);
|
|
1749
|
-
#endif
|
|
1750
|
-
}
|
|
1751
|
-
|
|
1752
1692
|
//
|
|
1753
1693
|
// globals
|
|
1754
1694
|
//
|
|
@@ -1845,7 +1785,7 @@ struct llama_hparams {
|
|
|
1845
1785
|
float f_logit_scale = 0.0f;
|
|
1846
1786
|
|
|
1847
1787
|
bool causal_attn = true;
|
|
1848
|
-
bool use_alibi = false;
|
|
1788
|
+
bool use_alibi = false;
|
|
1849
1789
|
|
|
1850
1790
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
|
1851
1791
|
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
|
@@ -2189,6 +2129,8 @@ struct llama_model {
|
|
|
2189
2129
|
int main_gpu;
|
|
2190
2130
|
int n_gpu_layers;
|
|
2191
2131
|
|
|
2132
|
+
std::vector<std::string> rpc_servers;
|
|
2133
|
+
|
|
2192
2134
|
// gguf metadata
|
|
2193
2135
|
std::unordered_map<std::string, std::string> gguf_kv;
|
|
2194
2136
|
|
|
@@ -2317,7 +2259,6 @@ struct llama_context {
|
|
|
2317
2259
|
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
|
2318
2260
|
struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
|
|
2319
2261
|
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
|
|
2320
|
-
struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
|
|
2321
2262
|
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
|
|
2322
2263
|
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
|
|
2323
2264
|
struct ggml_tensor * inp_cls; // I32 [n_batch]
|
|
@@ -2333,6 +2274,104 @@ struct llama_context {
|
|
|
2333
2274
|
#endif
|
|
2334
2275
|
};
|
|
2335
2276
|
|
|
2277
|
+
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
|
2278
|
+
ggml_backend_buffer_type_t buft = nullptr;
|
|
2279
|
+
|
|
2280
|
+
#ifdef GGML_USE_RPC
|
|
2281
|
+
std::string endpoint = model.rpc_servers[gpu];
|
|
2282
|
+
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
|
|
2283
|
+
#elif defined(GGML_USE_METAL)
|
|
2284
|
+
buft = ggml_backend_metal_buffer_type();
|
|
2285
|
+
#elif defined(GGML_USE_CUDA)
|
|
2286
|
+
buft = ggml_backend_cuda_buffer_type(gpu);
|
|
2287
|
+
#elif defined(GGML_USE_VULKAN)
|
|
2288
|
+
buft = ggml_backend_vk_buffer_type(gpu);
|
|
2289
|
+
#elif defined(GGML_USE_SYCL)
|
|
2290
|
+
buft = ggml_backend_sycl_buffer_type(gpu);
|
|
2291
|
+
#elif defined(GGML_USE_CLBLAST)
|
|
2292
|
+
buft = ggml_backend_opencl_buffer_type();
|
|
2293
|
+
#elif defined(GGML_USE_KOMPUTE)
|
|
2294
|
+
buft = ggml_backend_kompute_buffer_type(gpu);
|
|
2295
|
+
if (buft == nullptr) {
|
|
2296
|
+
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
|
2297
|
+
}
|
|
2298
|
+
#endif
|
|
2299
|
+
|
|
2300
|
+
if (buft == nullptr) {
|
|
2301
|
+
buft = llama_default_buffer_type_cpu(true);
|
|
2302
|
+
}
|
|
2303
|
+
return buft;
|
|
2304
|
+
GGML_UNUSED(model);
|
|
2305
|
+
GGML_UNUSED(gpu);
|
|
2306
|
+
}
|
|
2307
|
+
|
|
2308
|
+
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
|
|
2309
|
+
ggml_backend_buffer_type_t buft = nullptr;
|
|
2310
|
+
|
|
2311
|
+
#ifdef GGML_USE_CUDA
|
|
2312
|
+
if (ggml_backend_cuda_get_device_count() > 1) {
|
|
2313
|
+
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
|
2314
|
+
}
|
|
2315
|
+
#endif
|
|
2316
|
+
|
|
2317
|
+
#ifdef GGML_USE_SYCL
|
|
2318
|
+
if (ggml_backend_sycl_get_device_count() > 1) {
|
|
2319
|
+
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
|
|
2320
|
+
}
|
|
2321
|
+
#endif
|
|
2322
|
+
|
|
2323
|
+
if (buft == nullptr) {
|
|
2324
|
+
buft = llama_default_buffer_type_offload(model, fallback_gpu);
|
|
2325
|
+
}
|
|
2326
|
+
return buft;
|
|
2327
|
+
|
|
2328
|
+
GGML_UNUSED(tensor_split);
|
|
2329
|
+
}
|
|
2330
|
+
|
|
2331
|
+
static size_t llama_get_device_count(const llama_model & model) {
|
|
2332
|
+
#if defined(GGML_USE_RPC)
|
|
2333
|
+
return model.rpc_servers.size();
|
|
2334
|
+
#elif defined(GGML_USE_CUDA)
|
|
2335
|
+
return ggml_backend_cuda_get_device_count();
|
|
2336
|
+
#elif defined(GGML_USE_SYCL)
|
|
2337
|
+
return ggml_backend_sycl_get_device_count();
|
|
2338
|
+
#elif defined(GGML_USE_VULKAN)
|
|
2339
|
+
return ggml_backend_vk_get_device_count();
|
|
2340
|
+
#else
|
|
2341
|
+
return 1;
|
|
2342
|
+
#endif
|
|
2343
|
+
GGML_UNUSED(model);
|
|
2344
|
+
}
|
|
2345
|
+
|
|
2346
|
+
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|
2347
|
+
#if defined(GGML_USE_RPC)
|
|
2348
|
+
size_t total;
|
|
2349
|
+
size_t free;
|
|
2350
|
+
std::string endpoint = model.rpc_servers[device];
|
|
2351
|
+
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
|
|
2352
|
+
return free;
|
|
2353
|
+
#elif defined(GGML_USE_CUDA)
|
|
2354
|
+
size_t total;
|
|
2355
|
+
size_t free;
|
|
2356
|
+
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
|
2357
|
+
return free;
|
|
2358
|
+
#elif defined(GGML_USE_SYCL)
|
|
2359
|
+
size_t total;
|
|
2360
|
+
size_t free;
|
|
2361
|
+
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
|
2362
|
+
return free;
|
|
2363
|
+
#elif defined(GGML_USE_VULKAN)
|
|
2364
|
+
size_t total;
|
|
2365
|
+
size_t free;
|
|
2366
|
+
ggml_backend_vk_get_device_memory(device, &free, &total);
|
|
2367
|
+
return free;
|
|
2368
|
+
#else
|
|
2369
|
+
return 1;
|
|
2370
|
+
#endif
|
|
2371
|
+
GGML_UNUSED(model);
|
|
2372
|
+
GGML_UNUSED(device);
|
|
2373
|
+
}
|
|
2374
|
+
|
|
2336
2375
|
//
|
|
2337
2376
|
// kv cache helpers
|
|
2338
2377
|
//
|
|
@@ -2785,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
|
|
|
2785
2824
|
cache.do_defrag = true;
|
|
2786
2825
|
}
|
|
2787
2826
|
|
|
2827
|
+
static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
|
|
2828
|
+
// the FA kernels require padding to avoid extra runtime boundary checks
|
|
2829
|
+
return cparams.flash_attn ? 256u : 32u;
|
|
2830
|
+
}
|
|
2831
|
+
|
|
2788
2832
|
//
|
|
2789
2833
|
// model loading and saving
|
|
2790
2834
|
//
|
|
@@ -3779,6 +3823,12 @@ static void llm_load_hparams(
|
|
|
3779
3823
|
|
|
3780
3824
|
// get hparams kv
|
|
3781
3825
|
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
|
|
3826
|
+
|
|
3827
|
+
// everything past this point is not vocab-related
|
|
3828
|
+
if (hparams.vocab_only) {
|
|
3829
|
+
return;
|
|
3830
|
+
}
|
|
3831
|
+
|
|
3782
3832
|
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
|
3783
3833
|
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
|
3784
3834
|
ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
|
|
@@ -3860,7 +3910,7 @@ static void llm_load_hparams(
|
|
|
3860
3910
|
switch (hparams.n_layer) {
|
|
3861
3911
|
case 22: model.type = e_model::MODEL_1B; break;
|
|
3862
3912
|
case 26: model.type = e_model::MODEL_3B; break;
|
|
3863
|
-
case 32: model.type = hparams.
|
|
3913
|
+
case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
|
|
3864
3914
|
case 40: model.type = e_model::MODEL_13B; break;
|
|
3865
3915
|
case 48: model.type = e_model::MODEL_34B; break;
|
|
3866
3916
|
case 60: model.type = e_model::MODEL_30B; break;
|
|
@@ -3962,6 +4012,19 @@ static void llm_load_hparams(
|
|
|
3962
4012
|
model.type = e_model::MODEL_335M; break; // bge-large
|
|
3963
4013
|
}
|
|
3964
4014
|
} break;
|
|
4015
|
+
case LLM_ARCH_JINA_BERT_V2:
|
|
4016
|
+
{
|
|
4017
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
4018
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
4019
|
+
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
|
4020
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
4021
|
+
hparams.f_max_alibi_bias = 8.0f;
|
|
4022
|
+
|
|
4023
|
+
switch (hparams.n_layer) {
|
|
4024
|
+
case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
|
|
4025
|
+
case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
|
|
4026
|
+
}
|
|
4027
|
+
} break;
|
|
3965
4028
|
case LLM_ARCH_NOMIC_BERT:
|
|
3966
4029
|
{
|
|
3967
4030
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -4383,7 +4446,11 @@ static void llm_load_vocab(
|
|
|
4383
4446
|
tokenizer_pre == "starcoder") {
|
|
4384
4447
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
|
|
4385
4448
|
} else if (
|
|
4386
|
-
tokenizer_pre == "gpt-2"
|
|
4449
|
+
tokenizer_pre == "gpt-2" ||
|
|
4450
|
+
tokenizer_pre == "jina-es" ||
|
|
4451
|
+
tokenizer_pre == "jina-de" ||
|
|
4452
|
+
tokenizer_pre == "jina-v2-es" ||
|
|
4453
|
+
tokenizer_pre == "jina-v2-de") {
|
|
4387
4454
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
|
4388
4455
|
} else if (
|
|
4389
4456
|
tokenizer_pre == "refact") {
|
|
@@ -4743,13 +4810,13 @@ static bool llm_load_tensors(
|
|
|
4743
4810
|
|
|
4744
4811
|
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
|
4745
4812
|
// calculate the split points
|
|
4746
|
-
int device_count = llama_get_device_count();
|
|
4813
|
+
int device_count = llama_get_device_count(model);
|
|
4747
4814
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
|
4748
4815
|
std::vector<float> splits(device_count);
|
|
4749
4816
|
if (all_zero) {
|
|
4750
4817
|
// default split, by free memory
|
|
4751
4818
|
for (int i = 0; i < device_count; ++i) {
|
|
4752
|
-
splits[i] = llama_get_device_memory(i);
|
|
4819
|
+
splits[i] = llama_get_device_memory(model, i);
|
|
4753
4820
|
}
|
|
4754
4821
|
} else {
|
|
4755
4822
|
std::copy(tensor_split, tensor_split + device_count, splits.begin());
|
|
@@ -4769,35 +4836,35 @@ static bool llm_load_tensors(
|
|
|
4769
4836
|
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
|
4770
4837
|
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
|
4771
4838
|
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
|
|
4772
|
-
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
|
|
4839
|
+
model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
|
|
4773
4840
|
}
|
|
4774
4841
|
// assign the output layer
|
|
4775
4842
|
if (n_gpu_layers > n_layer) {
|
|
4776
4843
|
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
|
|
4777
|
-
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
|
|
4844
|
+
model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
|
|
4778
4845
|
} else {
|
|
4779
4846
|
model.buft_output = llama_default_buffer_type_cpu(true);
|
|
4780
4847
|
}
|
|
4781
4848
|
} else {
|
|
4782
4849
|
ggml_backend_buffer_type_t split_buft;
|
|
4783
4850
|
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
|
|
4784
|
-
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
|
4851
|
+
split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
|
|
4785
4852
|
} else {
|
|
4786
4853
|
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
|
|
4787
|
-
split_buft = llama_default_buffer_type_offload(main_gpu);
|
|
4854
|
+
split_buft = llama_default_buffer_type_offload(model, main_gpu);
|
|
4788
4855
|
}
|
|
4789
4856
|
// assign the repeating layers
|
|
4790
4857
|
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
|
4791
4858
|
model.buft_layer[i] = {
|
|
4792
4859
|
split_buft,
|
|
4793
|
-
llama_default_buffer_type_offload(main_gpu)
|
|
4860
|
+
llama_default_buffer_type_offload(model, main_gpu)
|
|
4794
4861
|
};
|
|
4795
4862
|
}
|
|
4796
4863
|
// assign the output layer
|
|
4797
4864
|
if (n_gpu_layers > n_layer) {
|
|
4798
4865
|
model.buft_output = {
|
|
4799
4866
|
split_buft,
|
|
4800
|
-
llama_default_buffer_type_offload(main_gpu)
|
|
4867
|
+
llama_default_buffer_type_offload(model, main_gpu)
|
|
4801
4868
|
};
|
|
4802
4869
|
} else {
|
|
4803
4870
|
model.buft_output = llama_default_buffer_type_cpu(true);
|
|
@@ -5242,6 +5309,50 @@ static bool llm_load_tensors(
|
|
|
5242
5309
|
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
|
5243
5310
|
}
|
|
5244
5311
|
} break;
|
|
5312
|
+
case LLM_ARCH_JINA_BERT_V2:
|
|
5313
|
+
{
|
|
5314
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
|
|
5315
|
+
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
|
|
5316
|
+
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
|
|
5317
|
+
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
|
|
5318
|
+
|
|
5319
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
5320
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
5321
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
5322
|
+
|
|
5323
|
+
auto & layer = model.layers[i]; // JinaBertLayer
|
|
5324
|
+
|
|
5325
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
|
5326
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
|
5327
|
+
|
|
5328
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
|
|
5329
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
|
|
5330
|
+
|
|
5331
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
|
5332
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
|
5333
|
+
|
|
5334
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
|
|
5335
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
|
|
5336
|
+
|
|
5337
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
|
5338
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
|
5339
|
+
|
|
5340
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
|
|
5341
|
+
layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
|
|
5342
|
+
|
|
5343
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
|
|
5344
|
+
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
|
|
5345
|
+
|
|
5346
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
5347
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
5348
|
+
|
|
5349
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
5350
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
5351
|
+
|
|
5352
|
+
layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
|
5353
|
+
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
|
5354
|
+
}
|
|
5355
|
+
} break;
|
|
5245
5356
|
case LLM_ARCH_BLOOM:
|
|
5246
5357
|
{
|
|
5247
5358
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
@@ -6318,7 +6429,7 @@ static struct ggml_tensor * llm_build_ffn(
|
|
|
6318
6429
|
llm_ffn_gate_type type_gate,
|
|
6319
6430
|
const llm_build_cb & cb,
|
|
6320
6431
|
int il) {
|
|
6321
|
-
struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
|
|
6432
|
+
struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
|
|
6322
6433
|
cb(tmp, "ffn_up", il);
|
|
6323
6434
|
|
|
6324
6435
|
if (up_b) {
|
|
@@ -6500,7 +6611,6 @@ static struct ggml_tensor * llm_build_kqv(
|
|
|
6500
6611
|
struct ggml_tensor * wo_b,
|
|
6501
6612
|
struct ggml_tensor * q_cur,
|
|
6502
6613
|
struct ggml_tensor * kq_mask,
|
|
6503
|
-
struct ggml_tensor * kq_pos,
|
|
6504
6614
|
int32_t n_tokens,
|
|
6505
6615
|
int32_t n_kv,
|
|
6506
6616
|
float kq_scale,
|
|
@@ -6530,10 +6640,6 @@ static struct ggml_tensor * llm_build_kqv(
|
|
|
6530
6640
|
GGML_UNUSED(model);
|
|
6531
6641
|
GGML_UNUSED(n_ctx);
|
|
6532
6642
|
|
|
6533
|
-
// note: if this assert triggers, then some check has failed earlier
|
|
6534
|
-
// the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
|
|
6535
|
-
GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
|
|
6536
|
-
|
|
6537
6643
|
// split cached v into n_head heads (not transposed)
|
|
6538
6644
|
struct ggml_tensor * v =
|
|
6539
6645
|
ggml_view_3d(ctx, kv.v_l[il],
|
|
@@ -6543,7 +6649,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
|
6543
6649
|
0);
|
|
6544
6650
|
cb(v, "v", il);
|
|
6545
6651
|
|
|
6546
|
-
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
|
|
6652
|
+
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
|
|
6547
6653
|
|
|
6548
6654
|
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
|
6549
6655
|
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
|
@@ -6574,28 +6680,8 @@ static struct ggml_tensor * llm_build_kqv(
|
|
|
6574
6680
|
kq = ggml_scale(ctx, kq, 30);
|
|
6575
6681
|
}
|
|
6576
6682
|
|
|
6577
|
-
|
|
6578
|
-
|
|
6579
|
-
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
|
6580
|
-
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
|
6581
|
-
if (hparams.use_alibi) {
|
|
6582
|
-
kq = ggml_scale(ctx, kq, kq_scale);
|
|
6583
|
-
cb(kq, "kq_scaled", il);
|
|
6584
|
-
|
|
6585
|
-
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
|
|
6586
|
-
cb(kq, "kq_scaled_alibi", il);
|
|
6587
|
-
|
|
6588
|
-
kq = ggml_add(ctx, kq, kq_mask);
|
|
6589
|
-
cb(kq, "kq_masked", il);
|
|
6590
|
-
|
|
6591
|
-
kq = ggml_soft_max(ctx, kq);
|
|
6592
|
-
cb(kq, "kq_soft_max", il);
|
|
6593
|
-
} else
|
|
6594
|
-
#endif
|
|
6595
|
-
{
|
|
6596
|
-
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
|
|
6597
|
-
cb(kq, "kq_soft_max_ext", il);
|
|
6598
|
-
}
|
|
6683
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
|
|
6684
|
+
cb(kq, "kq_soft_max_ext", il);
|
|
6599
6685
|
|
|
6600
6686
|
GGML_ASSERT(kv.size == n_ctx);
|
|
6601
6687
|
|
|
@@ -6645,7 +6731,6 @@ static struct ggml_tensor * llm_build_kv(
|
|
|
6645
6731
|
struct ggml_tensor * v_cur,
|
|
6646
6732
|
struct ggml_tensor * q_cur,
|
|
6647
6733
|
struct ggml_tensor * kq_mask,
|
|
6648
|
-
struct ggml_tensor * kq_pos,
|
|
6649
6734
|
int32_t n_tokens,
|
|
6650
6735
|
int32_t kv_head,
|
|
6651
6736
|
int32_t n_kv,
|
|
@@ -6664,7 +6749,7 @@ static struct ggml_tensor * llm_build_kv(
|
|
|
6664
6749
|
struct ggml_tensor * cur;
|
|
6665
6750
|
|
|
6666
6751
|
cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
|
|
6667
|
-
q_cur, kq_mask,
|
|
6752
|
+
q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
|
|
6668
6753
|
cb(cur, "kqv_out", il);
|
|
6669
6754
|
|
|
6670
6755
|
return cur;
|
|
@@ -6771,18 +6856,17 @@ struct llm_build_context {
|
|
|
6771
6856
|
|
|
6772
6857
|
ctx0 = ggml_init(params);
|
|
6773
6858
|
|
|
6774
|
-
lctx.inp_tokens
|
|
6775
|
-
lctx.inp_embd
|
|
6776
|
-
lctx.inp_pos
|
|
6859
|
+
lctx.inp_tokens = nullptr;
|
|
6860
|
+
lctx.inp_embd = nullptr;
|
|
6861
|
+
lctx.inp_pos = nullptr;
|
|
6777
6862
|
lctx.inp_out_ids = nullptr;
|
|
6778
6863
|
lctx.inp_KQ_mask = nullptr;
|
|
6779
|
-
lctx.inp_KQ_pos = nullptr;
|
|
6780
6864
|
lctx.inp_K_shift = nullptr;
|
|
6781
|
-
lctx.inp_mean
|
|
6782
|
-
lctx.inp_cls
|
|
6783
|
-
lctx.inp_s_copy
|
|
6784
|
-
lctx.inp_s_mask
|
|
6785
|
-
lctx.inp_s_seq
|
|
6865
|
+
lctx.inp_mean = nullptr;
|
|
6866
|
+
lctx.inp_cls = nullptr;
|
|
6867
|
+
lctx.inp_s_copy = nullptr;
|
|
6868
|
+
lctx.inp_s_mask = nullptr;
|
|
6869
|
+
lctx.inp_s_seq = nullptr;
|
|
6786
6870
|
}
|
|
6787
6871
|
|
|
6788
6872
|
void free() {
|
|
@@ -6932,19 +7016,6 @@ struct llm_build_context {
|
|
|
6932
7016
|
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
|
|
6933
7017
|
}
|
|
6934
7018
|
|
|
6935
|
-
struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
|
|
6936
|
-
if (causal) {
|
|
6937
|
-
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
|
|
6938
|
-
} else {
|
|
6939
|
-
// TODO: this will be needed for ALiBi-based BERT models
|
|
6940
|
-
// https://github.com/ggerganov/llama.cpp/pull/6826
|
|
6941
|
-
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
|
|
6942
|
-
}
|
|
6943
|
-
cb(lctx.inp_KQ_pos, "KQ_pos", -1);
|
|
6944
|
-
ggml_set_input(lctx.inp_KQ_pos);
|
|
6945
|
-
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
|
|
6946
|
-
}
|
|
6947
|
-
|
|
6948
7019
|
struct ggml_tensor * build_inp_mean() {
|
|
6949
7020
|
lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
|
|
6950
7021
|
cb(lctx.inp_mean, "inp_mean", -1);
|
|
@@ -7050,7 +7121,7 @@ struct llm_build_context {
|
|
|
7050
7121
|
|
|
7051
7122
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7052
7123
|
model.layers[il].wo, model.layers[il].bo,
|
|
7053
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
7124
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7054
7125
|
}
|
|
7055
7126
|
|
|
7056
7127
|
if (il == n_layer - 1) {
|
|
@@ -7143,9 +7214,6 @@ struct llm_build_context {
|
|
|
7143
7214
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
7144
7215
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
7145
7216
|
|
|
7146
|
-
// positions of the tokens in the KV cache
|
|
7147
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
|
7148
|
-
|
|
7149
7217
|
for (int il = 0; il < n_layer; ++il) {
|
|
7150
7218
|
struct ggml_tensor * inpSA = inpL;
|
|
7151
7219
|
|
|
@@ -7190,7 +7258,7 @@ struct llm_build_context {
|
|
|
7190
7258
|
|
|
7191
7259
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7192
7260
|
model.layers[il].wo, NULL,
|
|
7193
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
7261
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7194
7262
|
}
|
|
7195
7263
|
|
|
7196
7264
|
if (il == n_layer - 1) {
|
|
@@ -7260,9 +7328,6 @@ struct llm_build_context {
|
|
|
7260
7328
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
7261
7329
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
7262
7330
|
|
|
7263
|
-
// positions of the tokens in the KV cache
|
|
7264
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
|
7265
|
-
|
|
7266
7331
|
for (int il = 0; il < n_layer; ++il) {
|
|
7267
7332
|
struct ggml_tensor * inpSA = inpL;
|
|
7268
7333
|
|
|
@@ -7297,7 +7362,7 @@ struct llm_build_context {
|
|
|
7297
7362
|
cb(Kcur, "Kcur", il);
|
|
7298
7363
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7299
7364
|
model.layers[il].wo, NULL,
|
|
7300
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
7365
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7301
7366
|
}
|
|
7302
7367
|
|
|
7303
7368
|
if (il == n_layer - 1) {
|
|
@@ -7417,7 +7482,7 @@ struct llm_build_context {
|
|
|
7417
7482
|
|
|
7418
7483
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7419
7484
|
model.layers[il].wo, NULL,
|
|
7420
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
7485
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7421
7486
|
}
|
|
7422
7487
|
|
|
7423
7488
|
if (il == n_layer - 1) {
|
|
@@ -7542,7 +7607,7 @@ struct llm_build_context {
|
|
|
7542
7607
|
|
|
7543
7608
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7544
7609
|
model.layers[il].wo, model.layers[il].bo,
|
|
7545
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
7610
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
7546
7611
|
}
|
|
7547
7612
|
|
|
7548
7613
|
if (il == n_layer - 1) {
|
|
@@ -7694,7 +7759,7 @@ struct llm_build_context {
|
|
|
7694
7759
|
|
|
7695
7760
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7696
7761
|
model.layers[il].wo, NULL,
|
|
7697
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
7762
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7698
7763
|
}
|
|
7699
7764
|
|
|
7700
7765
|
if (il == n_layer - 1) {
|
|
@@ -7806,7 +7871,7 @@ struct llm_build_context {
|
|
|
7806
7871
|
|
|
7807
7872
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7808
7873
|
model.layers[il].wo, model.layers[il].bo,
|
|
7809
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
7874
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7810
7875
|
}
|
|
7811
7876
|
|
|
7812
7877
|
if (il == n_layer - 1) {
|
|
@@ -8010,7 +8075,7 @@ struct llm_build_context {
|
|
|
8010
8075
|
|
|
8011
8076
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8012
8077
|
model.layers[il].wo, model.layers[il].bo,
|
|
8013
|
-
Kcur, Vcur, Q, KQ_mask,
|
|
8078
|
+
Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8014
8079
|
}
|
|
8015
8080
|
|
|
8016
8081
|
if (il == n_layer - 1) {
|
|
@@ -8076,9 +8141,6 @@ struct llm_build_context {
|
|
|
8076
8141
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
8077
8142
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
8078
8143
|
|
|
8079
|
-
// positions of the tokens in the KV cache
|
|
8080
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
|
8081
|
-
|
|
8082
8144
|
for (int il = 0; il < n_layer; ++il) {
|
|
8083
8145
|
struct ggml_tensor * inpSA = inpL;
|
|
8084
8146
|
|
|
@@ -8106,7 +8168,7 @@ struct llm_build_context {
|
|
|
8106
8168
|
|
|
8107
8169
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8108
8170
|
model.layers[il].wo, NULL,
|
|
8109
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8171
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8110
8172
|
}
|
|
8111
8173
|
|
|
8112
8174
|
if (il == n_layer - 1) {
|
|
@@ -8168,8 +8230,11 @@ struct llm_build_context {
|
|
|
8168
8230
|
|
|
8169
8231
|
struct ggml_tensor * cur;
|
|
8170
8232
|
struct ggml_tensor * inpL;
|
|
8233
|
+
struct ggml_tensor * inp_pos = nullptr;
|
|
8171
8234
|
|
|
8172
|
-
|
|
8235
|
+
if (model.arch != LLM_ARCH_JINA_BERT_V2) {
|
|
8236
|
+
inp_pos = build_inp_pos();
|
|
8237
|
+
}
|
|
8173
8238
|
struct ggml_tensor * inp_mean = build_inp_mean();
|
|
8174
8239
|
struct ggml_tensor * inp_cls = build_inp_cls();
|
|
8175
8240
|
|
|
@@ -8200,13 +8265,26 @@ struct llm_build_context {
|
|
|
8200
8265
|
struct ggml_tensor * Vcur;
|
|
8201
8266
|
|
|
8202
8267
|
// self-attention
|
|
8203
|
-
if (model.arch == LLM_ARCH_BERT) {
|
|
8268
|
+
if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
|
|
8204
8269
|
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
|
8205
8270
|
cb(Qcur, "Qcur", il);
|
|
8206
8271
|
|
|
8272
|
+
if (model.layers[il].attn_q_norm) {
|
|
8273
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
|
8274
|
+
model.layers[il].attn_q_norm,
|
|
8275
|
+
model.layers[il].attn_q_norm_b,
|
|
8276
|
+
LLM_NORM, cb, il);
|
|
8277
|
+
}
|
|
8278
|
+
|
|
8207
8279
|
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
|
|
8208
8280
|
cb(Kcur, "Kcur", il);
|
|
8209
8281
|
|
|
8282
|
+
if (model.layers[il].attn_k_norm) {
|
|
8283
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
|
8284
|
+
model.layers[il].attn_k_norm,
|
|
8285
|
+
model.layers[il].attn_k_norm_b,
|
|
8286
|
+
LLM_NORM, cb, il);
|
|
8287
|
+
}
|
|
8210
8288
|
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
|
|
8211
8289
|
cb(Vcur, "Vcur", il);
|
|
8212
8290
|
|
|
@@ -8246,7 +8324,7 @@ struct llm_build_context {
|
|
|
8246
8324
|
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
|
8247
8325
|
cb(kq, "kq", il);
|
|
8248
8326
|
|
|
8249
|
-
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask,
|
|
8327
|
+
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
|
8250
8328
|
cb(kq, "kq_soft_max_ext", il);
|
|
8251
8329
|
|
|
8252
8330
|
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
|
|
@@ -8297,6 +8375,13 @@ struct llm_build_context {
|
|
|
8297
8375
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
8298
8376
|
NULL,
|
|
8299
8377
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
8378
|
+
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
|
|
8379
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
8380
|
+
model.layers[il].ffn_up, NULL,
|
|
8381
|
+
model.layers[il].ffn_gate, NULL,
|
|
8382
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
8383
|
+
NULL,
|
|
8384
|
+
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
|
8300
8385
|
} else {
|
|
8301
8386
|
cur = llm_build_ffn(ctx0, cur,
|
|
8302
8387
|
model.layers[il].ffn_up, NULL,
|
|
@@ -8363,9 +8448,6 @@ struct llm_build_context {
|
|
|
8363
8448
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
8364
8449
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
8365
8450
|
|
|
8366
|
-
// positions of the tokens in the KV cache
|
|
8367
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
|
8368
|
-
|
|
8369
8451
|
inpL = llm_build_norm(ctx0, inpL, hparams,
|
|
8370
8452
|
model.tok_norm,
|
|
8371
8453
|
model.tok_norm_b,
|
|
@@ -8399,7 +8481,7 @@ struct llm_build_context {
|
|
|
8399
8481
|
|
|
8400
8482
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8401
8483
|
model.layers[il].wo, model.layers[il].bo,
|
|
8402
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8484
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8403
8485
|
}
|
|
8404
8486
|
|
|
8405
8487
|
if (il == n_layer - 1) {
|
|
@@ -8464,9 +8546,6 @@ struct llm_build_context {
|
|
|
8464
8546
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
8465
8547
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
8466
8548
|
|
|
8467
|
-
// positions of the tokens in the KV cache
|
|
8468
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
|
8469
|
-
|
|
8470
8549
|
if (model.pos_embd) {
|
|
8471
8550
|
// inp_pos - contains the positions
|
|
8472
8551
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
|
@@ -8530,13 +8609,13 @@ struct llm_build_context {
|
|
|
8530
8609
|
|
|
8531
8610
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8532
8611
|
model.layers[il].wo, model.layers[il].bo,
|
|
8533
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8612
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8534
8613
|
} else {
|
|
8535
8614
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8536
8615
|
|
|
8537
8616
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8538
8617
|
model.layers[il].wo, model.layers[il].bo,
|
|
8539
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8618
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8540
8619
|
}
|
|
8541
8620
|
}
|
|
8542
8621
|
|
|
@@ -8680,7 +8759,7 @@ struct llm_build_context {
|
|
|
8680
8759
|
|
|
8681
8760
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8682
8761
|
model.layers[il].wo, NULL,
|
|
8683
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8762
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8684
8763
|
}
|
|
8685
8764
|
|
|
8686
8765
|
if (il == n_layer - 1) {
|
|
@@ -8798,7 +8877,7 @@ struct llm_build_context {
|
|
|
8798
8877
|
|
|
8799
8878
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8800
8879
|
model.layers[il].wo, NULL,
|
|
8801
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8880
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8802
8881
|
}
|
|
8803
8882
|
|
|
8804
8883
|
if (il == n_layer - 1) {
|
|
@@ -8911,7 +8990,7 @@ struct llm_build_context {
|
|
|
8911
8990
|
|
|
8912
8991
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8913
8992
|
model.layers[il].wo, model.layers[il].bo,
|
|
8914
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8993
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8915
8994
|
}
|
|
8916
8995
|
|
|
8917
8996
|
if (il == n_layer - 1) {
|
|
@@ -9025,7 +9104,7 @@ struct llm_build_context {
|
|
|
9025
9104
|
|
|
9026
9105
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9027
9106
|
model.layers[il].wo, model.layers[il].bo,
|
|
9028
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9107
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9029
9108
|
}
|
|
9030
9109
|
|
|
9031
9110
|
if (il == n_layer - 1) {
|
|
@@ -9180,7 +9259,7 @@ struct llm_build_context {
|
|
|
9180
9259
|
|
|
9181
9260
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9182
9261
|
model.layers[il].wo, model.layers[il].bo,
|
|
9183
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9262
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
9184
9263
|
}
|
|
9185
9264
|
|
|
9186
9265
|
if (il == n_layer - 1) {
|
|
@@ -9297,7 +9376,7 @@ struct llm_build_context {
|
|
|
9297
9376
|
|
|
9298
9377
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9299
9378
|
model.layers[il].wo, model.layers[il].bo,
|
|
9300
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9379
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
9301
9380
|
}
|
|
9302
9381
|
|
|
9303
9382
|
if (il == n_layer - 1) {
|
|
@@ -9410,7 +9489,7 @@ struct llm_build_context {
|
|
|
9410
9489
|
|
|
9411
9490
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9412
9491
|
model.layers[il].wo, NULL,
|
|
9413
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9492
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9414
9493
|
}
|
|
9415
9494
|
struct ggml_tensor * sa_out = cur;
|
|
9416
9495
|
|
|
@@ -9513,7 +9592,7 @@ struct llm_build_context {
|
|
|
9513
9592
|
|
|
9514
9593
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9515
9594
|
model.layers[il].wo, model.layers[il].bo,
|
|
9516
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9595
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9517
9596
|
}
|
|
9518
9597
|
|
|
9519
9598
|
if (il == n_layer - 1) {
|
|
@@ -9620,7 +9699,7 @@ struct llm_build_context {
|
|
|
9620
9699
|
|
|
9621
9700
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9622
9701
|
model.layers[il].wo, model.layers[il].bo,
|
|
9623
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9702
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9624
9703
|
}
|
|
9625
9704
|
|
|
9626
9705
|
if (il == n_layer - 1) {
|
|
@@ -9736,7 +9815,7 @@ struct llm_build_context {
|
|
|
9736
9815
|
|
|
9737
9816
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9738
9817
|
model.layers[il].wo, NULL,
|
|
9739
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9818
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9740
9819
|
}
|
|
9741
9820
|
|
|
9742
9821
|
if (il == n_layer - 1) {
|
|
@@ -9853,7 +9932,7 @@ struct llm_build_context {
|
|
|
9853
9932
|
|
|
9854
9933
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9855
9934
|
model.layers[il].wo, model.layers[il].bo,
|
|
9856
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9935
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9857
9936
|
}
|
|
9858
9937
|
|
|
9859
9938
|
if (il == n_layer - 1) {
|
|
@@ -9983,7 +10062,7 @@ struct llm_build_context {
|
|
|
9983
10062
|
|
|
9984
10063
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9985
10064
|
model.layers[il].wo, model.layers[il].bo,
|
|
9986
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10065
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9987
10066
|
}
|
|
9988
10067
|
|
|
9989
10068
|
if (il == n_layer - 1) {
|
|
@@ -10104,7 +10183,7 @@ struct llm_build_context {
|
|
|
10104
10183
|
|
|
10105
10184
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10106
10185
|
model.layers[il].wo, NULL,
|
|
10107
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10186
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
10108
10187
|
}
|
|
10109
10188
|
|
|
10110
10189
|
if (il == n_layer - 1) {
|
|
@@ -10223,7 +10302,7 @@ struct llm_build_context {
|
|
|
10223
10302
|
|
|
10224
10303
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10225
10304
|
model.layers[il].wo, model.layers[il].bo,
|
|
10226
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10305
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
10227
10306
|
}
|
|
10228
10307
|
|
|
10229
10308
|
if (il == n_layer - 1) {
|
|
@@ -10513,7 +10592,7 @@ struct llm_build_context {
|
|
|
10513
10592
|
|
|
10514
10593
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10515
10594
|
model.layers[il].wo, model.layers[il].bo,
|
|
10516
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10595
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
10517
10596
|
}
|
|
10518
10597
|
|
|
10519
10598
|
if (il == n_layer - 1) {
|
|
@@ -10644,7 +10723,7 @@ struct llm_build_context {
|
|
|
10644
10723
|
|
|
10645
10724
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10646
10725
|
model.layers[il].wo, nullptr,
|
|
10647
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10726
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
10648
10727
|
}
|
|
10649
10728
|
|
|
10650
10729
|
if (il == n_layer - 1) {
|
|
@@ -10825,6 +10904,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
10825
10904
|
result = llm.build_refact();
|
|
10826
10905
|
} break;
|
|
10827
10906
|
case LLM_ARCH_BERT:
|
|
10907
|
+
case LLM_ARCH_JINA_BERT_V2:
|
|
10828
10908
|
case LLM_ARCH_NOMIC_BERT:
|
|
10829
10909
|
{
|
|
10830
10910
|
result = llm.build_bert();
|
|
@@ -11032,11 +11112,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
11032
11112
|
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
|
11033
11113
|
f = -INFINITY;
|
|
11034
11114
|
} else {
|
|
11035
|
-
|
|
11115
|
+
if (hparams.use_alibi) {
|
|
11116
|
+
f = -fabs(lctx.kv_self.cells[i].pos - pos);
|
|
11117
|
+
} else {
|
|
11118
|
+
f = 0.0f;
|
|
11119
|
+
}
|
|
11036
11120
|
}
|
|
11037
11121
|
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
|
11038
11122
|
}
|
|
11039
11123
|
}
|
|
11124
|
+
|
|
11125
|
+
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
|
11126
|
+
for (int j = 0; j < n_kv; ++j) {
|
|
11127
|
+
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
|
11128
|
+
}
|
|
11129
|
+
}
|
|
11040
11130
|
}
|
|
11041
11131
|
} else {
|
|
11042
11132
|
// when using kv cache, the mask needs to match the kv cache size
|
|
@@ -11055,7 +11145,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
11055
11145
|
float f = -INFINITY;
|
|
11056
11146
|
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
|
11057
11147
|
if (batch.seq_id[i][s] == seq_id) {
|
|
11058
|
-
|
|
11148
|
+
if (hparams.use_alibi) {
|
|
11149
|
+
f = -fabs(batch.pos[i] - batch.pos[j]);
|
|
11150
|
+
} else {
|
|
11151
|
+
f = 0.0f;
|
|
11152
|
+
}
|
|
11059
11153
|
break;
|
|
11060
11154
|
}
|
|
11061
11155
|
}
|
|
@@ -11071,21 +11165,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
11071
11165
|
}
|
|
11072
11166
|
}
|
|
11073
11167
|
|
|
11074
|
-
// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
|
|
11075
|
-
// this allows to process multiple sequences in parallel with ALiBi-based models
|
|
11076
|
-
if (hparams.use_alibi) {
|
|
11077
|
-
const int64_t n_kv = kv_self.n;
|
|
11078
|
-
|
|
11079
|
-
GGML_ASSERT(lctx.inp_KQ_pos);
|
|
11080
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
|
|
11081
|
-
|
|
11082
|
-
float * data = (float *) lctx.inp_KQ_pos->data;
|
|
11083
|
-
|
|
11084
|
-
for (int i = 0; i < n_kv; ++i) {
|
|
11085
|
-
data[i] = float(lctx.kv_self.cells[i].pos);
|
|
11086
|
-
}
|
|
11087
|
-
}
|
|
11088
|
-
|
|
11089
11168
|
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
|
11090
11169
|
const int64_t n_tokens = batch.n_tokens;
|
|
11091
11170
|
|
|
@@ -11455,7 +11534,8 @@ static int llama_decode_internal(
|
|
|
11455
11534
|
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
|
11456
11535
|
// after enough generations, the benefit from this heuristic disappears
|
|
11457
11536
|
// if we start defragmenting the cache, the benefit from this will be more important
|
|
11458
|
-
|
|
11537
|
+
const uint32_t pad = llama_kv_cache_get_padding(cparams);
|
|
11538
|
+
kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
|
|
11459
11539
|
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
|
11460
11540
|
}
|
|
11461
11541
|
}
|
|
@@ -12200,13 +12280,14 @@ struct llm_tokenizer_bpe {
|
|
|
12200
12280
|
|
|
12201
12281
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
|
12202
12282
|
int final_prev_index = -1;
|
|
12283
|
+
bool ignore_merges = false;
|
|
12203
12284
|
|
|
12204
12285
|
std::vector<std::string> word_collection;
|
|
12205
12286
|
switch (vocab.type) {
|
|
12206
12287
|
case LLAMA_VOCAB_TYPE_BPE:
|
|
12207
12288
|
switch (vocab.type_pre) {
|
|
12208
12289
|
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
|
12209
|
-
|
|
12290
|
+
ignore_merges = true;
|
|
12210
12291
|
word_collection = unicode_regex_split(text, {
|
|
12211
12292
|
// original regex from tokenizer.json
|
|
12212
12293
|
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
@@ -12215,6 +12296,12 @@ struct llm_tokenizer_bpe {
|
|
|
12215
12296
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
12216
12297
|
});
|
|
12217
12298
|
break;
|
|
12299
|
+
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
|
12300
|
+
word_collection = unicode_regex_split(text, {
|
|
12301
|
+
// same as llama3
|
|
12302
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
12303
|
+
});
|
|
12304
|
+
break;
|
|
12218
12305
|
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
|
12219
12306
|
word_collection = unicode_regex_split(text, {
|
|
12220
12307
|
"[\r\n]",
|
|
@@ -12298,6 +12385,11 @@ struct llm_tokenizer_bpe {
|
|
|
12298
12385
|
int index = 0;
|
|
12299
12386
|
size_t offset = 0;
|
|
12300
12387
|
|
|
12388
|
+
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
|
12389
|
+
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
|
12390
|
+
offset = word.size();
|
|
12391
|
+
}
|
|
12392
|
+
|
|
12301
12393
|
while (offset < word.size()) {
|
|
12302
12394
|
llm_symbol sym;
|
|
12303
12395
|
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
|
@@ -12752,7 +12844,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
12752
12844
|
}
|
|
12753
12845
|
}
|
|
12754
12846
|
|
|
12755
|
-
|
|
12847
|
+
if (add_special && vocab.special_add_eos == 1) {
|
|
12848
|
+
GGML_ASSERT(vocab.special_add_eos != -1);
|
|
12849
|
+
output.push_back(vocab.special_eos_id);
|
|
12850
|
+
}
|
|
12756
12851
|
} break;
|
|
12757
12852
|
case LLAMA_VOCAB_TYPE_WPM:
|
|
12758
12853
|
{
|
|
@@ -13106,6 +13201,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
     return rejects;
 }

+static bool llama_grammar_detect_left_recursion(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        size_t rule_index,
+        std::vector<bool> * rules_visited,
+        std::vector<bool> * rules_in_progress,
+        std::vector<bool> * rules_may_be_empty) {
+    if ((*rules_in_progress)[rule_index]) {
+        return true;
+    }
+
+    (*rules_in_progress)[rule_index] = true;
+
+    const std::vector<llama_grammar_element> & rule = rules[rule_index];
+
+    // First check if the rule might produce the empty string. This could be done combined with the second
+    // step but it's more readable as two steps.
+    bool at_rule_start = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            if (at_rule_start) {
+                (*rules_may_be_empty)[rule_index] = true;
+                break;
+            }
+            at_rule_start = true;
+        } else {
+            at_rule_start = false;
+        }
+    }
+
+    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+    // be empty)
+    bool recurse_into_nonterminal = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+                return true;
+            }
+            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+                recurse_into_nonterminal = false;
+            }
+        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            recurse_into_nonterminal = true;
+        } else {
+            recurse_into_nonterminal = false;
+        }
+    }
+
+    (*rules_in_progress)[rule_index] = false;
+    (*rules_visited)[rule_index] = true;
+    return false;
+}
+
 //
 // grammar - external
 //
@@ -13125,6 +13272,19 @@ struct llama_grammar * llama_grammar_init(
         vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
     }

+    // Check for left recursion
+    std::vector<bool> rules_visited(n_rules);
+    std::vector<bool> rules_in_progress(n_rules);
+    std::vector<bool> rules_may_be_empty(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        if (rules_visited[i]) {
+            continue;
+        }
+        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+            throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+        }
+    }
+
     // loop over alternates of start rule to build initial stacks
     std::vector<std::vector<const llama_grammar_element *>> stacks;
     pos = vec_rules[start_rule_index].data();
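Since `llama_grammar_init` now throws on left-recursive grammars instead of overflowing the stack later during sampling, callers that accept user-supplied GBNF should be prepared to catch the error. A hedged sketch using the helpers from `common/grammar-parser.h` in this release; the `root ::= root "a" | "a"` grammar is a deliberately left-recursive example, while a right-recursive rewrite such as `root ::= "a" root | "a"` is still accepted:

    #include "grammar-parser.h"
    #include "llama.h"
    #include <cstdio>
    #include <stdexcept>
    #include <vector>

    int main() {
        // Left-recursive: the first alternative of root starts with root itself.
        const char * gbnf = "root ::= root \"a\" | \"a\"\n";

        grammar_parser::parse_state parsed = grammar_parser::parse(gbnf);
        std::vector<const llama_grammar_element *> rules = parsed.c_rules();

        try {
            llama_grammar * grammar = llama_grammar_init(
                rules.data(), rules.size(), parsed.symbol_ids.at("root"));
            llama_grammar_free(grammar);
            printf("grammar accepted\n");
        } catch (const std::runtime_error & err) {
            // "unsupported grammar, left recursion detected for nonterminal at index ..."
            printf("rejected: %s\n", err.what());
        }
        return 0;
    }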
@@ -13147,6 +13307,9 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);

+    // Important: vec_rules has to be moved here, not copied, because stacks contains
+    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+    // then the pointers would be invalidated when the local vec_rules goes out of scope.
     return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }

@@ -15246,6 +15409,7 @@ struct llama_model_params llama_model_default_params() {
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
+        /*.rpc_servers =*/ nullptr,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
@@ -15316,7 +15480,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }

 size_t llama_max_devices(void) {
-#if defined(
+#if defined(GGML_USE_RPC)
+    return GGML_RPC_MAX_SERVERS;
+#elif defined(GGML_USE_METAL)
     return 1;
 #elif defined(GGML_USE_CUDA)
     return GGML_CUDA_MAX_DEVICES;
@@ -15339,7 +15505,7 @@ bool llama_supports_mlock(void) {

 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -15402,7 +15568,17 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-
+    if (params.rpc_servers != nullptr) {
+        // split the servers set them into model->rpc_servers
+        std::string servers(params.rpc_servers);
+        size_t pos = 0;
+        while ((pos = servers.find(",")) != std::string::npos) {
+            std::string server = servers.substr(0, pos);
+            model->rpc_servers.push_back(server);
+            servers.erase(0, pos + 1);
+        }
+        model->rpc_servers.push_back(servers);
+    }
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
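`rpc_servers` is a plain comma-separated list: everything before each comma becomes one endpoint and whatever remains after the last comma is pushed as the final entry, so a single endpoint with no comma also works. A hedged usage sketch; the endpoint strings and layer count are placeholders, and the build must have `GGML_USE_RPC` enabled for the corresponding backends to be created later:

    #include "llama.h"

    // Load a model that offloads work to two hypothetical RPC servers.
    llama_model * load_with_rpc(const char * model_path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052";  // host:port, comma-separated
        mparams.n_gpu_layers = 99;                                      // offload as many layers as possible
        return llama_load_model_from_file(model_path, mparams);
    }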
@@ -15441,6 +15617,11 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }

+    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     llama_context * ctx = new llama_context(*model);

     const auto & hparams = model->hparams;
@@ -15464,7 +15645,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx = GGML_PAD(cparams.n_ctx,
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));

     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
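The context size is now rounded up with a helper rather than a fixed constant, because flash attention needs a coarser KV-cache granularity. A small worked example, assuming (as in this llama.cpp revision) a padding of 256 with flash attention and 32 without, and using the same rounding that ggml.h's `GGML_PAD` performs:

    #include <cstdint>
    #include <cstdio>

    // Round x up to a multiple of n (n must be a power of two), mirroring GGML_PAD.
    #define PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main() {
        uint32_t n_ctx = 4097;
        // Assumed padding values for this revision: 32 without flash attention, 256 with it.
        printf("no flash_attn: %u\n", PAD(n_ctx, 32u));    // prints 4128
        printf("   flash_attn: %u\n", PAD(n_ctx, 256u));   // prints 4352
        return 0;
    }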
@@ -15509,16 +15690,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }

-    if (cparams.flash_attn && hparams.use_alibi) {
-        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
-    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
@@ -15554,7 +15725,17 @@ struct llama_context * llama_new_context_with_model(

     if (!hparams.vocab_only) {
         // initialize backends
-#
+#if defined(GGML_USE_RPC)
+        for (auto & server : model->rpc_servers) {
+            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
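Each configured endpoint becomes its own backend instance, and a failed connection to any one server aborts context creation. A hedged, stand-alone sketch of talking to a single endpoint through the `ggml-rpc.h` header added in this release; the endpoint string is a placeholder, and `ggml_backend_rpc_get_device_memory` is assumed from that header rather than the stable public API:

    #include "ggml-backend.h"
    #include "ggml-rpc.h"
    #include <cstdio>

    int main() {
        const char * endpoint = "127.0.0.1:50052";   // placeholder host:port of a running rpc-server

        ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
        if (backend == nullptr) {
            fprintf(stderr, "failed to connect to %s\n", endpoint);
            return 1;
        }

        size_t free_mem = 0, total_mem = 0;
        ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
        printf("%s reports %zu free of %zu total bytes\n",
               ggml_backend_name(backend), free_mem, total_mem);

        ggml_backend_free(backend);
        return 0;
    }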
@@ -15710,7 +15891,11 @@ struct llama_context * llama_new_context_with_model(
     ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));

     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-    bool pipeline_parallel =
+    bool pipeline_parallel =
+        llama_get_device_count(*model) > 1 &&
+        model->n_gpu_layers > (int)model->hparams.n_layer &&
+        model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+        params.offload_kqv;
 #ifndef GGML_USE_CUDA
     // pipeline parallelism requires support for async compute and events
     // currently this is only implemented in the CUDA backend
@@ -15808,6 +15993,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
             return LLAMA_ROPE_TYPE_NONE;

         // use what we call a normal RoPE, operating on pairs of consecutive head values
|