llama_cpp 0.15.1 → 0.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +3 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +15 -7
- data/vendor/tmp/llama.cpp/ggml-impl.h +7 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +114 -125
- data/vendor/tmp/llama.cpp/ggml-metal.metal +86 -109
- data/vendor/tmp/llama.cpp/ggml-quants.c +2202 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +24 -143
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +4 -2
- data/vendor/tmp/llama.cpp/ggml.c +726 -646
- data/vendor/tmp/llama.cpp/ggml.h +28 -17
- data/vendor/tmp/llama.cpp/llama.cpp +478 -281
- data/vendor/tmp/llama.cpp/llama.h +3 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +4 -2
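
For orientation, the headline additions in this release are the new ggml RPC backend (ggml-rpc.cpp / ggml-rpc.h, guarded by GGML_USE_RPC) and support for the JINA_BERT_V2 architecture; the reconstructed hunks below are from the vendored llama.cpp. The following minimal C sketch shows how the new rpc_servers model parameter might be used through llama.h; the comma-separated host:port format and the endpoint/model-path values are assumptions for illustration, not values taken from this diff.

    /* Sketch only: "rpc_servers" is the llama_model_params field added in this
     * release; the endpoint list format (comma-separated host:port) is an
     * assumption based on the GGML_USE_RPC code paths shown below. */
    #include "llama.h"

    int main(void) {
        llama_backend_init();

        struct llama_model_params mparams = llama_model_default_params();
        mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052"; /* hypothetical endpoints */
        mparams.n_gpu_layers = 99; /* offload layers to the RPC-backed devices */

        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }
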
data/vendor/tmp/llama.cpp/llama.cpp

@@ -7,6 +7,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#ifdef GGML_USE_RPC
+#  include "ggml-rpc.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)

@@ -205,6 +209,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -228,39 +233,40 @@ enum llm_arch {
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,           "llama"
-    { LLM_ARCH_FALCON,          "falcon"
-    { LLM_ARCH_GROK,            "grok"
-    { LLM_ARCH_GPT2,            "gpt2"
-    { LLM_ARCH_GPTJ,            "gptj"
-    { LLM_ARCH_GPTNEOX,         "gptneox"
-    { LLM_ARCH_MPT,             "mpt"
-    { LLM_ARCH_BAICHUAN,        "baichuan"
-    { LLM_ARCH_STARCODER,       "starcoder"
-    { LLM_ARCH_PERSIMMON,       "persimmon"
-    { LLM_ARCH_REFACT,          "refact"
-    { LLM_ARCH_BERT,            "bert"
-    { LLM_ARCH_NOMIC_BERT,      "nomic-bert"
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_ARCH_LLAMA,           "llama"        },
+    { LLM_ARCH_FALCON,          "falcon"       },
+    { LLM_ARCH_GROK,            "grok"         },
+    { LLM_ARCH_GPT2,            "gpt2"         },
+    { LLM_ARCH_GPTJ,            "gptj"         },
+    { LLM_ARCH_GPTNEOX,         "gptneox"      },
+    { LLM_ARCH_MPT,             "mpt"          },
+    { LLM_ARCH_BAICHUAN,        "baichuan"     },
+    { LLM_ARCH_STARCODER,       "starcoder"    },
+    { LLM_ARCH_PERSIMMON,       "persimmon"    },
+    { LLM_ARCH_REFACT,          "refact"       },
+    { LLM_ARCH_BERT,            "bert"         },
+    { LLM_ARCH_NOMIC_BERT,      "nomic-bert"   },
+    { LLM_ARCH_JINA_BERT_V2,    "jina-bert-v2" },
+    { LLM_ARCH_BLOOM,           "bloom"        },
+    { LLM_ARCH_STABLELM,        "stablelm"     },
+    { LLM_ARCH_QWEN,            "qwen"         },
+    { LLM_ARCH_QWEN2,           "qwen2"        },
+    { LLM_ARCH_QWEN2MOE,        "qwen2moe"     },
+    { LLM_ARCH_PHI2,            "phi2"         },
+    { LLM_ARCH_PHI3,            "phi3"         },
+    { LLM_ARCH_PLAMO,           "plamo"        },
+    { LLM_ARCH_CODESHELL,       "codeshell"    },
+    { LLM_ARCH_ORION,           "orion"        },
+    { LLM_ARCH_INTERNLM2,       "internlm2"    },
+    { LLM_ARCH_MINICPM,         "minicpm"      },
+    { LLM_ARCH_GEMMA,           "gemma"        },
+    { LLM_ARCH_STARCODER2,      "starcoder2"   },
+    { LLM_ARCH_MAMBA,           "mamba"        },
+    { LLM_ARCH_XVERSE,          "xverse"       },
+    { LLM_ARCH_COMMAND_R,       "command-r"    },
+    { LLM_ARCH_DBRX,            "dbrx"         },
+    { LLM_ARCH_OLMO,            "olmo"         },
+    { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
 enum llm_kv {
@@ -691,6 +697,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT_V2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -1664,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
     GGML_UNUSED(host_buffer);
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
-    }
-    return buft;
-
-    GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUDA
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-    }
-#endif
-
-#ifdef GGML_USE_SYCL
-    if (ggml_backend_sycl_get_device_count() > 1) {
-        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_offload(fallback_gpu);
-    }
-    return buft;
-
-    GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUDA)
-    size_t total;
-    size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
-#else
-    return 1;
-    GGML_UNUSED(device);
-#endif
-}
-
 //
 // globals
 //
@@ -1845,7 +1785,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool use_alibi   = false;
+    bool use_alibi   = false;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;

@@ -2189,6 +2129,8 @@ struct llama_model {
     int main_gpu;
     int n_gpu_layers;
 
+    std::vector<std::string> rpc_servers;
+
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 

@@ -2317,7 +2259,6 @@ struct llama_context {
     struct ggml_tensor * inp_pos;     // I32 [n_batch]
     struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
     struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_KQ_pos;  // F32 [n_kv]
     struct ggml_tensor * inp_K_shift; // I32 [kv_size]
     struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
    struct ggml_tensor * inp_cls;     // I32 [n_batch]
@@ -2333,6 +2274,104 @@ struct llama_context {
 #endif
 };
 
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_RPC
+    std::string endpoint = model.rpc_servers[gpu];
+    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
+    buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUDA)
+    buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+    buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_cpu(true);
+    }
+    return buft;
+    GGML_UNUSED(model);
+    GGML_UNUSED(gpu);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUDA
+    if (ggml_backend_cuda_get_device_count() > 1) {
+        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+    }
+#endif
+
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_offload(model, fallback_gpu);
+    }
+    return buft;
+
+    GGML_UNUSED(tensor_split);
+}
+
+static size_t llama_get_device_count(const llama_model & model) {
+#if defined(GGML_USE_RPC)
+    return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+    size_t total;
+    size_t free;
+    std::string endpoint = model.rpc_servers[device];
+    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+    return free;
+#elif defined(GGML_USE_CUDA)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &free, &total);
+    return free;
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+    GGML_UNUSED(device);
+}
+
 //
 // kv cache helpers
 //
@@ -2785,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
     cache.do_defrag = true;
 }
 
+static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+    // the FA kernels require padding to avoid extra runtime boundary checks
+    return cparams.flash_attn ? 256u : 32u;
+}
+
 //
 // model loading and saving
 //
@@ -3779,6 +3823,12 @@ static void llm_load_hparams(
 
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+    // everything past this point is not vocab-related
+    if (hparams.vocab_only) {
+        return;
+    }
+
     ml.get_key(LLM_KV_CONTEXT_LENGTH,    hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH,  hparams.n_embd);
     ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);

@@ -3860,7 +3910,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.
+                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;

@@ -3962,6 +4012,19 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                hparams.f_max_alibi_bias = 8.0f;
+
+                switch (hparams.n_layer) {
+                    case 4:  model.type = e_model::MODEL_33M;  break; // jina-embeddings-small
+                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4383,7 +4446,11 @@ static void llm_load_vocab(
                 tokenizer_pre == "starcoder") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
         } else if (
-                tokenizer_pre == "gpt-2"
+                tokenizer_pre == "gpt-2"   ||
+                tokenizer_pre == "jina-es" ||
+                tokenizer_pre == "jina-de" ||
+                tokenizer_pre == "jina-v2-es" ||
+                tokenizer_pre == "jina-v2-de") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {

@@ -4743,13 +4810,13 @@ static bool llm_load_tensors(
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
-        int device_count = llama_get_device_count();
+        int device_count = llama_get_device_count(model);
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
         std::vector<float> splits(device_count);
         if (all_zero) {
            // default split, by free memory
            for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(i);
+                splits[i] = llama_get_device_memory(model, i);
            }
        } else {
            std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4769,35 +4836,35 @@ static bool llm_load_tensors(
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
         } else {
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(main_gpu);
+            split_buft = llama_default_buffer_type_offload(model, main_gpu);
         }
         // assign the repeating layers
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             model.buft_layer[i] = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             model.buft_output = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
@@ -5242,6 +5309,50 @@ static bool llm_load_tensors(
                     layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                model.tok_embd  = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}); // word_embeddings
+                model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
+                model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
+                model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}); //LayerNorm bias
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i]; // JinaBertLayer
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd});
+
+                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, false);
+
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa});
+
+                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, false);
+
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
+                    layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}); //output_dens
+
+                    layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
+                    layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd});
+
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+
+                    layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd});
+
+                    layer.layer_out_norm   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+                    layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd});
+                }
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6318,7 +6429,7 @@ static struct ggml_tensor * llm_build_ffn(
          llm_ffn_gate_type   type_gate,
          const llm_build_cb & cb,
          int   il) {
-    struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+    struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
     cb(tmp, "ffn_up", il);
 
     if (up_b) {

@@ -6500,7 +6611,6 @@ static struct ggml_tensor * llm_build_kqv(
          struct ggml_tensor * wo_b,
          struct ggml_tensor * q_cur,
          struct ggml_tensor * kq_mask,
-         struct ggml_tensor * kq_pos,
          int32_t   n_tokens,
          int32_t   n_kv,
          float     kq_scale,

@@ -6512,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
     const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
 
     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);
@@ -6530,26 +6641,22 @@ static struct ggml_tensor * llm_build_kqv(
         GGML_UNUSED(model);
         GGML_UNUSED(n_ctx);
 
-        // note: if this assert triggers, then some check has failed earlier
-        // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
-        GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
-
         // split cached v into n_head heads (not transposed)
         struct ggml_tensor * v =
             ggml_view_3d(ctx, kv.v_l[il],
                     n_embd_head_v, n_kv, n_head_kv,
-                    ggml_row_size(kv.v_l[il]->type,
-                    ggml_row_size(kv.v_l[il]->type,
+                    ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
                     0);
         cb(v, "v", il);
 
-        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
 
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
 
-        cur = ggml_reshape_2d(ctx, cur,
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);

@@ -6574,28 +6681,8 @@ static struct ggml_tensor * llm_build_kqv(
             kq = ggml_scale(ctx, kq, 30);
         }
 
-
-
-#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
-        if (hparams.use_alibi) {
-            kq = ggml_scale(ctx, kq, kq_scale);
-            cb(kq, "kq_scaled", il);
-
-            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
-            cb(kq, "kq_scaled_alibi", il);
-
-            kq = ggml_add(ctx, kq, kq_mask);
-            cb(kq, "kq_masked", il);
-
-            kq = ggml_soft_max(ctx, kq);
-            cb(kq, "kq_soft_max", il);
-        } else
-#endif
-        {
-            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
-            cb(kq, "kq_soft_max_ext", il);
-        }
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+        cb(kq, "kq_soft_max_ext", il);
 
         GGML_ASSERT(kv.size == n_ctx);
 
@@ -6614,7 +6701,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
         cb(kqv_merged, "kqv_merged", il);
 
-        cur = ggml_cont_2d(ctx, kqv_merged,
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
         cb(cur, "kqv_merged_cont", il);
     }
 

@@ -6645,7 +6732,6 @@ static struct ggml_tensor * llm_build_kv(
          struct ggml_tensor * v_cur,
          struct ggml_tensor * q_cur,
          struct ggml_tensor * kq_mask,
-         struct ggml_tensor * kq_pos,
          int32_t   n_tokens,
          int32_t   kv_head,
          int32_t   n_kv,

@@ -6664,7 +6750,7 @@ static struct ggml_tensor * llm_build_kv(
     struct ggml_tensor * cur;
 
     cur  = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask,
+            q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
 
     return cur;
@@ -6771,18 +6857,17 @@ struct llm_build_context {
 
         ctx0 = ggml_init(params);
 
-        lctx.inp_tokens
-        lctx.inp_embd
-        lctx.inp_pos
+        lctx.inp_tokens  = nullptr;
+        lctx.inp_embd    = nullptr;
+        lctx.inp_pos     = nullptr;
         lctx.inp_out_ids = nullptr;
         lctx.inp_KQ_mask = nullptr;
-        lctx.inp_KQ_pos  = nullptr;
         lctx.inp_K_shift = nullptr;
-        lctx.inp_mean
-        lctx.inp_cls
-        lctx.inp_s_copy
-        lctx.inp_s_mask
-        lctx.inp_s_seq
+        lctx.inp_mean    = nullptr;
+        lctx.inp_cls     = nullptr;
+        lctx.inp_s_copy  = nullptr;
+        lctx.inp_s_mask  = nullptr;
+        lctx.inp_s_seq   = nullptr;
     }
 
     void free() {

@@ -6932,19 +7017,6 @@ struct llm_build_context {
         return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
     }
 
-    struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
-        if (causal) {
-            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
-        } else {
-            // TODO: this will be needed for ALiBi-based BERT models
-            // https://github.com/ggerganov/llama.cpp/pull/6826
-            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
-        }
-        cb(lctx.inp_KQ_pos, "KQ_pos", -1);
-        ggml_set_input(lctx.inp_KQ_pos);
-        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
-    }
-
     struct ggml_tensor * build_inp_mean() {
         lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
         cb(lctx.inp_mean, "inp_mean", -1);
@@ -7050,7 +7122,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {

@@ -7143,9 +7215,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 

@@ -7190,7 +7259,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {

@@ -7260,9 +7329,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 

@@ -7297,7 +7363,7 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {

@@ -7417,7 +7483,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {

@@ -7542,7 +7608,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {

@@ -7694,7 +7760,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {

@@ -7806,7 +7872,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {

@@ -8010,7 +8076,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask,
+                        Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {

@@ -8076,9 +8142,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 

@@ -8106,7 +8169,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8168,8 +8231,11 @@ struct llm_build_context {
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
+        struct ggml_tensor * inp_pos = nullptr;
 
-
+        if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+            inp_pos = build_inp_pos();
+        }
         struct ggml_tensor * inp_mean = build_inp_mean();
         struct ggml_tensor * inp_cls  = build_inp_cls();
 

@@ -8200,13 +8266,26 @@ struct llm_build_context {
             struct ggml_tensor * Vcur;
 
             // self-attention
-            if (model.arch == LLM_ARCH_BERT) {
+            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
                 Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
 
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            model.layers[il].attn_q_norm_b,
+                            LLM_NORM, cb, il);
+                }
+
                 Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
 
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            model.layers[il].attn_k_norm_b,
+                            LLM_NORM, cb, il);
+                }
                 Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
|
|
8246
8325
|
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
8247
8326
|
cb(kq, "kq", il);
|
8248
8327
|
|
8249
|
-
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask,
|
8328
|
+
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
8250
8329
|
cb(kq, "kq_soft_max_ext", il);
|
8251
8330
|
|
8252
8331
|
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
|
@@ -8297,6 +8376,13 @@ struct llm_build_context {
|
|
8297
8376
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
8298
8377
|
NULL,
|
8299
8378
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
8379
|
+
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
|
8380
|
+
cur = llm_build_ffn(ctx0, cur,
|
8381
|
+
model.layers[il].ffn_up, NULL,
|
8382
|
+
model.layers[il].ffn_gate, NULL,
|
8383
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
8384
|
+
NULL,
|
8385
|
+
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
8300
8386
|
} else {
|
8301
8387
|
cur = llm_build_ffn(ctx0, cur,
|
8302
8388
|
model.layers[il].ffn_up, NULL,
|
@@ -8363,9 +8449,6 @@ struct llm_build_context {
|
|
8363
8449
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8364
8450
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8365
8451
|
|
8366
|
-
// positions of the tokens in the KV cache
|
8367
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
8368
|
-
|
8369
8452
|
inpL = llm_build_norm(ctx0, inpL, hparams,
|
8370
8453
|
model.tok_norm,
|
8371
8454
|
model.tok_norm_b,
|
@@ -8399,7 +8482,7 @@ struct llm_build_context {
|
|
8399
8482
|
|
8400
8483
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8401
8484
|
model.layers[il].wo, model.layers[il].bo,
|
8402
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8485
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8403
8486
|
}
|
8404
8487
|
|
8405
8488
|
if (il == n_layer - 1) {
|
@@ -8464,9 +8547,6 @@ struct llm_build_context {
|
|
8464
8547
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8465
8548
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8466
8549
|
|
8467
|
-
// positions of the tokens in the KV cache
|
8468
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
8469
|
-
|
8470
8550
|
if (model.pos_embd) {
|
8471
8551
|
// inp_pos - contains the positions
|
8472
8552
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
@@ -8530,13 +8610,13 @@ struct llm_build_context {
|
|
8530
8610
|
|
8531
8611
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8532
8612
|
model.layers[il].wo, model.layers[il].bo,
|
8533
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8613
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8534
8614
|
} else {
|
8535
8615
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8536
8616
|
|
8537
8617
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8538
8618
|
model.layers[il].wo, model.layers[il].bo,
|
8539
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8619
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8540
8620
|
}
|
8541
8621
|
}
|
8542
8622
|
|
@@ -8680,7 +8760,7 @@ struct llm_build_context {
|
|
8680
8760
|
|
8681
8761
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8682
8762
|
model.layers[il].wo, NULL,
|
8683
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8763
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8684
8764
|
}
|
8685
8765
|
|
8686
8766
|
if (il == n_layer - 1) {
|
@@ -8798,7 +8878,7 @@ struct llm_build_context {
|
|
8798
8878
|
|
8799
8879
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8800
8880
|
model.layers[il].wo, NULL,
|
8801
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8881
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8802
8882
|
}
|
8803
8883
|
|
8804
8884
|
if (il == n_layer - 1) {
|
@@ -8911,7 +8991,7 @@ struct llm_build_context {
|
|
8911
8991
|
|
8912
8992
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8913
8993
|
model.layers[il].wo, model.layers[il].bo,
|
8914
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8994
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8915
8995
|
}
|
8916
8996
|
|
8917
8997
|
if (il == n_layer - 1) {
|
@@ -9025,7 +9105,7 @@ struct llm_build_context {
|
|
9025
9105
|
|
9026
9106
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9027
9107
|
model.layers[il].wo, model.layers[il].bo,
|
9028
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9108
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9029
9109
|
}
|
9030
9110
|
|
9031
9111
|
if (il == n_layer - 1) {
|
@@ -9180,7 +9260,7 @@ struct llm_build_context {
|
|
9180
9260
|
|
9181
9261
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9182
9262
|
model.layers[il].wo, model.layers[il].bo,
|
9183
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9263
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9184
9264
|
}
|
9185
9265
|
|
9186
9266
|
if (il == n_layer - 1) {
|
@@ -9297,7 +9377,7 @@ struct llm_build_context {
|
|
9297
9377
|
|
9298
9378
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9299
9379
|
model.layers[il].wo, model.layers[il].bo,
|
9300
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9380
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9301
9381
|
}
|
9302
9382
|
|
9303
9383
|
if (il == n_layer - 1) {
|
@@ -9410,7 +9490,7 @@ struct llm_build_context {
|
|
9410
9490
|
|
9411
9491
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9412
9492
|
model.layers[il].wo, NULL,
|
9413
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9493
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9414
9494
|
}
|
9415
9495
|
struct ggml_tensor * sa_out = cur;
|
9416
9496
|
|
@@ -9513,7 +9593,7 @@ struct llm_build_context {
|
|
9513
9593
|
|
9514
9594
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9515
9595
|
model.layers[il].wo, model.layers[il].bo,
|
9516
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9596
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9517
9597
|
}
|
9518
9598
|
|
9519
9599
|
if (il == n_layer - 1) {
|
@@ -9620,7 +9700,7 @@ struct llm_build_context {
|
|
9620
9700
|
|
9621
9701
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9622
9702
|
model.layers[il].wo, model.layers[il].bo,
|
9623
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9703
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9624
9704
|
}
|
9625
9705
|
|
9626
9706
|
if (il == n_layer - 1) {
|
@@ -9736,7 +9816,7 @@ struct llm_build_context {
|
|
9736
9816
|
|
9737
9817
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9738
9818
|
model.layers[il].wo, NULL,
|
9739
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9819
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9740
9820
|
}
|
9741
9821
|
|
9742
9822
|
if (il == n_layer - 1) {
|
@@ -9853,7 +9933,7 @@ struct llm_build_context {
|
|
9853
9933
|
|
9854
9934
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9855
9935
|
model.layers[il].wo, model.layers[il].bo,
|
9856
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9936
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9857
9937
|
}
|
9858
9938
|
|
9859
9939
|
if (il == n_layer - 1) {
|
@@ -9983,7 +10063,7 @@ struct llm_build_context {
|
|
9983
10063
|
|
9984
10064
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9985
10065
|
model.layers[il].wo, model.layers[il].bo,
|
9986
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10066
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9987
10067
|
}
|
9988
10068
|
|
9989
10069
|
if (il == n_layer - 1) {
|
@@ -10104,7 +10184,7 @@ struct llm_build_context {
|
|
10104
10184
|
|
10105
10185
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10106
10186
|
model.layers[il].wo, NULL,
|
10107
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10187
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
10108
10188
|
}
|
10109
10189
|
|
10110
10190
|
if (il == n_layer - 1) {
|
@@ -10223,7 +10303,7 @@ struct llm_build_context {
|
|
10223
10303
|
|
10224
10304
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10225
10305
|
model.layers[il].wo, model.layers[il].bo,
|
10226
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10306
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10227
10307
|
}
|
10228
10308
|
|
10229
10309
|
if (il == n_layer - 1) {
|
@@ -10513,7 +10593,7 @@ struct llm_build_context {
|
|
10513
10593
|
|
10514
10594
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10515
10595
|
model.layers[il].wo, model.layers[il].bo,
|
10516
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10596
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10517
10597
|
}
|
10518
10598
|
|
10519
10599
|
if (il == n_layer - 1) {
|
@@ -10644,7 +10724,7 @@ struct llm_build_context {
|
|
10644
10724
|
|
10645
10725
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10646
10726
|
model.layers[il].wo, nullptr,
|
10647
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10727
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10648
10728
|
}
|
10649
10729
|
|
10650
10730
|
if (il == n_layer - 1) {
|
@@ -10825,6 +10905,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
10825
10905
|
result = llm.build_refact();
|
10826
10906
|
} break;
|
10827
10907
|
case LLM_ARCH_BERT:
|
10908
|
+
case LLM_ARCH_JINA_BERT_V2:
|
10828
10909
|
case LLM_ARCH_NOMIC_BERT:
|
10829
10910
|
{
|
10830
10911
|
result = llm.build_bert();
|
@@ -11032,11 +11113,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
11032
11113
|
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
11033
11114
|
f = -INFINITY;
|
11034
11115
|
} else {
|
11035
|
-
|
11116
|
+
if (hparams.use_alibi) {
|
11117
|
+
f = -fabs(lctx.kv_self.cells[i].pos - pos);
|
11118
|
+
} else {
|
11119
|
+
f = 0.0f;
|
11120
|
+
}
|
11036
11121
|
}
|
11037
11122
|
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
11038
11123
|
}
|
11039
11124
|
}
|
11125
|
+
|
11126
|
+
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
11127
|
+
for (int j = 0; j < n_kv; ++j) {
|
11128
|
+
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
11129
|
+
}
|
11130
|
+
}
|
11040
11131
|
}
|
11041
11132
|
} else {
|
11042
11133
|
// when using kv cache, the mask needs to match the kv cache size
|
@@ -11055,7 +11146,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
11055
11146
|
float f = -INFINITY;
|
11056
11147
|
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
11057
11148
|
if (batch.seq_id[i][s] == seq_id) {
|
11058
|
-
|
11149
|
+
if (hparams.use_alibi) {
|
11150
|
+
f = -fabs(batch.pos[i] - batch.pos[j]);
|
11151
|
+
} else {
|
11152
|
+
f = 0.0f;
|
11153
|
+
}
|
11059
11154
|
break;
|
11060
11155
|
}
|
11061
11156
|
}
|
@@ -11071,21 +11166,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
11071
11166
|
}
|
11072
11167
|
}
|
11073
11168
|
|
11074
|
-
// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
|
11075
|
-
// this allows to process multiple sequences in parallel with ALiBi-based models
|
11076
|
-
if (hparams.use_alibi) {
|
11077
|
-
const int64_t n_kv = kv_self.n;
|
11078
|
-
|
11079
|
-
GGML_ASSERT(lctx.inp_KQ_pos);
|
11080
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
|
11081
|
-
|
11082
|
-
float * data = (float *) lctx.inp_KQ_pos->data;
|
11083
|
-
|
11084
|
-
for (int i = 0; i < n_kv; ++i) {
|
11085
|
-
data[i] = float(lctx.kv_self.cells[i].pos);
|
11086
|
-
}
|
11087
|
-
}
|
11088
|
-
|
11089
11169
|
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
11090
11170
|
const int64_t n_tokens = batch.n_tokens;
|
11091
11171
|
|
@@ -11455,7 +11535,8 @@ static int llama_decode_internal(
|
|
11455
11535
|
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
11456
11536
|
// after enough generations, the benefit from this heuristic disappears
|
11457
11537
|
// if we start defragmenting the cache, the benefit from this will be more important
|
11458
|
-
|
11538
|
+
const uint32_t pad = llama_kv_cache_get_padding(cparams);
|
11539
|
+
kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
|
11459
11540
|
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
11460
11541
|
}
|
11461
11542
|
}
|
@@ -12200,13 +12281,14 @@ struct llm_tokenizer_bpe {
|
|
12200
12281
|
|
12201
12282
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
12202
12283
|
int final_prev_index = -1;
|
12284
|
+
bool ignore_merges = false;
|
12203
12285
|
|
12204
12286
|
std::vector<std::string> word_collection;
|
12205
12287
|
switch (vocab.type) {
|
12206
12288
|
case LLAMA_VOCAB_TYPE_BPE:
|
12207
12289
|
switch (vocab.type_pre) {
|
12208
12290
|
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
12209
|
-
|
12291
|
+
ignore_merges = true;
|
12210
12292
|
word_collection = unicode_regex_split(text, {
|
12211
12293
|
// original regex from tokenizer.json
|
12212
12294
|
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
@@ -12215,6 +12297,12 @@ struct llm_tokenizer_bpe {
|
|
12215
12297
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12216
12298
|
});
|
12217
12299
|
break;
|
12300
|
+
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
12301
|
+
word_collection = unicode_regex_split(text, {
|
12302
|
+
// same as llama3
|
12303
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12304
|
+
});
|
12305
|
+
break;
|
12218
12306
|
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
12219
12307
|
word_collection = unicode_regex_split(text, {
|
12220
12308
|
"[\r\n]",
|
@@ -12298,6 +12386,11 @@ struct llm_tokenizer_bpe {
|
|
12298
12386
|
int index = 0;
|
12299
12387
|
size_t offset = 0;
|
12300
12388
|
|
12389
|
+
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
12390
|
+
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
12391
|
+
offset = word.size();
|
12392
|
+
}
|
12393
|
+
|
12301
12394
|
while (offset < word.size()) {
|
12302
12395
|
llm_symbol sym;
|
12303
12396
|
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
@@ -12483,16 +12576,16 @@ struct llm_tokenizer_wpm {
|
|
12483
12576
|
// to lowercase, pad chinese characters, pad punctuation
|
12484
12577
|
std::string new_str = "";
|
12485
12578
|
for (uint32_t code : cpts_nfd) {
|
12486
|
-
|
12487
|
-
if (
|
12579
|
+
const codepoint_flags flags = unicode_cpt_flags(code);
|
12580
|
+
if (flags.is_accent_mark || flags.is_control) {
|
12488
12581
|
continue;
|
12489
12582
|
}
|
12490
12583
|
code = unicode_tolower(code);
|
12491
|
-
if (
|
12584
|
+
if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
|
12492
12585
|
code = ' ';
|
12493
12586
|
}
|
12494
12587
|
std::string s = unicode_cpt_to_utf8(code);
|
12495
|
-
if (
|
12588
|
+
if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
|
12496
12589
|
new_str += " ";
|
12497
12590
|
new_str += s;
|
12498
12591
|
new_str += " ";
|
@@ -12726,6 +12819,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12726
12819
|
}
|
12727
12820
|
}
|
12728
12821
|
|
12822
|
+
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
12823
|
+
LLAMA_LOG_WARN(
|
12824
|
+
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
12825
|
+
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
12826
|
+
"Are you sure this is what you want?\n", __FUNCTION__);
|
12827
|
+
}
|
12828
|
+
|
12729
12829
|
if (add_special && vocab.special_add_eos == 1) {
|
12730
12830
|
GGML_ASSERT(vocab.special_eos_id != -1);
|
12731
12831
|
output.push_back(vocab.special_eos_id);
|
@@ -12752,7 +12852,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12752
12852
|
}
|
12753
12853
|
}
|
12754
12854
|
|
12755
|
-
|
12855
|
+
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
12856
|
+
LLAMA_LOG_WARN(
|
12857
|
+
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
12858
|
+
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
12859
|
+
"Are you sure this is what you want?\n", __FUNCTION__);
|
12860
|
+
}
|
12861
|
+
|
12862
|
+
if (add_special && vocab.special_add_eos == 1) {
|
12863
|
+
GGML_ASSERT(vocab.special_add_eos != -1);
|
12864
|
+
output.push_back(vocab.special_eos_id);
|
12865
|
+
}
|
12756
12866
|
} break;
|
12757
12867
|
case LLAMA_VOCAB_TYPE_WPM:
|
12758
12868
|
{
|
@@ -13106,6 +13216,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
|
13106
13216
|
return rejects;
|
13107
13217
|
}
|
13108
13218
|
|
13219
|
+
static bool llama_grammar_detect_left_recursion(
|
13220
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
13221
|
+
size_t rule_index,
|
13222
|
+
std::vector<bool> * rules_visited,
|
13223
|
+
std::vector<bool> * rules_in_progress,
|
13224
|
+
std::vector<bool> * rules_may_be_empty) {
|
13225
|
+
if ((*rules_in_progress)[rule_index]) {
|
13226
|
+
return true;
|
13227
|
+
}
|
13228
|
+
|
13229
|
+
(*rules_in_progress)[rule_index] = true;
|
13230
|
+
|
13231
|
+
const std::vector<llama_grammar_element> & rule = rules[rule_index];
|
13232
|
+
|
13233
|
+
// First check if the rule might produce the empty string. This could be done combined with the second
|
13234
|
+
// step but it's more readable as two steps.
|
13235
|
+
bool at_rule_start = true;
|
13236
|
+
for (size_t i = 0; i < rule.size(); i++) {
|
13237
|
+
if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
13238
|
+
if (at_rule_start) {
|
13239
|
+
(*rules_may_be_empty)[rule_index] = true;
|
13240
|
+
break;
|
13241
|
+
}
|
13242
|
+
at_rule_start = true;
|
13243
|
+
} else {
|
13244
|
+
at_rule_start = false;
|
13245
|
+
}
|
13246
|
+
}
|
13247
|
+
|
13248
|
+
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
|
13249
|
+
// be empty)
|
13250
|
+
bool recurse_into_nonterminal = true;
|
13251
|
+
for (size_t i = 0; i < rule.size(); i++) {
|
13252
|
+
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
|
13253
|
+
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
|
13254
|
+
return true;
|
13255
|
+
}
|
13256
|
+
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
|
13257
|
+
recurse_into_nonterminal = false;
|
13258
|
+
}
|
13259
|
+
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
13260
|
+
recurse_into_nonterminal = true;
|
13261
|
+
} else {
|
13262
|
+
recurse_into_nonterminal = false;
|
13263
|
+
}
|
13264
|
+
}
|
13265
|
+
|
13266
|
+
(*rules_in_progress)[rule_index] = false;
|
13267
|
+
(*rules_visited)[rule_index] = true;
|
13268
|
+
return false;
|
13269
|
+
}
|
13270
|
+
|
13109
13271
|
//
|
13110
13272
|
// grammar - external
|
13111
13273
|
//
|
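The new llama_grammar_detect_left_recursion is a depth-first walk over the grammar: reaching a rule that is already on the DFS stack (rules_in_progress) through a chain of leftmost nonterminals means the rule is left-recursive, and rules with an empty alternative (rules_may_be_empty) let the walk continue past them to the next leftmost symbol. A self-contained sketch of the same idea on a toy grammar representation; the types and names here are illustrative, not the llama.cpp ones:

#include <cstddef>
#include <cstdio>
#include <vector>

// Toy grammar: a rule is a list of alternatives, an alternative a list of symbols.
// A symbol is either a terminal or a reference to another rule by index.
struct Symbol {
    bool   is_rule_ref;
    size_t rule;      // valid when is_rule_ref
    char   terminal;  // valid otherwise
};
using Alternative = std::vector<Symbol>;
using Rule        = std::vector<Alternative>;

static bool detect_left_recursion(
        const std::vector<Rule> & rules,
        size_t                    rule_index,
        std::vector<bool>       & visited,
        std::vector<bool>       & in_progress,
        std::vector<bool>       & may_be_empty) {
    if (in_progress[rule_index]) {
        return true;  // already on the DFS stack -> left recursion
    }
    if (visited[rule_index]) {
        return false;
    }
    in_progress[rule_index] = true;

    for (const Alternative & alt : rules[rule_index]) {
        if (alt.empty()) {
            may_be_empty[rule_index] = true;  // empty alternative: rule may produce ""
        }
        // Recurse into leftmost nonterminals; keep going while the previous one may be empty.
        for (const Symbol & sym : alt) {
            if (!sym.is_rule_ref) {
                break;  // a terminal blocks left recursion in this alternative
            }
            if (detect_left_recursion(rules, sym.rule, visited, in_progress, may_be_empty)) {
                return true;
            }
            if (!may_be_empty[sym.rule]) {
                break;
            }
        }
    }

    in_progress[rule_index] = false;
    visited[rule_index] = true;
    return false;
}

int main() {
    // rule 0: expr ::= expr '+' term | term   (left-recursive)
    // rule 1: term ::= 'x'
    std::vector<Rule> rules = {
        { { {true, 0, 0}, {false, 0, '+'}, {true, 1, 0} }, { {true, 1, 0} } },
        { { {false, 0, 'x'} } },
    };
    std::vector<bool> visited(rules.size()), in_progress(rules.size()), may_be_empty(rules.size());
    for (size_t i = 0; i < rules.size(); i++) {
        if (!visited[i] && detect_left_recursion(rules, i, visited, in_progress, may_be_empty)) {
            printf("left recursion detected starting at rule %zu\n", i);
            return 1;
        }
    }
    printf("grammar is not left-recursive\n");
    return 0;
}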
@@ -13125,6 +13287,19 @@ struct llama_grammar * llama_grammar_init(
         vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
     }

+    // Check for left recursion
+    std::vector<bool> rules_visited(n_rules);
+    std::vector<bool> rules_in_progress(n_rules);
+    std::vector<bool> rules_may_be_empty(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        if (rules_visited[i]) {
+            continue;
+        }
+        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+            throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+        }
+    }
+
     // loop over alternates of start rule to build initial stacks
     std::vector<std::vector<const llama_grammar_element *>> stacks;
     pos = vec_rules[start_rule_index].data();
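With the check wired into llama_grammar_init, grammars that place a reference to the rule itself in leftmost position (directly, or through a chain of possibly-empty rules) are rejected with a std::runtime_error instead of looping at sampling time. Two illustrative GBNF-style grammars, assuming the usual llama.cpp grammar notation; these strings are examples, not taken from the diff:

// Left-recursive: "expr" refers to itself in leftmost position, so this grammar
// would now be rejected with
//   "unsupported grammar, left recursion detected for nonterminal at index ..."
static const char * left_recursive_gbnf = R"GBNF(
root ::= expr
expr ::= expr "+" term | term
term ::= [0-9]+
)GBNF";

// Equivalent iterative formulation that is accepted.
static const char * accepted_gbnf = R"GBNF(
root ::= expr
expr ::= term ("+" term)*
term ::= [0-9]+
)GBNF";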
@@ -13147,6 +13322,9 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);

+    // Important: vec_rules has to be moved here, not copied, because stacks contains
+    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+    // then the pointers would be invalidated when the local vec_rules goes out of scope.
     return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }

@@ -13741,9 +13919,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

     // Sample the next word X using top-k sampling
     llama_sample_top_k(nullptr, candidates, int(k), 1);
-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();

@@ -13757,9 +13933,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     // Update mu using the learning rate and error
     *mu = *mu - eta * e;

-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     return X;
 }

@@ -15246,6 +15420,7 @@ struct llama_model_params llama_model_default_params() {
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
+        /*.rpc_servers =*/ nullptr,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
@@ -15316,7 +15491,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }

 size_t llama_max_devices(void) {
-#if defined(
+#if defined(GGML_USE_RPC)
+    return GGML_RPC_MAX_SERVERS;
+#elif defined(GGML_USE_METAL)
     return 1;
 #elif defined(GGML_USE_CUDA)
     return GGML_CUDA_MAX_DEVICES;
@@ -15339,7 +15516,7 @@ bool llama_supports_mlock(void) {

 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -15402,7 +15579,17 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-
+    if (params.rpc_servers != nullptr) {
+        // split the servers set them into model->rpc_servers
+        std::string servers(params.rpc_servers);
+        size_t pos = 0;
+        while ((pos = servers.find(",")) != std::string::npos) {
+            std::string server = servers.substr(0, pos);
+            model->rpc_servers.push_back(server);
+            servers.erase(0, pos + 1);
+        }
+        model->rpc_servers.push_back(servers);
+    }
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
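llama_model_params gains an rpc_servers field: a single comma-separated string that is split into model->rpc_servers before the model is loaded, and later used to create one RPC backend per entry when the library is built with GGML_USE_RPC. A hedged usage sketch against the public API; the model path and server addresses are placeholders, and the host:port notation is an assumption about how RPC endpoints are written:

#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    // Comma-separated list; each entry is handed to the RPC backend when
    // GGML_USE_RPC is enabled. Addresses below are placeholders.
    mparams.rpc_servers = "192.168.0.10:50052,192.168.0.11:50052";

    // Placeholder model path.
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}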
@@ -15441,6 +15628,11 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }

+    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     llama_context * ctx = new llama_context(*model);

     const auto & hparams = model->hparams;
@@ -15464,7 +15656,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx = GGML_PAD(cparams.n_ctx,
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));

     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
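The context size is now padded to whatever llama_kv_cache_get_padding(cparams) reports instead of a hard-coded constant (the old value is truncated in this diff view). GGML_PAD is a round-up-to-a-multiple macro; a minimal sketch of the behaviour with an illustrative padding value, since the value returned by the new helper is not shown in this hunk:

#include <cstdint>
#include <cstdio>

// Round x up to the next multiple of n (n is a power of two in practice).
// This mirrors the round-up behaviour of ggml's GGML_PAD macro.
static uint32_t pad_up(uint32_t x, uint32_t n) {
    return (x + n - 1) / n * n;
}

int main() {
    const uint32_t requested_n_ctx = 1000;
    const uint32_t padding         = 32;  // illustrative value only
    printf("n_ctx %u -> %u after padding to a multiple of %u\n",
           requested_n_ctx, pad_up(requested_n_ctx, padding), padding);
    return 0;
}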
@@ -15509,16 +15701,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }

-    if (cparams.flash_attn && hparams.use_alibi) {
-        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
-    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
@@ -15554,7 +15736,17 @@ struct llama_context * llama_new_context_with_model(

     if (!hparams.vocab_only) {
         // initialize backends
-#
+#if defined(GGML_USE_RPC)
+        for (auto & server : model->rpc_servers) {
+            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -15710,7 +15902,11 @@ struct llama_context * llama_new_context_with_model(
         ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));

         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-        bool pipeline_parallel =
+        bool pipeline_parallel =
+            llama_get_device_count(*model) > 1 &&
+            model->n_gpu_layers > (int)model->hparams.n_layer &&
+            model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+            params.offload_kqv;
 #ifndef GGML_USE_CUDA
         // pipeline parallelism requires support for async compute and events
         // currently this is only implemented in the CUDA backend
@@ -15808,6 +16004,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
             return LLAMA_ROPE_TYPE_NONE;

         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -16829,13 +17026,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
         }
         else {
             if (cell_range_begin != kv_self.size) {
-                cell_ranges.
+                cell_ranges.emplace_back(cell_range_begin, i);
                 cell_range_begin = kv_self.size;
             }
         }
     }
     if (cell_range_begin != kv_self.size) {
-        cell_ranges.
+        cell_ranges.emplace_back(cell_range_begin, kv_self.size);
     }

     // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
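The state-serialization helper switches cell_ranges from push_back (the truncated removed lines) to emplace_back, constructing the (begin, end) element in place from the two integers instead of building a temporary first. A tiny standalone illustration; the element type here is assumed to be something like std::pair<uint32_t, uint32_t>:

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    std::vector<std::pair<uint32_t, uint32_t>> cell_ranges;

    // push_back needs a pair object (or a braced temporary) ...
    cell_ranges.push_back(std::make_pair(0u, 4u));
    // ... while emplace_back forwards the arguments straight to the pair constructor.
    cell_ranges.emplace_back(7u, 12u);

    for (const auto & range : cell_ranges) {
        printf("[%u, %u)\n", (unsigned) range.first, (unsigned) range.second);
    }
    return 0;
}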