llama_cpp 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,8 +1,3 @@
-// Defines fileno on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "llama.h"
 
 #include "ggml.h"
@@ -126,6 +121,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
     }
     s = std::move(result);
 }
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
@@ -157,6 +155,7 @@ static std::string format(const char * fmt, ...) {
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_FALCON,
+    LLM_ARCH_BAICHUAN,
     LLM_ARCH_GPT2,
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
@@ -171,6 +170,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_GPTJ, "gptj" },
     { LLM_ARCH_GPTNEOX, "gptneox" },
     { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN,"baichuan" },
 };
 
 enum llm_kv {
@@ -311,6 +311,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BAICHUAN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_FALCON,
         {
@@ -325,6 +344,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -412,6 +469,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 #elif GGML_USE_METAL
 # define llama_host_malloc(n) ggml_metal_host_malloc(n)
 # define llama_host_free(data) ggml_metal_host_free(data)
+#elif GGML_USE_CPU_HBM
+# define llama_host_malloc(n) hbw_malloc(n)
+# define llama_host_free(data) if (data != NULL) hbw_free(data)
 #else
 # define llama_host_malloc(n) malloc(n)
 # define llama_host_free(data) free(data)
@@ -568,16 +628,16 @@ struct llama_mmap {
 
         if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
         if (numa) {
             // advise the kernel not to use readahead
             // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }
@@ -620,7 +680,6 @@ struct llama_mmap {
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
-            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1446,7 +1505,11 @@ struct llama_model_loader {
         // allocate temp buffer if not using mmap
         if (!use_mmap && cur->data == NULL) {
             GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-            cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+#ifdef GGML_USE_CPU_HBM
+            cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+#else
+            cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+#endif
         }
 
         load_data_for(cur);
@@ -1600,9 +1663,13 @@ static void llm_load_hparams(
 
         GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
-        if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
-            throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+            }
         }
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
     }
 
     // arch-specific KVs
@@ -1631,6 +1698,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     };
 
@@ -1871,7 +1947,6 @@ static void llm_load_tensors(
     const int64_t n_vocab = hparams.n_vocab;
 
     const auto tn = LLM_TN(model.arch);
-
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -1914,6 +1989,72 @@ static void llm_load_tensors(
 
                 model.layers.resize(n_layer);
 
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
                 for (uint32_t i = 0; i < n_layer; ++i) {
                     const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                     const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2071,95 +2212,427 @@ static void llm_load_tensors(
                 vram_kv_cache += hparams.kv_size() / 2;
             }
         }
-#elif defined(GGML_USE_CLBLAST)
-        const int max_backend_supported_layers = hparams.n_layer + 1;
-        const int max_offloadable_layers = hparams.n_layer + 1;
-#endif // GGML_USE_CUBLAS
+#elif defined(GGML_USE_CLBLAST)
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
+                __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+        LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
+                __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
+#else
+        (void) n_gpu_layers;
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+    }
+
+    // populate `tensors_by_name`
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
+        model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+    }
+
+    (void) tensor_split;
+#if defined(GGML_USE_CUBLAS)
+    {
+        ggml_cuda_set_tensor_split(tensor_split);
+    }
+#endif
+
+    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml.mapping);
+
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+}
+
+static bool llama_model_load(
+        const std::string & fname,
+        llama_model & model,
+        int n_ctx,
+        int n_batch,
+        int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
+        const bool mul_mat_q,
+        float rope_freq_base,
+        float rope_freq_scale,
+        bool low_vram,
+        ggml_type memory_type,
+        bool use_mmap,
+        bool use_mlock,
+        bool vocab_only,
+        llama_progress_callback progress_callback,
+        void *progress_callback_user_data) {
+    try {
+        std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
+
+        llm_load_arch (*ml, model);
+        llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
+        llm_load_vocab (*ml, model);
+
+        llm_load_print_meta(*ml, model);
+
+        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+            throw std::runtime_error("vocab size mismatch");
+        }
+
+        if (vocab_only) {
+            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+            return true;
+        }
+
+        llm_load_tensors(
+                *ml, model, n_batch, n_gpu_layers,
+                main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
+                use_mlock, progress_callback, progress_callback_user_data);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
+        return false;
+    }
+
+    return true;
+}
+
+static struct ggml_cgraph * llm_build_llama(
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        int n_tokens,
+        int n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float freq_base = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
+
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
+
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
+
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(Qcur);
+            ggml_set_name(Qcur, "Qcur");
+
+            // store key and value to memory
+            {
+                // compute the transposed [N, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        ( n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                // important: storing RoPE-ed version of K in the KV cache!
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
 
-
-
-
-                __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
-#else
-        (void) n_gpu_layers;
-#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
-    }
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
 
-
-
-
-
-
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
 
-
-
-
-
-
-#endif
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
 
-
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
 
-
-
-
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
 
-
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
 
-
-
-
-    }
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
 
-
-
-
-
-
-
-        int main_gpu,
-        const float * tensor_split,
-        const bool mul_mat_q,
-        float rope_freq_base,
-        float rope_freq_scale,
-        bool low_vram,
-        ggml_type memory_type,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
-    try {
-        std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
 
-
-
-
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
 
-
+        // input for next layer
+        inpL = cur;
+    }
 
-
-            throw std::runtime_error("vocab size mismatch");
-        }
+    cur = inpL;
 
-
-
-
-
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
 
-
-
-
-
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
     }
 
-
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
 }
 
-
+
+static struct ggml_cgraph * llm_build_baichaun(
         llama_context & lctx,
         const llama_token * tokens,
         const float * embd,
@@ -2300,11 +2773,24 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Qcur;
+            switch (model.type) {
+                case MODEL_7B:
+                    Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    break;
+                case MODEL_13B:
+                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
+                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
@@ -2359,10 +2845,26 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
+            struct ggml_tensor * KQ_masked;
+            struct ggml_tensor * KQ_scaled_alibi;
+
+            switch (model.type) {
+                case MODEL_7B:
+                    KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+                    break;
+                case MODEL_13B:
+                    KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+                    ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+                    KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-            offload_func_kq(KQ_masked);
-            ggml_set_name(KQ_masked, "KQ_masked");
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+            // offload_func_kq(KQ_masked);
+            // ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
@@ -2812,6 +3314,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         case LLM_ARCH_FALCON:
             {
                 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -2895,7 +3401,12 @@ static bool llama_eval_internal(
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
+    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+    // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+    // with the BLAS calls. need a better solution
+    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+        n_threads = std::min(4, n_threads);
+    }
 
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -3000,33 +3511,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-    return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
@@ -3087,10 +3575,9 @@ struct llm_tokenizer_spm {
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs += len;
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
@@ -3319,9 +3806,15 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-
+                        try {
+                            llama_token token_byte = llama_byte_to_token(vocab, *j);
+                            output.push_back(token_byte);
+                        } catch (const std::out_of_range & err) {
+                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        }
+                    } else {
+                        output.push_back((*token_multibyte).second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -3595,7 +4088,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.push_back(stack);
+        new_stacks.emplace_back(stack);
         return;
     }
 
@@ -3632,7 +4125,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.push_back(stack);
+            new_stacks.emplace_back(stack);
             break;
         default:
            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +4290,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
     delete grammar;
 }
 
+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
 //
 // sampling
 //
@@ -4388,7 +4900,7 @@ struct llama_logit_info {
        }
        return min_heap;
    }
-    float probability_from_logit(float logit) {
+    float probability_from_logit(float logit) const {
        return normalizer * std::exp(logit - max_l);
    }
 };
@@ -4581,7 +5093,16 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
-static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
@@ -4616,7 +5137,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    std::vector<std::thread> workers;
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4629,14 +5149,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
            qtype.to_float(inbuf, outbuf, nels);
        }
    };
-        workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
-    for (auto & worker : workers) {
-        worker.join();
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+}
+
+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
     }
+
+    return new_type;
 }
+#endif
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
@@ -4678,6 +5307,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);
 
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4717,16 +5350,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;
 
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-
     int idx = 0;
 
-    std::vector<uint8_t> read_data;
-    std::vector<uint8_t> work;
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4748,7 +5379,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        read_data.resize(ggml_nbytes(tensor));
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
@@ -4764,137 +5397,50 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
4764
5397
|
// quantize only 2D tensors
|
4765
5398
|
quantize &= (tensor->n_dims == 2);
|
4766
5399
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
4767
|
-
quantize &=
|
5400
|
+
quantize &= !params->only_copy;
|
4768
5401
|
|
4769
5402
|
enum ggml_type new_type;
|
4770
5403
|
void * new_data;
|
4771
5404
|
size_t new_size;
|
4772
5405
|
|
5406
|
+
if (quantize) {
|
5407
|
+
new_type = quantized_type;
|
5408
|
+
#ifdef GGML_USE_K_QUANTS
|
5409
|
+
new_type = get_k_quant_type(
|
5410
|
+
new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
|
5411
|
+
);
|
5412
|
+
#endif
|
5413
|
+
// If we've decided to quantize to the same type the tensor is already
|
5414
|
+
// in then there's nothing to do.
|
5415
|
+
quantize = tensor->type != new_type;
|
5416
|
+
}
|
4773
5417
|
if (!quantize) {
|
4774
5418
|
new_type = tensor->type;
|
4775
5419
|
new_data = tensor->data;
|
4776
5420
|
new_size = ggml_nbytes(tensor);
|
4777
5421
|
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
4778
5422
|
} else {
|
4779
|
-
new_type = quantized_type;
|
4780
|
-
#ifdef GGML_USE_K_QUANTS
|
4781
|
-
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
4782
|
-
const auto tn = LLM_TN(ml->get_arch());
|
4783
|
-
|
4784
|
-
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
4785
|
-
int nx = tensor->ne[0];
|
4786
|
-
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
4787
|
-
new_type = GGML_TYPE_Q8_0;
|
4788
|
-
}
|
4789
|
-
else if (new_type != GGML_TYPE_Q8_0) {
|
4790
|
-
new_type = GGML_TYPE_Q6_K;
|
4791
|
-
}
|
4792
|
-
} else if (name.find("attn_v.weight") != std::string::npos) {
|
4793
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
4794
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
4795
|
-
new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
4796
|
-
}
|
4797
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
4798
|
-
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
4799
|
-
use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
4800
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
4801
|
-
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
4802
|
-
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
4803
|
-
if (model.type == MODEL_70B) {
|
4804
|
-
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
4805
|
-
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
4806
|
-
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
4807
|
-
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
|
4808
|
-
}
|
4809
|
-
++i_attention_wv;
|
4810
|
-
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
4811
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
4812
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
4813
|
-
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
|
4814
|
-
-                          : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                          : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-            } else {
-                if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-            }
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        ++i_feed_forward_w2;
-    } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-        }
-    }
-    else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-    }
-    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    }
-    // This can be used to reduce the size of the Q5_K_S model.
-    // The associated PPL increase is fully in line with the size reduction
-    //else {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-    //}
-    bool convert_incompatible_tensor = false;
-    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-        int nx = tensor->ne[0];
-        int ny = tensor->ne[1];
-        if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-            convert_incompatible_tensor = true;
-        }
-    }
-    if (convert_incompatible_tensor) {
-        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-        } else {
-            throw std::runtime_error("Unsupported tensor size encountered\n");
-        }
-    }
-#endif
-
     const size_t nelements = ggml_nelements(tensor);

     float * f32_data;
-    std::vector<float> f32_conv_buf;

     if (tensor->type == GGML_TYPE_F32) {
         f32_data = (float *) tensor->data;
     } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
         throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
     } else {
-        llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+        llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
         f32_data = (float *) f32_conv_buf.data();
     }

     LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
     fflush(stdout);

-    work.
+    if (work.size() < nelements * 4) {
+        work.resize(nelements * 4); // upper bound on size
+    }
     new_data = work.data();
-    std::
+    std::array<int64_t, 1 << 4> hist_cur = {};

     static const int chunk_size = 32 * 512;
     const int nchunk = (nelements + chunk_size - 1)/chunk_size;
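Note (illustrative, not part of the diff): the removed block's last safeguard rejects k-quant types whenever a tensor's row length is not a multiple of the super-block size and falls back to F16 or Q4_0. A minimal standalone sketch of that divisibility rule, assuming the usual QK_K = 256 super-block size; the helper name is hypothetical:

    #include <cstdio>

    // Assumed k-quant super-block size (QK_K in k_quants.h is normally 256).
    constexpr int QK_K = 256;

    // Hypothetical helper mirroring the removed check: a row can only be stored
    // in a k-quant type when its length divides evenly into QK_K super-blocks.
    static bool is_k_quant_compatible(int n_cols) {
        return n_cols % QK_K == 0;
    }

    int main() {
        std::printf("4096 cols -> %s\n", is_k_quant_compatible(4096) ? "k-quants ok" : "fall back");
        std::printf("4100 cols -> %s\n", is_k_quant_compatible(4100) ? "k-quants ok" : "fall back");
        return 0;
    }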
@@ -4905,13 +5451,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t counter = 0;
     new_size = 0;
     auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-        std::
+        std::array<int64_t, 1 << 4> local_hist = {};
         size_t local_size = 0;
         while (true) {
             std::unique_lock<std::mutex> lock(mutex);
             size_t first = counter; counter += chunk_size;
             if (first >= nelements) {
-                if (
+                if (local_size > 0) {
                     for (int j=0; j<int(local_hist.size()); ++j) {
                         hist_cur[j] += local_hist[j];
                     }
@@ -4921,22 +5467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
             lock.unlock();
             size_t last = std::min(nelements, first + chunk_size);
-            if (local_hist.empty()) {
-                local_hist.resize(hist_cur.size(), 0);
-            }
             local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
         }
     };
-    if ((int) workers.size() < nthread_use - 1) {
-        workers.resize(nthread_use - 1);
-    }
     for (int it = 0; it < nthread_use - 1; ++it) {
-        workers
+        workers.emplace_back(compute);
     }
     compute();
-    for (
-
-    }
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
 }

 LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
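Note (illustrative, not part of the diff): the two hunks above replace the per-thread heap-allocated histograms with fixed-size std::array buffers and reuse a single `workers` vector across tensors instead of resizing a pool every time. A minimal sketch of the same chunked work-sharing pattern (mutex-protected counter, thread-local accumulation), with a dummy payload standing in for ggml_quantize_chunk:

    #include <algorithm>
    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
        const size_t nelements  = 1000000;
        const size_t chunk_size = 32 * 512;
        const int    nthread    = 4;

        std::mutex mutex;
        size_t counter = 0;                  // next element index to hand out
        std::array<int64_t, 16> hist = {};   // shared histogram (1 << 4 bins, as above)

        auto compute = [&]() {
            std::array<int64_t, 16> local_hist = {};
            while (true) {
                size_t first;
                {
                    std::lock_guard<std::mutex> lock(mutex);
                    first    = counter;
                    counter += chunk_size;
                    if (first >= nelements) {
                        // merge the thread-local histogram exactly once, then exit
                        for (size_t j = 0; j < hist.size(); ++j) hist[j] += local_hist[j];
                        return;
                    }
                }
                const size_t last = std::min(nelements, first + chunk_size);
                for (size_t i = first; i < last; ++i) {
                    local_hist[i % local_hist.size()] += 1;   // dummy per-element work
                }
            }
        };

        std::vector<std::thread> workers;
        for (int it = 0; it < nthread - 1; ++it) workers.emplace_back(compute);
        compute();                                   // the calling thread works too
        for (auto & w : workers) w.join();
        workers.clear();                             // the vector could now be reused

        int64_t total = 0;
        for (int64_t v : hist) total += v;
        std::printf("processed %lld elements\n", (long long) total);
        return 0;
    }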
@@ -5279,7 +5818,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
-        /*.
+        /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.rope_freq_base =*/ 10000.0f,
@@ -5296,6 +5835,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding =*/ false,
     };

+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }

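Note (illustrative): with the change above, Metal builds default to offloading one layer (n_gpu_layers = 1), so CPU-only callers now have to opt out explicitly. A hedged usage sketch against the C API of this release; nothing here is required by the diff itself:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_context_params params = llama_context_default_params();

        // On GGML_USE_METAL builds the default is now 1; print it, then force a
        // CPU-only configuration by overriding the field before creating a context.
        std::printf("default n_gpu_layers = %d\n", params.n_gpu_layers);
        params.n_gpu_layers = 0;

        return 0;
    }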
@@ -5305,6 +5848,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize =*/ false,
         /*.quantize_output_tensor =*/ true,
+        /*.only_copy =*/ false,
     };

     return result;
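Note (illustrative): only_copy is a new field and defaults to false, so existing callers keep requantizing as before. A hedged sketch of driving the quantization entry point with default parameters; file names are placeholders and the field set is assumed from llama.h of this release:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();

        qparams.nthread = 4;                          // 0 would mean "use all hardware threads"
        qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // target quant mixture
        // qparams.only_copy stays false: tensors are requantized, not just copied.

        const int rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
        if (rc != 0) {
            std::fprintf(stderr, "quantization failed (%d)\n", rc);
            return 1;
        }
        return 0;
    }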
@@ -5487,43 +6031,43 @@ struct llama_context * llama_new_context_with_model(
             }
 #endif
         }
-    }

 #ifdef GGML_USE_METAL
-
-
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers

-
-
+        void * data_ptr = NULL;
+        size_t data_size = 0;

-
-
-
-
-
-
-
+        if (params.use_mmap) {
+            data_ptr = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
+        } else {
+            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size (ctx->model.ctx);
+        }

-
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result) \
-
-
-
-
-
+        if (!(result)) { \
+            LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+            llama_free(ctx); \
+            return NULL; \
+        }

-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

-
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));

-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-
+    }
 #endif
+    }

 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
@@ -5559,15 +6103,19 @@ void llama_free(struct llama_context * ctx) {
 }

 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_vocab(&ctx->model);
 }

 int llama_n_ctx(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_ctx(&ctx->model);
+}
+
+int llama_n_ctx_train(const struct llama_context * ctx) {
+    return llama_model_n_ctx_train(&ctx->model);
 }

 int llama_n_embd(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_embd(&ctx->model);
 }

 enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5582,6 +6130,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
     return model->hparams.n_ctx;
 }

+int llama_model_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
 int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
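Note (illustrative): the context-level getters now forward to the model-level ones, and both gain an n_ctx_train accessor reporting the context length the model was trained with, as opposed to the one requested at context creation. A hedged sketch of querying both, assuming the 0.5.x-era loading functions; the model path is a placeholder:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_context_params params = llama_context_default_params();
        params.n_ctx = 2048;   // runtime context size requested by the caller

        llama_model * model = llama_load_model_from_file("model.gguf", params);
        if (model == NULL) {
            return 1;
        }

        llama_context * ctx = llama_new_context_with_model(model, params);
        if (ctx == NULL) {
            llama_free_model(model);
            return 1;
        }

        std::printf("n_ctx (runtime)   = %d\n", llama_n_ctx(ctx));
        std::printf("n_ctx_train (new) = %d\n", llama_n_ctx_train(ctx));
        std::printf("n_embd            = %d\n", llama_n_embd(ctx));

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }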
@@ -5857,7 +6409,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         rng_ss.str(std::string(&rng_buf[0], rng_size));
         rng_ss >> ctx->rng;

-        GGML_ASSERT(rng_ss.fail()
+        GGML_ASSERT(!rng_ss.fail());
     }

     // set logits
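Note (illustrative): the assertion above was inverted; after reading the serialized RNG back, the stream must not be in a failed state. A small standalone sketch of the same round trip with std::mt19937:

    #include <cassert>
    #include <random>
    #include <sstream>
    #include <string>

    int main() {
        std::mt19937 rng(1234);

        // serialize the RNG state to a string (what the saved state buffer holds)
        std::ostringstream out;
        out << rng;
        const std::string serialized = out.str();

        // restore it from the string, then verify the extraction succeeded
        std::istringstream in(serialized);
        std::mt19937 restored;
        in >> restored;

        assert(!in.fail());        // the corrected check
        assert(restored == rng);   // both engines now produce the same sequence
        return 0;
    }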
@@ -6136,7 +6688,7 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize_internal(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }

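Note (illustrative): with the log silenced, a negative return value is the supported way to discover how large the token buffer must be. A hedged sketch of the resulting two-pass call pattern, assuming the 0.5.x-era signature llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos):

    #include "llama.h"
    #include <vector>

    // Tokenize `text`, growing the buffer when the first call reports it was too small.
    static std::vector<llama_token> tokenize(llama_context * ctx, const char * text, bool add_bos) {
        std::vector<llama_token> tokens(64);
        int n = llama_tokenize(ctx, text, tokens.data(), (int) tokens.size(), add_bos);
        if (n < 0) {
            // -n is the required token count; no error is logged for this case anymore
            tokens.resize((size_t) -n);
            n = llama_tokenize(ctx, text, tokens.data(), (int) tokens.size(), add_bos);
        }
        tokens.resize(n > 0 ? (size_t) n : 0);
        return tokens;
    }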