llama_cpp 0.5.0 → 0.5.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,8 +1,3 @@
-// Defines fileno on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "llama.h"

 #include "ggml.h"
@@ -126,6 +121,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
 }
 s = std::move(result);
 }
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif

 static void zeros(std::ofstream & file, size_t n) {
 char zero = 0;
@@ -157,6 +155,7 @@ static std::string format(const char * fmt, ...) {
 enum llm_arch {
 LLM_ARCH_LLAMA,
 LLM_ARCH_FALCON,
+LLM_ARCH_BAICHUAN,
 LLM_ARCH_GPT2,
 LLM_ARCH_GPTJ,
 LLM_ARCH_GPTNEOX,
@@ -171,6 +170,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
 { LLM_ARCH_GPTJ, "gptj" },
 { LLM_ARCH_GPTNEOX, "gptneox" },
 { LLM_ARCH_MPT, "mpt" },
+{ LLM_ARCH_BAICHUAN,"baichuan" },
 };

 enum llm_kv {
@@ -311,6 +311,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_BAICHUAN,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_FALCON,
 {
@@ -325,6 +344,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_GPT2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+},
+},
+{
+LLM_ARCH_GPTJ,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+},
+},
+{
+LLM_ARCH_GPTNEOX,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
+{
+LLM_ARCH_MPT,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+},
+},
+{
+LLM_ARCH_UNKNOWN,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+},
+},
 };

 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -412,6 +469,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 #elif GGML_USE_METAL
 # define llama_host_malloc(n) ggml_metal_host_malloc(n)
 # define llama_host_free(data) ggml_metal_host_free(data)
+#elif GGML_USE_CPU_HBM
+# define llama_host_malloc(n) hbw_malloc(n)
+# define llama_host_free(data) if (data != NULL) hbw_free(data)
 #else
 # define llama_host_malloc(n) malloc(n)
 # define llama_host_free(data) free(data)
@@ -568,16 +628,16 @@ struct llama_mmap {

 if (prefetch > 0) {
 // Advise the kernel to preload the mapped memory
-if (
-fprintf(stderr, "warning:
+if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
 strerror(errno));
 }
 }
 if (numa) {
 // advise the kernel not to use readahead
 // (because the next page might not belong on the same node)
-if (
-fprintf(stderr, "warning:
+if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
 strerror(errno));
 }
 }
@@ -620,7 +680,6 @@ struct llama_mmap {
 if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
 llama_format_win_err(GetLastError()).c_str());
-}
 }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1446,7 +1505,11 @@ struct llama_model_loader {
 // allocate temp buffer if not using mmap
 if (!use_mmap && cur->data == NULL) {
 GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-
+#ifdef GGML_USE_CPU_HBM
+cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+#else
+cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+#endif
 }

 load_data_for(cur);
@@ -1600,9 +1663,13 @@ static void llm_load_hparams(

 GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));

-if (
-
+if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+}
 }
+// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+// gpt-j n_rot = rotary_dim
 }

 // arch-specific KVs
@@ -1631,6 +1698,15 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_BAICHUAN:
+{
+GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_13B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 };

@@ -1871,7 +1947,6 @@ static void llm_load_tensors(
 const int64_t n_vocab = hparams.n_vocab;

 const auto tn = LLM_TN(model.arch);
-
 switch (model.arch) {
 case LLM_ARCH_LLAMA:
 {
@@ -1914,6 +1989,72 @@ static void llm_load_tensors(

 model.layers.resize(n_layer);

+for (uint32_t i = 0; i < n_layer; ++i) {
+const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+if (backend == GGML_BACKEND_GPU) {
+vram_weights +=
+ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+}
+}
+} break;
+case LLM_ARCH_BAICHUAN:
+{
+model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+{
+ggml_backend backend_norm;
+ggml_backend backend_output;
+
+if (n_gpu_layers > int(n_layer)) {
+// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+// on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+} else {
+backend_norm = GGML_BACKEND_CPU;
+backend_output = GGML_BACKEND_CPU;
+}
+
+model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+if (backend_norm == GGML_BACKEND_GPU) {
+vram_weights += ggml_nbytes(model.output_norm);
+}
+if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+vram_weights += ggml_nbytes(model.output);
+}
+}
+
+const uint32_t n_ff = hparams.n_ff;
+
+const int i_gpu_start = n_layer - n_gpu_layers;
+
+model.layers.resize(n_layer);
+
 for (uint32_t i = 0; i < n_layer; ++i) {
 const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
 const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2071,95 +2212,427 @@ static void llm_load_tensors(
 vram_kv_cache += hparams.kv_size() / 2;
 }
 }
-#elif defined(GGML_USE_CLBLAST)
-const int max_backend_supported_layers = hparams.n_layer + 1;
-const int max_offloadable_layers = hparams.n_layer + 1;
-#endif // GGML_USE_CUBLAS
+#elif defined(GGML_USE_CLBLAST)
+const int max_backend_supported_layers = hparams.n_layer + 1;
+const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
+LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
+__func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
+__func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
+#else
+(void) n_gpu_layers;
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+}
+
+// populate `tensors_by_name`
+for (int i = 0; i < ml.n_tensors; ++i) {
+struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
+model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+}
+
+(void) tensor_split;
+#if defined(GGML_USE_CUBLAS)
+{
+ggml_cuda_set_tensor_split(tensor_split);
+}
+#endif
+
+ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
+
+if (progress_callback) {
+progress_callback(1.0f, progress_callback_user_data);
+}
+
+model.mapping = std::move(ml.mapping);
+
+// loading time will be recalculate after the first eval, so
+// we take page faults deferred by mmap() into consideration
+model.t_load_us = ggml_time_us() - model.t_start_us;
+}
+
+static bool llama_model_load(
+const std::string & fname,
+llama_model & model,
+int n_ctx,
+int n_batch,
+int n_gpu_layers,
+int main_gpu,
+const float * tensor_split,
+const bool mul_mat_q,
+float rope_freq_base,
+float rope_freq_scale,
+bool low_vram,
+ggml_type memory_type,
+bool use_mmap,
+bool use_mlock,
+bool vocab_only,
+llama_progress_callback progress_callback,
+void *progress_callback_user_data) {
+try {
+std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
+
+llm_load_arch (*ml, model);
+llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
+llm_load_vocab (*ml, model);
+
+llm_load_print_meta(*ml, model);
+
+if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+throw std::runtime_error("vocab size mismatch");
+}
+
+if (vocab_only) {
+LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+return true;
+}
+
+llm_load_tensors(
+*ml, model, n_batch, n_gpu_layers,
+main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
+use_mlock, progress_callback, progress_callback_user_data);
+} catch (const std::exception & err) {
+LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
+return false;
+}
+
+return true;
+}
+
+static struct ggml_cgraph * llm_build_llama(
+llama_context & lctx,
+const llama_token * tokens,
+const float * embd,
+int n_tokens,
+int n_past) {
+
+GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+const int N = n_tokens;
+
+const auto & model = lctx.model;
+const auto & hparams = model.hparams;
+
+const auto & kv_self = lctx.kv_self;
+
+GGML_ASSERT(!!kv_self.ctx);
+
+const int64_t n_embd = hparams.n_embd;
+const int64_t n_layer = hparams.n_layer;
+const int64_t n_ctx = hparams.n_ctx;
+const int64_t n_head = hparams.n_head;
+const int64_t n_head_kv = hparams.n_head_kv;
+const int64_t n_embd_head = hparams.n_embd_head();
+const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+const float freq_base = hparams.rope_freq_base;
+const float freq_scale = hparams.rope_freq_scale;
+const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+const int n_gpu_layers = model.n_gpu_layers;
+
+auto & buf_compute = lctx.buf_compute;
+
+struct ggml_init_params params = {
+/*.mem_size =*/ buf_compute.size,
+/*.mem_buffer =*/ buf_compute.data,
+/*.no_alloc =*/ false,
+};
+
+params.no_alloc = true;
+
+struct ggml_context * ctx0 = ggml_init(params);
+
+ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * inpL;
+
+if (tokens) {
+struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+ggml_allocr_alloc(lctx.alloc, inp_tokens);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+}
+ggml_set_name(inp_tokens, "inp_tokens");
+
+inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+} else {
+#ifdef GGML_USE_MPI
+GGML_ASSERT(false && "not implemented");
+#endif
+
+inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+ggml_allocr_alloc(lctx.alloc, inpL);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+}
+}
+
+const int i_gpu_start = n_layer - n_gpu_layers;
+(void) i_gpu_start;
+
+// offload functions set the tensor output backend to GPU
+// tensors are GPU-accelerated if any input or the output has been offloaded
+//
+// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+// in that case ggml_cuda_assign_buffers has no effect
+offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+offload_func_t offload_func_kq = llama_nop;
+offload_func_t offload_func_v = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+if (n_gpu_layers > n_layer) {
+offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+}
+if (n_gpu_layers > n_layer + 1) {
+offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+}
+if (n_gpu_layers > n_layer + 2) {
+offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+}
+#endif // GGML_USE_CUBLAS
+
+struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ggml_allocr_alloc(lctx.alloc, KQ_scale);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+}
+ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+for (int il = 0; il < n_layer; ++il) {
+ggml_format_name(inpL, "layer_inp_%d", il);
+
+offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+if (il >= i_gpu_start) {
+offload_func = ggml_cuda_assign_buffers_no_alloc;
+}
+#endif // GGML_USE_CUBLAS
+
+struct ggml_tensor * inpSA = inpL;
+
+// norm
+{
+cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+offload_func(cur);
+ggml_set_name(cur, "rms_norm_0");
+
+// cur = cur*attn_norm(broadcasted)
+cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+offload_func(cur);
+ggml_set_name(cur, "attention_norm_0");
+}
+
+// self-attention
+{
+// compute Q and K and RoPE them
+struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+offload_func_kq(tmpk);
+ggml_set_name(tmpk, "tmpk");
+
+struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+offload_func_kq(tmpq);
+ggml_set_name(tmpq, "tmpq");
+
+struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+offload_func_kq(Kcur);
+ggml_set_name(Kcur, "Kcur");
+
+struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+offload_func_kq(Qcur);
+ggml_set_name(Qcur, "Qcur");
+
+// store key and value to memory
+{
+// compute the transposed [N, n_embd] V matrix
+
+struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+offload_func_v(tmpv);
+ggml_set_name(tmpv, "tmpv");
+
+struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+offload_func_v(Vcur);
+ggml_set_name(Vcur, "Vcur");
+
+struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+offload_func_kq(k);
+ggml_set_name(k, "k");
+
+struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+( n_ctx)*ggml_element_size(kv_self.v),
+(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+offload_func_v(v);
+ggml_set_name(v, "v");
+
+// important: storing RoPE-ed version of K in the KV cache!
+ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+}
+
+struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+offload_func_kq(Q);
+ggml_set_name(Q, "Q");
+
+struct ggml_tensor * K =
+ggml_view_3d(ctx0, kv_self.k,
+n_embd_head, n_past + N, n_head_kv,
+ggml_element_size(kv_self.k)*n_embd_gqa,
+ggml_element_size(kv_self.k)*n_embd_head,
+ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+offload_func_kq(K);
+ggml_set_name(K, "K");
+
+// K * Q
+struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+offload_func_kq(KQ);
+ggml_set_name(KQ, "KQ");
+
+// KQ_scaled = KQ / sqrt(n_embd_head)
+// KQ_scaled shape [n_past + N, N, n_head, 1]
+struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+offload_func_kq(KQ_scaled);
+ggml_set_name(KQ_scaled, "KQ_scaled");
+
+// KQ_masked = mask_past(KQ_scaled)
+struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+offload_func_kq(KQ_masked);
+ggml_set_name(KQ_masked, "KQ_masked");
+
+// KQ = soft_max(KQ_masked)
+struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+offload_func_v(KQ_soft_max);
+ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+// split cached V into n_head heads
+struct ggml_tensor * V =
+ggml_view_3d(ctx0, kv_self.v,
+n_past + N, n_embd_head, n_head_kv,
+ggml_element_size(kv_self.v)*n_ctx,
+ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+offload_func_v(V);
+ggml_set_name(V, "V");
+
+#if 1
+struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+offload_func_v(KQV);
+ggml_set_name(KQV, "KQV");
+#else
+// make V contiguous in memory to speed up the matmul, however we waste time on the copy
+// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+// is there a better way?
+struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+// KQV_merged = KQV.permute(0, 2, 1, 3)
+struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+offload_func_v(KQV_merged);
+ggml_set_name(KQV_merged, "KQV_merged");
+
+// cur = KQV_merged.contiguous().view(n_embd, N)
+cur = ggml_cpy(ctx0,
+KQV_merged,
+ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+offload_func_v(cur);
+ggml_set_name(cur, "KQV_merged_contiguous");
+
+// projection (no bias)
+cur = ggml_mul_mat(ctx0,
+model.layers[il].wo,
+cur);
+offload_func(cur);
+ggml_set_name(cur, "result_wo");
+}

-
-
-
-__func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
-#else
-(void) n_gpu_layers;
-#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
-}
+struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+offload_func(inpFF);
+ggml_set_name(inpFF, "inpFF");

-
-
-
-
-
+// feed-forward network
+{
+// norm
+{
+cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+offload_func(cur);
+ggml_set_name(cur, "rms_norm_1");

-
-
-
-
-
-#endif
+// cur = cur*ffn_norm(broadcasted)
+cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+offload_func(cur);
+ggml_set_name(cur, "ffn_norm");
+}

-
+struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+model.layers[il].w3,
+cur);
+offload_func(tmp);
+ggml_set_name(tmp, "result_w3");

-
-
-
+cur = ggml_mul_mat(ctx0,
+model.layers[il].w1,
+cur);
+offload_func(cur);
+ggml_set_name(cur, "result_w1");

-
+// SILU activation
+cur = ggml_silu(ctx0, cur);
+offload_func(cur);
+ggml_set_name(cur, "silu");

-
-
-
-}
+cur = ggml_mul(ctx0, cur, tmp);
+offload_func(cur);
+ggml_set_name(cur, "silu_x_result_w3");

-
-
-
-
-
-
-int main_gpu,
-const float * tensor_split,
-const bool mul_mat_q,
-float rope_freq_base,
-float rope_freq_scale,
-bool low_vram,
-ggml_type memory_type,
-bool use_mmap,
-bool use_mlock,
-bool vocab_only,
-llama_progress_callback progress_callback,
-void *progress_callback_user_data) {
-try {
-std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
+cur = ggml_mul_mat(ctx0,
+model.layers[il].w2,
+cur);
+offload_func(cur);
+ggml_set_name(cur, "result_w2");
+}

-
-
-
+cur = ggml_add(ctx0, cur, inpFF);
+offload_func(cur);
+ggml_set_name(cur, "inpFF_+_result_w2");

-
+// input for next layer
+inpL = cur;
+}

-
-throw std::runtime_error("vocab size mismatch");
-}
+cur = inpL;

-
-
-
-
+// norm
+{
+cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+offload_func_nr(cur);
+ggml_set_name(cur, "rms_norm_2");

-
-
-
-
-} catch (const std::exception & err) {
-LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-return false;
+// cur = cur*norm(broadcasted)
+cur = ggml_mul(ctx0, cur, model.output_norm);
+// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+ggml_set_name(cur, "result_norm");
 }

-
+// lm_head
+cur = ggml_mul_mat(ctx0, model.output, cur);
+ggml_set_name(cur, "result_output");
+
+ggml_build_forward_expand(gf, cur);
+
+ggml_free(ctx0);
+
+return gf;
 }

-
+
+static struct ggml_cgraph * llm_build_baichaun(
 llama_context & lctx,
 const llama_token * tokens,
 const float * embd,
@@ -2300,11 +2773,24 @@ static struct ggml_cgraph * llm_build_llama(
 offload_func_kq(tmpq);
 ggml_set_name(tmpq, "tmpq");

-struct ggml_tensor * Kcur
+struct ggml_tensor * Kcur;
+struct ggml_tensor * Qcur;
+switch (model.type) {
+case MODEL_7B:
+Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+break;
+case MODEL_13B:
+Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
+Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+break;
+default:
+GGML_ASSERT(false);
+}
+
 offload_func_kq(Kcur);
 ggml_set_name(Kcur, "Kcur");

-struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
 offload_func_kq(Qcur);
 ggml_set_name(Qcur, "Qcur");

@@ -2359,10 +2845,26 @@ static struct ggml_cgraph * llm_build_llama(
 offload_func_kq(KQ_scaled);
 ggml_set_name(KQ_scaled, "KQ_scaled");

+struct ggml_tensor * KQ_masked;
+struct ggml_tensor * KQ_scaled_alibi;
+
+switch (model.type) {
+case MODEL_7B:
+KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+break;
+case MODEL_13B:
+KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+break;
+default:
+GGML_ASSERT(false);
+}
 // KQ_masked = mask_past(KQ_scaled)
-struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-
+// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+// offload_func_kq(KQ_masked);
+// ggml_set_name(KQ_masked, "KQ_masked");

 // KQ = soft_max(KQ_masked)
 struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
@@ -2812,6 +3314,10 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
 } break;
+case LLM_ARCH_BAICHUAN:
+{
+result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+} break;
 case LLM_ARCH_FALCON:
 {
 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -2895,7 +3401,12 @@ static bool llama_eval_internal(

 // for big prompts, if BLAS is enabled, it is better to use only one thread
 // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
+// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+// with the BLAS calls. need a better solution
+if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+n_threads = std::min(4, n_threads);
+}

 struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -3000,33 +3511,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
 return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }

-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
 return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-GGML_ASSERT(llama_is_control_token(vocab, id));
-return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-GGML_ASSERT(llama_is_control_token(vocab, id));
-return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
 GGML_ASSERT(llama_is_byte_token(vocab, id));
 const auto& token_data = vocab.id_to_token.at(id);
@@ -3087,10 +3575,9 @@ struct llm_tokenizer_spm {
 while (offs < text.size()) {
 llm_symbol sym;
 size_t len = utf8_len(text[offs]);
-GGML_ASSERT(offs + len <= text.size());
 sym.text = text.c_str() + offs;
-sym.n = len;
-offs +=
+sym.n = std::min(len, text.size() - offs);
+offs += sym.n;
 sym.prev = index - 1;
 sym.next = offs == text.size() ? -1 : index + 1;
 index++;
@@ -3319,9 +3806,15 @@ struct llm_tokenizer_bpe {
 std::string byte_str(1, *j);
 auto token_multibyte = vocab.token_to_id.find(byte_str);
 if (token_multibyte == vocab.token_to_id.end()) {
-
+try {
+llama_token token_byte = llama_byte_to_token(vocab, *j);
+output.push_back(token_byte);
+} catch (const std::out_of_range & err) {
+fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+}
+} else {
+output.push_back((*token_multibyte).second);
 }
-output.push_back((*token_multibyte).second);
 }
 } else {
 output.push_back((*token).second);
@@ -3595,7 +4088,7 @@ static void llama_grammar_advance_stack(
 std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

 if (stack.empty()) {
-new_stacks.
+new_stacks.emplace_back(stack);
 return;
 }

@@ -3632,7 +4125,7 @@ static void llama_grammar_advance_stack(
 }
 case LLAMA_GRETYPE_CHAR:
 case LLAMA_GRETYPE_CHAR_NOT:
-new_stacks.
+new_stacks.emplace_back(stack);
 break;
 default:
 // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +4290,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 delete grammar;
 }

+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+// redirect elements in stacks to point to new rules
+for (size_t is = 0; is < result->stacks.size(); is++) {
+for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+result->stacks[is][ie] = &result->rules[ir0][ir1];
+}
+}
+}
+}
+}
+
+return result;
+}
+
 //
 // sampling
 //
@@ -4388,7 +4900,7 @@ struct llama_logit_info {
 }
 return min_heap;
 }
-float probability_from_logit(float logit) {
+float probability_from_logit(float logit) const {
 return normalizer * std::exp(logit - max_l);
 }
 };
@@ -4581,7 +5093,16 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //

-
+template <typename T>
+struct no_init {
+T value;
+no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+const size_t nelements, const int nthread
+) {
 if (output.size() < nelements) {
 output.resize(nelements);
 }
@@ -4616,7 +5137,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
 auto blocks_per_thread = nblocks / nthread;
 auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

-std::vector<std::thread> workers;
 for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
 auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
 auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4629,14 +5149,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
 qtype.to_float(inbuf, outbuf, nels);
 }
 };
-workers.
+workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
 in_buff_offs += thr_block_bytes;
 out_buff_offs += thr_elems;
 }
-for (auto &
-
+for (auto & w : workers) { w.join(); }
+workers.clear();
+}
+
+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+const std::string name = ggml_get_name(tensor);
+// TODO: avoid hardcoded tensor names - use the TN_* constants
+const auto tn = LLM_TN(model.arch);
+
+auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+};
+
+if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+int nx = tensor->ne[0];
+if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+new_type = GGML_TYPE_Q8_0;
+}
+else if (new_type != GGML_TYPE_Q8_0) {
+new_type = GGML_TYPE_Q6_K;
+}
+} else if (name.find("attn_v.weight") != std::string::npos) {
+if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+(*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+if (model.type == MODEL_70B) {
+// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+// nearly negligible increase in model size by quantizing this tensor with more bits:
+if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+}
+++*i_attention_wv;
+} else if (name.find("ffn_down.weight") != std::string::npos) {
+if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+: model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+: GGML_TYPE_Q3_K;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+if (model.arch == LLM_ARCH_FALCON) {
+new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+} else {
+if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+}
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+new_type = GGML_TYPE_Q5_K;
+}
+++*i_feed_forward_w2;
+} else if (name.find("attn_output.weight") != std::string::npos) {
+if (model.arch != LLM_ARCH_FALCON) {
+if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+} else {
+if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+}
+}
+else if (name.find("attn_qkv.weight") != std::string::npos) {
+if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+}
+else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+}
+// This can be used to reduce the size of the Q5_K_S model.
+// The associated PPL increase is fully in line with the size reduction
+//else {
+// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+//}
+bool convert_incompatible_tensor = false;
+if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+int nx = tensor->ne[0];
+int ny = tensor->ne[1];
+if (nx % QK_K != 0) {
+LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+convert_incompatible_tensor = true;
+}
+}
+if (convert_incompatible_tensor) {
+if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+} else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+} else {
+throw std::runtime_error("Unsupported tensor size encountered\n");
+}
 }
+
+return new_type;
 }
+#endif

 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
 ggml_type quantized_type;
@@ -4678,6 +5307,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 llm_load_arch(*ml, model);
 llm_load_hparams(*ml, model, 0, 0, 0);

+if (params->only_copy) {
+ftype = model.ftype;
+}
+
 const size_t align = GGUF_DEFAULT_ALIGNMENT;
 struct gguf_context * ctx_out = gguf_init_empty();

@@ -4717,16 +5350,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 std::vector<int64_t> hist_all(1 << 4, 0);

 std::vector<std::thread> workers;
+workers.reserve(nthread);
 std::mutex mutex;

-auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-};
-
 int idx = 0;

-std::vector<uint8_t
-std::vector<uint8_t
+std::vector<no_init<uint8_t>> read_data;
+std::vector<no_init<uint8_t>> work;
+std::vector<no_init<float>> f32_conv_buf;

 // populate the original tensors so we get an initial meta data
 for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4748,7 +5379,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 const std::string name = ggml_get_name(tensor);

-read_data.
+if (read_data.size() < ggml_nbytes(tensor)) {
+read_data.resize(ggml_nbytes(tensor));
+}
 tensor->data = read_data.data();
 ml->load_data_for(tensor);

@@ -4764,137 +5397,50 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
4764
5397
|
// quantize only 2D tensors
|
4765
5398
|
quantize &= (tensor->n_dims == 2);
|
4766
5399
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
4767
|
-
quantize &=
|
5400
|
+
quantize &= !params->only_copy;
|
4768
5401
|
|
4769
5402
|
enum ggml_type new_type;
|
4770
5403
|
void * new_data;
|
4771
5404
|
size_t new_size;
|
4772
5405
|
|
5406
|
+
if (quantize) {
|
5407
|
+
new_type = quantized_type;
|
5408
|
+
#ifdef GGML_USE_K_QUANTS
|
5409
|
+
new_type = get_k_quant_type(
|
5410
|
+
new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
|
5411
|
+
);
|
5412
|
+
#endif
|
5413
|
+
// If we've decided to quantize to the same type the tensor is already
|
5414
|
+
// in then there's nothing to do.
|
5415
|
+
quantize = tensor->type != new_type;
|
5416
|
+
}
|
4773
5417
|
if (!quantize) {
|
4774
5418
|
new_type = tensor->type;
|
4775
5419
|
new_data = tensor->data;
|
4776
5420
|
new_size = ggml_nbytes(tensor);
|
4777
5421
|
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
4778
5422
|
} else {
|
4779
|
-
new_type = quantized_type;
|
4780
|
-
#ifdef GGML_USE_K_QUANTS
|
4781
|
-
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
4782
|
-
const auto tn = LLM_TN(ml->get_arch());
|
4783
|
-
|
4784
|
-
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
4785
|
-
int nx = tensor->ne[0];
|
4786
|
-
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
4787
|
-
new_type = GGML_TYPE_Q8_0;
|
4788
|
-
}
|
4789
|
-
else if (new_type != GGML_TYPE_Q8_0) {
|
4790
|
-
new_type = GGML_TYPE_Q6_K;
|
4791
|
-
}
|
4792
|
-
} else if (name.find("attn_v.weight") != std::string::npos) {
|
4793
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
4794
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
4795
|
-
new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
4796
|
-
}
|
4797
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
4798
|
-
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
4799
|
-
use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
4800
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
4801
|
-
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
4802
|
-
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
4803
|
-
if (model.type == MODEL_70B) {
|
4804
|
-
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
4805
|
-
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
4806
|
-
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
4807
|
-
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
|
4808
|
-
}
|
4809
|
-
++i_attention_wv;
|
4810
|
-
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
4811
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
4812
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
4813
|
-
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
|
4814
|
-
: model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
|
4815
|
-
: GGML_TYPE_Q3_K;
|
4816
|
-
}
|
4817
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
4818
|
-
-                new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                if (model.arch == LLM_ARCH_FALCON) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                               use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                } else {
-                    if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                }
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                new_type = GGML_TYPE_Q5_K;
-            }
-            ++i_feed_forward_w2;
-        } else if (name.find("attn_output.weight") != std::string::npos) {
-            if (model.arch != LLM_ARCH_FALCON) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-            } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-            }
-        }
-        else if (name.find("attn_qkv.weight") != std::string::npos) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-        }
-        else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        }
-        // This can be used to reduce the size of the Q5_K_S model.
-        // The associated PPL increase is fully in line with the size reduction
-        //else {
-        //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-        //}
-        bool convert_incompatible_tensor = false;
-        if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-            new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-            int nx = tensor->ne[0];
-            int ny = tensor->ne[1];
-            if (nx % QK_K != 0) {
-                LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                convert_incompatible_tensor = true;
-            }
-        }
-        if (convert_incompatible_tensor) {
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-            } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-            } else {
-                throw std::runtime_error("Unsupported tensor size encountered\n");
-            }
-        }
-#endif
-
         const size_t nelements = ggml_nelements(tensor);

         float * f32_data;
-        std::vector<float> f32_conv_buf;

         if (tensor->type == GGML_TYPE_F32) {
             f32_data = (float *) tensor->data;
         } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
         } else {
-            llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+            llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
             f32_data = (float *) f32_conv_buf.data();
         }

         LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
         fflush(stdout);

-        work.resize(nelements * 4); // upper bound on size
+        if (work.size() < nelements * 4) {
+            work.resize(nelements * 4); // upper bound on size
+        }
         new_data = work.data();
-        std::vector<int64_t> hist_cur(1 << 4, 0);
+        std::array<int64_t, 1 << 4> hist_cur = {};

         static const int chunk_size = 32 * 512;
         const int nchunk = (nelements + chunk_size - 1)/chunk_size;
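The block removed above is the inline k-quant type override that the 0.5.0 copy of llama.cpp kept inside llama_model_quantize_internal. Its compatibility rule is the part worth keeping in mind: k-quants require the row length (tensor->ne[0]) to be divisible by QK_K, and an incompatible tensor falls back to F16 (the output tensor) or Q4_0 (the token embeddings) rather than aborting. Below is a standalone sketch of that rule only; the toy_* names are illustrative, and the two tensor-name strings assume the "output"/"token_embd" naming used by this file's tensor tables.

    #include <cstdint>
    #include <cstdio>
    #include <string>

    enum toy_type { TOY_Q4_K, TOY_F16, TOY_Q4_0 };

    static const int64_t TOY_QK_K = 256; // k-quant super-block size

    // Mirror of the fallback rule: keep the wanted k-quant only when the row
    // length divides evenly; otherwise special-case the two named tensors.
    static toy_type pick_type(const std::string & name, int64_t ne0, toy_type wanted) {
        if (ne0 % TOY_QK_K == 0) {
            return wanted;   // compatible with k-quants
        }
        if (name == "output.weight") {
            return TOY_F16;  // output tensor falls back to F16
        }
        if (name == "token_embd.weight") {
            return TOY_Q4_0; // token embeddings fall back to Q4_0
        }
        return wanted;       // the removed code throws std::runtime_error here instead
    }

    int main() {
        // 4096 is a multiple of 256, 1000 is not
        std::printf("%d %d\n",
                    pick_type("blk.0.ffn_up.weight", 4096, TOY_Q4_K),
                    pick_type("token_embd.weight",   1000, TOY_Q4_K));
        return 0;
    }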
@@ -4905,13 +5451,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             size_t counter = 0;
             new_size = 0;
             auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                std::vector<int64_t> local_hist;
+                std::array<int64_t, 1 << 4> local_hist = {};
                 size_t local_size = 0;
                 while (true) {
                     std::unique_lock<std::mutex> lock(mutex);
                     size_t first = counter; counter += chunk_size;
                     if (first >= nelements) {
-                        if (!local_hist.empty()) {
+                        if (local_size > 0) {
                             for (int j=0; j<int(local_hist.size()); ++j) {
                                 hist_cur[j] += local_hist[j];
                             }
@@ -4921,22 +5467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         }
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) {
-                            local_hist.resize(hist_cur.size(), 0);
-                        }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if ((int) workers.size() < nthread_use - 1) {
-                    workers.resize(nthread_use - 1);
-                }
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it] = std::thread(compute);
+                    workers.emplace_back(compute);
                 }
                 compute();
-                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it].join();
-                }
+                for (auto & w : workers) { w.join(); }
+                workers.clear();
             }

             LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
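Taken together, the hunks above rework the threaded quantization loop: the histograms become fixed-size std::array values instead of lazily grown vectors, the per-thread merge is gated on local_size rather than on the histogram being non-empty, worker threads are pushed with emplace_back and joined and cleared after every tensor, and the shared work buffer is only resized when it is too small. The following is a minimal, self-contained sketch of the same chunked-worker pattern; the bucket counting stands in for ggml_quantize_chunk and every name here is illustrative.

    #include <algorithm>
    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
        const size_t nelements  = 1000000;
        const size_t chunk_size = 32 * 512;
        const int    nthread    = 4;

        std::array<int64_t, 1 << 4> hist = {}; // shared histogram (16 buckets)
        size_t     counter = 0;                // start index of the next chunk
        size_t     total   = 0;                // elements processed in total
        std::mutex mutex;

        auto compute = [&]() {
            std::array<int64_t, 1 << 4> local_hist = {};
            size_t local_total = 0;
            while (true) {
                std::unique_lock<std::mutex> lock(mutex);
                size_t first = counter; counter += chunk_size;
                if (first >= nelements) {
                    // merge this thread's partial results while still holding the lock
                    for (size_t j = 0; j < hist.size(); ++j) hist[j] += local_hist[j];
                    total += local_total;
                    break;
                }
                lock.unlock();
                size_t last = std::min(nelements, first + chunk_size);
                // stand-in for ggml_quantize_chunk(): bucket-count the chunk locally
                for (size_t i = first; i < last; ++i) local_hist[i % local_hist.size()]++;
                local_total += last - first;
            }
        };

        std::vector<std::thread> workers;
        for (int it = 0; it < nthread - 1; ++it) workers.emplace_back(compute);
        compute();       // the calling thread works too
        for (auto & w : workers) w.join();
        workers.clear(); // ready to be reused for the next tensor

        std::printf("processed %zu of %zu elements\n", total, nelements);
        return 0;
    }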
@@ -5279,7 +5818,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
-        /*.
+        /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.rope_freq_base =*/ 10000.0f,
@@ -5296,6 +5835,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding =*/ false,
     };

+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }

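With the hunk above, builds compiled with GGML_USE_METAL now leave llama_context_default_params() with n_gpu_layers set to 1, so Metal offload is on by default and a caller has to zero the field to stay on the CPU. A small sketch of a caller working against that default; it assumes nothing beyond the bundled llama.h and the field visible above.

    #include "llama.h"

    static struct llama_context_params make_params(bool force_cpu) {
        struct llama_context_params params = llama_context_default_params();
        // On GGML_USE_METAL builds the default is now n_gpu_layers = 1 (GPU on);
        // zeroing it restores the old CPU-only behaviour.
        if (force_cpu) {
            params.n_gpu_layers = 0;
        }
        return params;
    }

    int main() {
        struct llama_context_params params = make_params(/*force_cpu =*/ false);
        return params.n_gpu_layers >= 0 ? 0 : 1; // trivial use so the sketch stands alone
    }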
@@ -5305,6 +5848,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize =*/ false,
         /*.quantize_output_tensor =*/ true,
+        /*.only_copy =*/ false,
     };

     return result;
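The quantization defaults gain an only_copy field, initialized to false. A hedged sketch of how these defaults are usually filled in and handed to llama_model_quantize(); the entry point, the nthread field and the file names are assumptions based on the bundled llama.h, not something shown in this hunk.

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target type
        params.nthread = 4;                         // assumed field; values <= 0 usually let the library pick
        // params.only_copy = true;                 // new field shown above (default false)
        const int rc = llama_model_quantize("input.gguf", "output.q4_k_m.gguf", &params);
        std::printf("llama_model_quantize returned %d\n", rc);
        return rc;
    }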
@@ -5487,43 +6031,43 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
     }
-    }

 #ifdef GGML_USE_METAL
-
-
+        if (params.n_gpu_layers > 0) {
+            // this allocates all Metal resources and memory buffers

-
-
+            void * data_ptr = NULL;
+            size_t data_size = 0;

-
-
-
-
-
-
-
+            if (params.use_mmap) {
+                data_ptr = ctx->model.mapping->addr;
+                data_size = ctx->model.mapping->size;
+            } else {
+                data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+                data_size = ggml_get_mem_size (ctx->model.ctx);
+            }

-
+            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result) \
-
-
-
-
-
+            if (!(result)) { \
+                LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+                llama_free(ctx); \
+                return NULL; \
+            }

-
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

-
-
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));

-
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-
+        }
 #endif
+    }

 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
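The re-indented Metal block keeps the LLAMA_METAL_CHECK_BUF pattern: each ggml_metal_add_buffer call is wrapped in a macro that logs the failure, frees the partially constructed context and returns NULL. A toy, self-contained illustration of that check-and-bail shape; none of the toy_* names are part of llama.cpp.

    #include <cstdio>
    #include <cstdlib>

    struct toy_ctx { int n_buffers; };

    static void toy_free(toy_ctx * ctx) { std::free(ctx); }

    static bool toy_add_buffer(toy_ctx * ctx, const char * name) {
        std::printf("registering buffer '%s'\n", name);
        ctx->n_buffers++;
        return true; // pretend registration succeeded
    }

    // Same shape as LLAMA_METAL_CHECK_BUF above: log, free the half-built context, bail out.
    #define TOY_CHECK_BUF(result) \
        if (!(result)) { \
            std::fprintf(stderr, "%s: failed to add buffer\n", __func__); \
            toy_free(ctx); \
            return NULL; \
        }

    static toy_ctx * toy_new_context() {
        toy_ctx * ctx = (toy_ctx *) std::calloc(1, sizeof(toy_ctx));
        TOY_CHECK_BUF(ctx != NULL);
        TOY_CHECK_BUF(toy_add_buffer(ctx, "data"));
        TOY_CHECK_BUF(toy_add_buffer(ctx, "kv"));
        return ctx;
    }
    #undef TOY_CHECK_BUF

    int main() {
        toy_ctx * ctx = toy_new_context();
        const bool ok = ctx != NULL;
        std::free(ctx);
        return ok ? 0 : 1;
    }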
@@ -5559,15 +6103,19 @@ void llama_free(struct llama_context * ctx) {
 }

 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_vocab(&ctx->model);
 }

 int llama_n_ctx(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_ctx(&ctx->model);
+}
+
+int llama_n_ctx_train(const struct llama_context * ctx) {
+    return llama_model_n_ctx_train(&ctx->model);
 }

 int llama_n_embd(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_embd(&ctx->model);
 }

 enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5582,6 +6130,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
     return model->hparams.n_ctx;
 }

+int llama_model_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
 int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
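The new llama_n_ctx_train and llama_model_n_ctx_train getters expose the context length the model was trained with, next to the runtime n_ctx. A compile-only sketch of the obvious consumer, comparing the two values; it assumes only the bundled llama.h.

    #include "llama.h"

    // True when the context requested at runtime is larger than the context the
    // model was trained with; a caller might warn or clamp in that case.
    bool exceeds_training_context(const struct llama_context * ctx) {
        return llama_n_ctx(ctx) > llama_n_ctx_train(ctx);
    }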
@@ -5857,7 +6409,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         rng_ss.str(std::string(&rng_buf[0], rng_size));
         rng_ss >> ctx->rng;

-        GGML_ASSERT(rng_ss.fail()
+        GGML_ASSERT(!rng_ss.fail());
     }

     // set logits
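The assertion above guards restoring the sampling RNG from its serialized text form; stream extraction sets fail() when the saved state cannot be parsed back. A standalone sketch of that round trip using the standard Mersenne Twister, the same kind of generator llama.cpp keeps in ctx->rng.

    #include <cassert>
    #include <cstdio>
    #include <random>
    #include <sstream>

    int main() {
        std::mt19937 rng(1234);

        std::ostringstream out;
        out << rng;         // the generator state serializes as text

        std::istringstream in(out.str());
        std::mt19937 restored;
        in >> restored;     // restore it from the string
        assert(!in.fail()); // same condition the GGML_ASSERT above checks

        std::printf("states equal: %s\n", rng == restored ? "yes" : "no");
        return 0;
    }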
@@ -6136,7 +6688,7 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize_internal(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -((int) res.size());
     }

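Instead of logging, the function above now just reports an over-long input by returning the negative of the required token count. A sketch of the matching resize-and-retry idiom on the caller side; it assumes the bundled llama.h declares llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos) in that argument order.

    #include "llama.h"

    #include <string>
    #include <vector>

    // Tokenize with a deliberately small buffer first, then grow to the size the
    // negative return value reports and retry.
    std::vector<llama_token> tokenize(const struct llama_model * model,
                                      const std::string & text, bool add_bos) {
        std::vector<llama_token> tokens(8);
        int n = llama_tokenize_with_model(model, text.c_str(), tokens.data(),
                                          (int) tokens.size(), add_bos);
        if (n < 0) {
            tokens.resize(-n); // -n is the number of tokens actually needed
            n = llama_tokenize_with_model(model, text.c_str(), tokens.data(),
                                          (int) tokens.size(), add_bos);
        }
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }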