llama_cpp 0.5.1 → 0.5.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +0 -5
- data/ext/llama_cpp/src/ggml-cuda.cu +1011 -655
- data/ext/llama_cpp/src/ggml-metal.m +57 -15
- data/ext/llama_cpp/src/ggml-metal.metal +271 -137
- data/ext/llama_cpp/src/ggml.c +7 -3
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +617 -141
- data/ext/llama_cpp/src/llama.h +8 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,8 +1,3 @@
-// Defines fileno on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "llama.h"
 
 #include "ggml.h"
@@ -160,6 +155,7 @@ static std::string format(const char * fmt, ...) {
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_FALCON,
+    LLM_ARCH_BAICHUAN,
     LLM_ARCH_GPT2,
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
@@ -174,6 +170,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_GPTJ,     "gptj"     },
     { LLM_ARCH_GPTNEOX,  "gptneox"  },
     { LLM_ARCH_MPT,      "mpt"      },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
 };
 
 enum llm_kv {
@@ -314,6 +311,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BAICHUAN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ROPE_FREQS,    "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_FALCON,
         {
@@ -658,15 +674,12 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
-
             WIN32_MEMORY_RANGE_ENTRY range;
-
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
-            }
             }
         }
 #else
         #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1685,6 +1698,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     };
 
@@ -1925,7 +1947,6 @@ static void llm_load_tensors(
     const int64_t n_vocab = hparams.n_vocab;
 
     const auto tn = LLM_TN(model.arch);
-
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -1968,6 +1989,72 @@ static void llm_load_tensors(
 
                 model.layers.resize(n_layer);
 
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd},     backend_split);
+                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.w1)        + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
                 for (uint32_t i = 0; i < n_layer; ++i) {
                     const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                     const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2544,6 +2631,367 @@ static struct ggml_cgraph * llm_build_llama(
     return gf;
 }
 
+
+static struct ggml_cgraph * llm_build_baichaun(
+         llama_context & lctx,
+     const llama_token * tokens,
+           const float * embd,
+                   int   n_tokens,
+                   int   n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = hparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float freq_base    = hparams.rope_freq_base;
+    const float freq_scale   = hparams.rope_freq_scale;
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
+
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
+
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Qcur;
+            switch (model.type) {
+                case MODEL_7B:
+                    Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    break;
+                case MODEL_13B:
+                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
+                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
+
+            offload_func_kq(Qcur);
+            ggml_set_name(Qcur, "Qcur");
+
+            // store key and value to memory
+            {
+                // compute the transposed [N, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                // important: storing RoPE-ed version of K in the KV cache!
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_masked;
+            struct ggml_tensor * KQ_scaled_alibi;
+
+            switch (model.type) {
+                case MODEL_7B:
+                    KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+                    break;
+                case MODEL_13B:
+                    KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+                    ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+                    KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+            // KQ_masked = mask_past(KQ_scaled)
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+            // offload_func_kq(KQ_masked);
+            // ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
+
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llm_build_falcon(
          llama_context & lctx,
      const llama_token * tokens,
@@ -2866,6 +3314,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         case LLM_ARCH_FALCON:
             {
                 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -3123,10 +3575,9 @@ struct llm_tokenizer_spm {
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs +=
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
@@ -4642,7 +5093,16 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
-
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
@@ -4677,7 +5137,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    std::vector<std::thread> workers;
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4690,14 +5149,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
            qtype.to_float(inbuf, outbuf, nels);
         }
     };
-    workers.
+    workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
     in_buff_offs += thr_block_bytes;
     out_buff_offs += thr_elems;
     }
-    for (auto &
-
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+}
+
+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
     }
+
+    return new_type;
 }
+#endif
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
@@ -4782,18 +5350,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
-    std::vector<uint8_t
-    std::vector<uint8_t
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4815,7 +5379,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        read_data.
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
@@ -4840,101 +5406,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-
-
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -4949,23 +5423,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
-            std::vector<float> f32_conv_buf;
 
             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            work.
+            if (work.size() < nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
             new_data = work.data();
-            std::
+            std::array<int64_t, 1 << 4> hist_cur = {};
 
             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4976,13 +5451,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t counter = 0;
                 new_size = 0;
                 auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                    std::
+                    std::array<int64_t, 1 << 4> local_hist = {};
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
                         size_t first = counter; counter += chunk_size;
                         if (first >= nelements) {
-                            if (
+                            if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
                                 }
@@ -4992,22 +5467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         }
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) {
-                            local_hist.resize(hist_cur.size(), 0);
-                        }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if ((int) workers.size() < nthread_use - 1) {
-                    workers.resize(nthread_use - 1);
-                }
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers
+                    workers.emplace_back(compute);
                 }
                 compute();
-                for (
-
-                }
+                for (auto & w : workers) { w.join(); }
+                workers.clear();
             }
 
             LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -5635,15 +6103,19 @@ void llama_free(struct llama_context * ctx) {
 }
 
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_vocab(&ctx->model);
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_ctx(&ctx->model);
+}
+
+int llama_n_ctx_train(const struct llama_context * ctx) {
+    return llama_model_n_ctx_train(&ctx->model);
 }
 
 int llama_n_embd(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_embd(&ctx->model);
 }
 
 enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5658,6 +6130,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
     return model->hparams.n_ctx;
 }
 
+int llama_model_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
 int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
@@ -6212,7 +6688,7 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize_internal(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -((int) res.size());
     }
 
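Among the hunks above, the bundled llama.cpp gains two training-context getters, llama_n_ctx_train and llama_model_n_ctx_train, next to the existing llama_n_ctx/llama_model_n_ctx. A minimal C++ sketch of calling them through the bundled C API follows; the model path is a placeholder, error handling is kept to the minimum, and this targets the embedded llama.h rather than the gem's Ruby API.

// sketch: query the context sizes exposed by the updated llama.h
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init(false /* numa */);

    llama_context_params params = llama_context_default_params();

    // hypothetical path for illustration only
    llama_model * model = llama_load_model_from_file("/path/to/model.gguf", params);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(model, params);

    printf("n_ctx       = %d\n", llama_n_ctx(ctx));                // context size requested for this session
    printf("n_ctx_train = %d\n", llama_n_ctx_train(ctx));          // new getter via the context
    printf("n_ctx_train = %d\n", llama_model_n_ctx_train(model));  // new getter via the model

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

The distinction the new getters expose is that n_ctx is the context length the caller configured, while n_ctx_train reports the context length stored in the model's metadata (hparams.n_ctx_train), i.e. what the model was trained with.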