llama_cpp 0.5.1 → 0.5.2
This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +0 -5
- data/ext/llama_cpp/src/ggml-cuda.cu +1011 -655
- data/ext/llama_cpp/src/ggml-metal.m +57 -15
- data/ext/llama_cpp/src/ggml-metal.metal +271 -137
- data/ext/llama_cpp/src/ggml.c +7 -3
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +617 -141
- data/ext/llama_cpp/src/llama.h +8 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,8 +1,3 @@
-// Defines fileno on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "llama.h"

 #include "ggml.h"
@@ -160,6 +155,7 @@ static std::string format(const char * fmt, ...) {
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_FALCON,
+    LLM_ARCH_BAICHUAN,
     LLM_ARCH_GPT2,
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
@@ -174,6 +170,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_GPTJ, "gptj" },
     { LLM_ARCH_GPTNEOX, "gptneox" },
     { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
 };

 enum llm_kv {
@@ -314,6 +311,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BAICHUAN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_FALCON,
         {
@@ -658,15 +674,12 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
-
             WIN32_MEMORY_RANGE_ENTRY range;
-
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
-            }
             }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1685,6 +1698,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     };

@@ -1925,7 +1947,6 @@ static void llm_load_tensors(
     const int64_t n_vocab = hparams.n_vocab;

     const auto tn = LLM_TN(model.arch);
-
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -1968,6 +1989,72 @@ static void llm_load_tensors(

                 model.layers.resize(n_layer);

+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
                 for (uint32_t i = 0; i < n_layer; ++i) {
                     const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                     const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2544,6 +2631,367 @@ static struct ggml_cgraph * llm_build_llama(
     return gf;
 }

+
+static struct ggml_cgraph * llm_build_baichaun(
+         llama_context & lctx,
+     const llama_token * tokens,
+           const float * embd,
+                   int   n_tokens,
+                   int   n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = hparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float freq_base    = hparams.rope_freq_base;
+    const float freq_scale   = hparams.rope_freq_scale;
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
+
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
+
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Qcur;
+            switch (model.type) {
+                case MODEL_7B:
+                    Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    break;
+                case MODEL_13B:
+                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
+                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
+
+            offload_func_kq(Qcur);
+            ggml_set_name(Qcur, "Qcur");
+
+            // store key and value to memory
+            {
+                // compute the transposed [N, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                // important: storing RoPE-ed version of K in the KV cache!
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_masked;
+            struct ggml_tensor * KQ_scaled_alibi;
+
+            switch (model.type) {
+                case MODEL_7B:
+                    KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+                    break;
+                case MODEL_13B:
+                    KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+                    ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+                    KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+            // KQ_masked = mask_past(KQ_scaled)
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+            // offload_func_kq(KQ_masked);
+            // ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
+
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llm_build_falcon(
          llama_context & lctx,
      const llama_token * tokens,
@@ -2866,6 +3314,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         case LLM_ARCH_FALCON:
             {
                 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -3123,10 +3575,9 @@ struct llm_tokenizer_spm {
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs +=
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
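Note: the tokenizer hunk above replaces a hard assertion with a clamp, so a multi-byte UTF-8 sequence cut off at the end of the input no longer aborts or reads past the buffer. A minimal sketch of the same clamping idea follows; the simplified `utf8_len` helper here is an assumption used only for illustration, not the gem's own code.

    #include <algorithm>
    #include <cstdint>
    #include <string>

    // Assumed helper: byte-length of a UTF-8 sequence, derived from its first byte.
    static size_t utf8_len(char src) {
        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
        return lookup[static_cast<uint8_t>(src) >> 4];
    }

    int main() {
        std::string text = "abc\xE3";                      // truncated 3-byte sequence at the end
        size_t offs = 3;
        size_t len  = utf8_len(text[offs]);                // reports 3 bytes
        size_t n    = std::min(len, text.size() - offs);   // clamped to 1: no out-of-range read
        offs += n;
        return offs == text.size() ? 0 : 1;                // loop would now terminate cleanly
    }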
@@ -4642,7 +5093,16 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //

-
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
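Note: the `no_init<T>` wrapper introduced above gives the conversion and quantization scratch buffers a default constructor that deliberately leaves the element uninitialized, so resizing a large buffer does not zero-fill memory that is about to be overwritten anyway. A small sketch of the effect, under the assumption that the buffer is filled immediately afterwards (the timing harness is illustrative only):

    #include <chrono>
    #include <cstdio>
    #include <vector>

    template <typename T>
    struct no_init {
        T value;
        no_init() { /* deliberately left uninitialized */ }
    };

    int main() {
        const size_t n = 64 * 1024 * 1024; // 64M floats, ~256 MiB

        auto t0 = std::chrono::steady_clock::now();
        std::vector<float> zeroed(n);            // value-initializes (zero-fills) every element
        auto t1 = std::chrono::steady_clock::now();
        std::vector<no_init<float>> raw(n);      // user-provided default ctor: no zero fill
        auto t2 = std::chrono::steady_clock::now();

        using ms = std::chrono::milliseconds;
        std::printf("zero-filled: %lld ms, uninitialized: %lld ms\n",
            (long long) std::chrono::duration_cast<ms>(t1 - t0).count(),
            (long long) std::chrono::duration_cast<ms>(t2 - t1).count());
        return 0;
    }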
@@ -4677,7 +5137,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

-    std::vector<std::thread> workers;
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4690,14 +5149,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
             qtype.to_float(inbuf, outbuf, nels);
         }
     };
-        workers.
+    workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
-    for (auto &
-
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+}
+
+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
     }
+
+    return new_type;
 }
+#endif

 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
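Note: the `use_more_bits` predicate moved into `get_k_quant_type` above keeps extra precision for the first eighth of layers, the last eighth, and every third layer in between. A quick standalone check of that rule for a hypothetical 32-layer (7B-class) model:

    #include <cstdio>

    // Same predicate as in get_k_quant_type above.
    static bool use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    }

    int main() {
        const int n_layer = 32; // illustrative layer count
        for (int i = 0; i < n_layer; ++i) {
            if (use_more_bits(i, n_layer)) {
                std::printf("%d ", i); // layers that get the higher-bit quant type
            }
        }
        std::printf("\n"); // prints 0-3, 28-31, and every third layer from 6 to 27
        return 0;
    }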
@@ -4782,18 +5350,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);

     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;

-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;

-    std::vector<uint8_t
-    std::vector<uint8_t
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;

     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4815,7 +5379,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

         const std::string name = ggml_get_name(tensor);

-        read_data.
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);

@@ -4840,101 +5406,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-
-
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -4949,23 +5423,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);

             float * f32_data;
-            std::vector<float> f32_conv_buf;

             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }

             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);

-            work.
+            if (work.size() < nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
             new_data = work.data();
-            std::
+            std::array<int64_t, 1 << 4> hist_cur = {};

             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4976,13 +5451,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t counter = 0;
                 new_size = 0;
                 auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                    std::
+                    std::array<int64_t, 1 << 4> local_hist = {};
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
                         size_t first = counter; counter += chunk_size;
                         if (first >= nelements) {
-                            if (
+                            if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
                                 }
@@ -4992,22 +5467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         }
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) {
-                            local_hist.resize(hist_cur.size(), 0);
-                        }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if ((int) workers.size() < nthread_use - 1) {
-                    workers.resize(nthread_use - 1);
-                }
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers
+                    workers.emplace_back(compute);
                 }
                 compute();
-                for (
-
-                }
+                for (auto & w : workers) { w.join(); }
+                workers.clear();
             }

             LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
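Note: the quantization loop above now reuses a single `std::vector<std::thread>` per tensor: helper threads are added with `emplace_back`, the main thread runs the same `compute` job once itself, and afterwards everything is joined and the vector cleared for the next tensor. A stripped-down sketch of that pattern, with a stand-in `compute` body:

    #include <atomic>
    #include <thread>
    #include <vector>

    int main() {
        const int nthread_use = 4;
        std::atomic<long> total{0};

        // Stand-in for the per-chunk quantization job.
        auto compute = [&total]() {
            for (int i = 0; i < 1000; ++i) total += i;
        };

        std::vector<std::thread> workers;
        workers.reserve(nthread_use - 1);
        for (int it = 0; it < nthread_use - 1; ++it) {
            workers.emplace_back(compute);      // nthread_use - 1 helper threads
        }
        compute();                              // main thread also does a share of the work
        for (auto & w : workers) { w.join(); }
        workers.clear();                        // vector is reused for the next tensor

        return total == 4L * 499500 ? 0 : 1;    // 4 runs of sum(0..999)
    }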
@@ -5635,15 +6103,19 @@ void llama_free(struct llama_context * ctx) {
 }

 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_vocab(&ctx->model);
 }

 int llama_n_ctx(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_ctx(&ctx->model);
+}
+
+int llama_n_ctx_train(const struct llama_context * ctx) {
+    return llama_model_n_ctx_train(&ctx->model);
 }

 int llama_n_embd(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_embd(&ctx->model);
 }

 enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5658,6 +6130,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
     return model->hparams.n_ctx;
 }

+int llama_model_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
 int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
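Note: the new `llama_n_ctx_train` / `llama_model_n_ctx_train` accessors expose the context length the model was trained with, alongside the context size the current context was created with. A hedged C++ sketch of how a caller of the bundled llama.h might compare the two; it assumes `model` and `ctx` were already created with the usual loader calls and exercises only the accessors shown in this diff:

    #include "llama.h"

    #include <cstdio>

    // Assumes an already-loaded model and context; only the accessors added here are used.
    static void report_ctx_sizes(const struct llama_model * model, const struct llama_context * ctx) {
        const int n_ctx       = llama_n_ctx(ctx);               // context size of this llama_context
        const int n_ctx_train = llama_model_n_ctx_train(model); // context size used during training

        if (n_ctx > n_ctx_train) {
            std::printf("warning: n_ctx (%d) exceeds the training context (%d)\n", n_ctx, n_ctx_train);
        } else {
            std::printf("n_ctx = %d, n_ctx_train = %d\n", n_ctx, n_ctx_train);
        }
    }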
@@ -6212,7 +6688,7 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize_internal(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
