llama_cpp 0.5.1 → 0.5.2

@@ -1,8 +1,3 @@
-// Defines fileno on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "llama.h"
 
 #include "ggml.h"
@@ -160,6 +155,7 @@ static std::string format(const char * fmt, ...) {
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_FALCON,
+    LLM_ARCH_BAICHUAN,
     LLM_ARCH_GPT2,
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
@@ -174,6 +170,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_GPTJ, "gptj" },
     { LLM_ARCH_GPTNEOX, "gptneox" },
     { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
 };
 
 enum llm_kv {
@@ -314,6 +311,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BAICHUAN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_FALCON,
         {
@@ -658,15 +674,12 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
-
             WIN32_MEMORY_RANGE_ENTRY range;
-
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
-            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1685,6 +1698,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     };
 
@@ -1925,7 +1947,6 @@ static void llm_load_tensors(
     const int64_t n_vocab = hparams.n_vocab;
 
     const auto tn = LLM_TN(model.arch);
-
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -1968,6 +1989,72 @@ static void llm_load_tensors(
 
                 model.layers.resize(n_layer);
 
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
                 for (uint32_t i = 0; i < n_layer; ++i) {
                     const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                     const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
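Note on the layer split used in the hunk above (illustrative, not part of the diff): with i_gpu_start = n_layer - n_gpu_layers, only the last n_gpu_layers layers land on the GPU backend. A minimal sketch of that arithmetic with made-up counts:

    // sketch: which layers end up offloaded for n_layer = 32, n_gpu_layers = 20
    #include <cstdio>

    int main() {
        const int n_layer      = 32;
        const int n_gpu_layers = 20;
        const int i_gpu_start  = n_layer - n_gpu_layers; // 12

        for (int i = 0; i < n_layer; ++i) {
            // mirrors: int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD
            std::printf("layer %2d -> %s\n", i, i < i_gpu_start ? "CPU" : "GPU");
        }
        return 0;
    }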
@@ -2544,6 +2631,367 @@ static struct ggml_cgraph * llm_build_llama(
     return gf;
 }
 
+
+static struct ggml_cgraph * llm_build_baichaun(
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        int n_tokens,
+        int n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float freq_base = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
+
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
+
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Qcur;
+            switch (model.type) {
+                case MODEL_7B:
+                    Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    break;
+                case MODEL_13B:
+                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
+                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
+
+            offload_func_kq(Qcur);
+            ggml_set_name(Qcur, "Qcur");
+
+            // store key and value to memory
+            {
+                // compute the transposed [N, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        ( n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                // important: storing RoPE-ed version of K in the KV cache!
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_masked;
+            struct ggml_tensor * KQ_scaled_alibi;
+
+            switch (model.type) {
+                case MODEL_7B:
+                    KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+                    break;
+                case MODEL_13B:
+                    KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+                    ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+                    KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+            // KQ_masked = mask_past(KQ_scaled)
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+            // offload_func_kq(KQ_masked);
+            // ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
+
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llm_build_falcon(
         llama_context & lctx,
         const llama_token * tokens,
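The graph builder added above distinguishes the two Baichuan variants: the 7B path applies rotary position embeddings (ggml_rope_custom_inplace) to Q and K, while the 13B path skips RoPE and instead biases the scaled attention scores with ALiBi via ggml_alibi(..., n_head, 8). For orientation only, here is a sketch of the published ALiBi head-slope formula for a power-of-two head count; this is not ggml's exact implementation, and for non-power-of-two counts (Baichuan-13B uses 40 heads) both the paper and ggml fall back to an interpolated scheme that is omitted here:

    // sketch: ALiBi slopes m_h = 2^(-max_bias*(h+1)/n_head) for a power-of-two n_head;
    // the bias added to the score of query i and key j is roughly m_h * (j - i)
    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 32;   // illustrative power-of-two head count
        const float max_bias = 8.0f; // matches the last argument passed to ggml_alibi in the diff

        for (int h = 0; h < n_head; ++h) {
            const float slope = std::pow(2.0f, -max_bias * float(h + 1) / float(n_head));
            std::printf("head %2d slope %.6f\n", h, slope);
        }
        return 0;
    }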
@@ -2866,6 +3314,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         case LLM_ARCH_FALCON:
             {
                 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -3123,10 +3575,9 @@ struct llm_tokenizer_spm {
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs += len;
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
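The tokenizer change above replaces a hard assert with a clamp: if the last byte of the input starts a multi-byte UTF-8 sequence that is cut off, sym.n is limited to the bytes actually present instead of aborting. A minimal sketch of the same idea, assuming a utf8_len helper equivalent to the one in llama.cpp (sequence length taken from the lead byte):

    // sketch: clamping a UTF-8 sequence length at the end of a truncated string
    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    // length of the UTF-8 sequence starting with this byte (assumed equivalent to llama.cpp's utf8_len)
    static size_t utf8_len(char src) {
        const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
        const uint8_t highbits = static_cast<uint8_t>(src) >> 4;
        return lookup[highbits];
    }

    int main() {
        std::string text = "abc\xE2\x82"; // "abc" followed by a truncated 3-byte sequence
        size_t offs = 3;                  // positioned on the truncated lead byte
        const size_t len = utf8_len(text[offs]);              // reports 3
        const size_t n   = std::min(len, text.size() - offs); // clamped to 2, never reads past the end
        std::printf("reported len = %zu, clamped len = %zu\n", len, n);
        return 0;
    }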
@@ -4642,7 +5093,16 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
-static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
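A note on the no_init wrapper introduced above (illustrative, not from the diff): because its default constructor is user-provided and leaves value untouched, std::vector<no_init<float>>::resize() does not zero-fill the new elements, so large scratch buffers that are about to be overwritten are not touched twice. A minimal sketch of the idea:

    // sketch: resize without zero-filling, using a wrapper like the no_init<T> added in this release
    #include <chrono>
    #include <cstdio>
    #include <vector>

    template <typename T>
    struct no_init {
        T value;
        no_init() { /* intentionally leave value uninitialized */ }
    };

    template <typename V>
    static double time_resize(std::size_t n) {
        const auto t0 = std::chrono::steady_clock::now();
        V v;
        v.resize(n); // plain float: zero-fills; no_init<float>: runs the empty constructor instead
        const auto t1 = std::chrono::steady_clock::now();
        return std::chrono::duration<double, std::milli>(t1 - t0).count();
    }

    int main() {
        const std::size_t n = 50'000'000; // ~200 MB of floats
        std::printf("std::vector<float>          resize: %8.2f ms\n", time_resize<std::vector<float>>(n));
        std::printf("std::vector<no_init<float>> resize: %8.2f ms\n", time_resize<std::vector<no_init<float>>>(n));
        return 0;
    }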
@@ -4677,7 +5137,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    std::vector<std::thread> workers;
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4690,14 +5149,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
                 qtype.to_float(inbuf, outbuf, nels);
             }
         };
-        workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
-    for (auto & worker : workers) {
-        worker.join();
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+}
+
+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
     }
+
+    return new_type;
 }
+#endif
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
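The use_more_bits heuristic that moved into get_k_quant_type above keeps extra precision for the first eighth of the layers, the last eighth, and every third layer in between. A quick standalone check of which layer indices that selects (sketch, not part of the diff):

    // sketch: layers selected by the use_more_bits heuristic for a 32-layer model
    #include <cstdio>

    static bool use_more_bits(int i_layer, int num_layers) {
        // same expression as the lambda inside get_k_quant_type
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    }

    int main() {
        const int num_layers = 32; // e.g. a 7B LLaMA-style model
        for (int i = 0; i < num_layers; ++i) {
            if (use_more_bits(i, num_layers)) {
                std::printf("%d ", i); // prints 0-3, then 6, 9, ..., 27, then 28-31
            }
        }
        std::printf("\n");
        return 0;
    }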
@@ -4782,18 +5350,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
-    std::vector<uint8_t> read_data;
-    std::vector<uint8_t> work;
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4815,7 +5379,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        read_data.resize(ggml_nbytes(tensor));
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
@@ -4840,101 +5406,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            // TODO: avoid hardcoded tensor names - use the TN_* constants
-            const auto tn = LLM_TN(ml->get_arch());
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -4949,23 +5423,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
-            std::vector<float> f32_conv_buf;
 
             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            work.resize(nelements * 4); // upper bound on size
+            if (work.size() < nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
             new_data = work.data();
-            std::vector<int64_t> hist_cur(1 << 4, 0);
+            std::array<int64_t, 1 << 4> hist_cur = {};
 
             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4976,13 +5451,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t counter = 0;
                 new_size = 0;
                 auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                    std::vector<int64_t> local_hist;
+                    std::array<int64_t, 1 << 4> local_hist = {};
                     size_t local_size = 0;
                     while (true) {
                         std::unique_lock<std::mutex> lock(mutex);
                         size_t first = counter; counter += chunk_size;
                         if (first >= nelements) {
-                            if (!local_hist.empty()) {
+                            if (local_size > 0) {
                                 for (int j=0; j<int(local_hist.size()); ++j) {
                                     hist_cur[j] += local_hist[j];
                                 }
@@ -4992,22 +5467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         }
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) {
-                            local_hist.resize(hist_cur.size(), 0);
-                        }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if ((int) workers.size() < nthread_use - 1) {
-                    workers.resize(nthread_use - 1);
-                }
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it] = std::thread(compute);
+                    workers.emplace_back(compute);
                 }
                 compute();
-                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers[it].join();
-                }
+                for (auto & w : workers) { w.join(); }
+                workers.clear();
             }
 
             LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -5635,15 +6103,19 @@ void llama_free(struct llama_context * ctx) {
 }
 
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->model.vocab.id_to_token.size();
+    return llama_model_n_vocab(&ctx->model);
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
-    return ctx->model.hparams.n_ctx;
+    return llama_model_n_ctx(&ctx->model);
+}
+
+int llama_n_ctx_train(const struct llama_context * ctx) {
+    return llama_model_n_ctx_train(&ctx->model);
 }
 
 int llama_n_embd(const struct llama_context * ctx) {
-    return ctx->model.hparams.n_embd;
+    return llama_model_n_embd(&ctx->model);
 }
 
 enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5658,6 +6130,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
     return model->hparams.n_ctx;
 }
 
+int llama_model_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
 int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
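The new llama_n_ctx_train / llama_model_n_ctx_train accessors expose the context length the model was trained with, as opposed to the context size configured for the current session. A usage sketch against the C API bundled with this gem version (the model path is a placeholder, and the surrounding setup calls are assumed to match the llama.h shipped here):

    // sketch: comparing the configured context size with the training context size
    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_backend_init(false /* numa */);

        llama_context_params params = llama_context_default_params();
        params.n_ctx = 2048; // what we ask for at runtime

        // hypothetical path; substitute a real GGUF file
        llama_model * model = llama_load_model_from_file("model.gguf", params);
        if (model == nullptr) {
            return 1;
        }
        llama_context * ctx = llama_new_context_with_model(model, params);

        std::printf("n_ctx (session) = %d\n", llama_n_ctx(ctx));
        std::printf("n_ctx_train     = %d\n", llama_n_ctx_train(ctx));       // new in this release
        std::printf("n_ctx_train     = %d\n", llama_model_n_ctx_train(model)); // model-level variant

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }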
@@ -6212,7 +6688,7 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize_internal(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }