llama_cpp 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,3 @@
- // Defines fileno on msys:
- #ifndef _GNU_SOURCE
- #define _GNU_SOURCE
- #endif
-
  #include "llama.h"

  #include "ggml.h"
@@ -160,6 +155,7 @@ static std::string format(const char * fmt, ...) {
  enum llm_arch {
  LLM_ARCH_LLAMA,
  LLM_ARCH_FALCON,
+ LLM_ARCH_BAICHUAN,
  LLM_ARCH_GPT2,
  LLM_ARCH_GPTJ,
  LLM_ARCH_GPTNEOX,
@@ -174,6 +170,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_GPTJ, "gptj" },
  { LLM_ARCH_GPTNEOX, "gptneox" },
  { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
  };

  enum llm_kv {
@@ -314,6 +311,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_BAICHUAN,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_FALCON,
  {
@@ -658,15 +674,12 @@ struct llama_mmap {
  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
  if (prefetch) {
  // Advise the kernel to preload the mapped memory
-
  WIN32_MEMORY_RANGE_ENTRY range;
-
  range.VirtualAddress = addr;
  range.NumberOfBytes = (SIZE_T)size;
  if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
  fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
  llama_format_win_err(GetLastError()).c_str());
- }
  }
  #else
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1685,6 +1698,15 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_BAICHUAN:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  };

@@ -1925,7 +1947,6 @@ static void llm_load_tensors(
  const int64_t n_vocab = hparams.n_vocab;

  const auto tn = LLM_TN(model.arch);
-
  switch (model.arch) {
  case LLM_ARCH_LLAMA:
  {
@@ -1968,6 +1989,72 @@ static void llm_load_tensors(

  model.layers.resize(n_layer);

+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+ layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ }
+ }
+ } break;
+ case LLM_ARCH_BAICHUAN:
+ {
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ {
+ ggml_backend backend_norm;
+ ggml_backend backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #else
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #endif // _WIN32
+
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
  for (uint32_t i = 0; i < n_layer; ++i) {
  const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
  const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2544,6 +2631,367 @@ static struct ggml_cgraph * llm_build_llama(
  return gf;
  }

+
+ static struct ggml_cgraph * llm_build_baichaun(
+ llama_context & lctx,
+ const llama_token * tokens,
+ const float * embd,
+ int n_tokens,
+ int n_past) {
+
+ GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+ const int N = n_tokens;
+
+ const auto & model = lctx.model;
+ const auto & hparams = model.hparams;
+
+ const auto & kv_self = lctx.kv_self;
+
+ GGML_ASSERT(!!kv_self.ctx);
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_ctx = hparams.n_ctx;
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head = hparams.n_embd_head();
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ const float freq_base = hparams.rope_freq_base;
+ const float freq_scale = hparams.rope_freq_scale;
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+ const int n_gpu_layers = model.n_gpu_layers;
+
+ auto & buf_compute = lctx.buf_compute;
+
+ struct ggml_init_params params = {
+ /*.mem_size =*/ buf_compute.size,
+ /*.mem_buffer =*/ buf_compute.data,
+ /*.no_alloc =*/ false,
+ };
+
+ params.no_alloc = true;
+
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ if (tokens) {
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ }
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+ } else {
+ #ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+ #endif
+
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+ ggml_allocr_alloc(lctx.alloc, inpL);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ }
+ }
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+ (void) i_gpu_start;
+
+ // offload functions set the tensor output backend to GPU
+ // tensors are GPU-accelerated if any input or the output has been offloaded
+ //
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+ // in that case ggml_cuda_assign_buffers has no effect
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+ offload_func_t offload_func_kq = llama_nop;
+ offload_func_t offload_func_v = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+ }
+ #endif // GGML_USE_CUBLAS
+
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ }
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_format_name(inpL, "layer_inp_%d", il);
+
+ offload_func_t offload_func = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (il >= i_gpu_start) {
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
+ }
+ #endif // GGML_USE_CUBLAS
+
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ {
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_0");
+
+ // cur = cur*attn_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+ offload_func(cur);
+ ggml_set_name(cur, "attention_norm_0");
+ }
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ offload_func_kq(tmpk);
+ ggml_set_name(tmpk, "tmpk");
+
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ offload_func_kq(tmpq);
+ ggml_set_name(tmpq, "tmpq");
+
+ struct ggml_tensor * Kcur;
+ struct ggml_tensor * Qcur;
+ switch (model.type) {
+ case MODEL_7B:
+ Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+ Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+ break;
+ case MODEL_13B:
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+ break;
+ default:
+ GGML_ASSERT(false);
+ }
+
+ offload_func_kq(Kcur);
+ ggml_set_name(Kcur, "Kcur");
+
+ offload_func_kq(Qcur);
+ ggml_set_name(Qcur, "Qcur");
+
+ // store key and value to memory
+ {
+ // compute the transposed [N, n_embd] V matrix
+
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ offload_func_v(tmpv);
+ ggml_set_name(tmpv, "tmpv");
+
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+ offload_func_v(Vcur);
+ ggml_set_name(Vcur, "Vcur");
+
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+ offload_func_kq(k);
+ ggml_set_name(k, "k");
+
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+ offload_func_v(v);
+ ggml_set_name(v, "v");
+
+ // important: storing RoPE-ed version of K in the KV cache!
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ }
+
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ offload_func_kq(Q);
+ ggml_set_name(Q, "Q");
+
+ struct ggml_tensor * K =
+ ggml_view_3d(ctx0, kv_self.k,
+ n_embd_head, n_past + N, n_head_kv,
+ ggml_element_size(kv_self.k)*n_embd_gqa,
+ ggml_element_size(kv_self.k)*n_embd_head,
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+ offload_func_kq(K);
+ ggml_set_name(K, "K");
+
+ // K * Q
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ offload_func_kq(KQ);
+ ggml_set_name(KQ, "KQ");
+
+ // KQ_scaled = KQ / sqrt(n_embd_head)
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+ offload_func_kq(KQ_scaled);
+ ggml_set_name(KQ_scaled, "KQ_scaled");
+
+ struct ggml_tensor * KQ_masked;
+ struct ggml_tensor * KQ_scaled_alibi;
+
+ switch (model.type) {
+ case MODEL_7B:
+ KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ break;
+ case MODEL_13B:
+ KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+ KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+ break;
+ default:
+ GGML_ASSERT(false);
+ }
+ // KQ_masked = mask_past(KQ_scaled)
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+ // offload_func_kq(KQ_masked);
+ // ggml_set_name(KQ_masked, "KQ_masked");
+
+ // KQ = soft_max(KQ_masked)
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+ offload_func_v(KQ_soft_max);
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+ // split cached V into n_head heads
+ struct ggml_tensor * V =
+ ggml_view_3d(ctx0, kv_self.v,
+ n_past + N, n_embd_head, n_head_kv,
+ ggml_element_size(kv_self.v)*n_ctx,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+ offload_func_v(V);
+ ggml_set_name(V, "V");
+
+ #if 1
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ offload_func_v(KQV);
+ ggml_set_name(KQV, "KQV");
+ #else
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+ // is there a better way?
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+ #endif
+
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ offload_func_v(KQV_merged);
+ ggml_set_name(KQV_merged, "KQV_merged");
+
+ // cur = KQV_merged.contiguous().view(n_embd, N)
+ cur = ggml_cpy(ctx0,
+ KQV_merged,
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ offload_func_v(cur);
+ ggml_set_name(cur, "KQV_merged_contiguous");
+
+ // projection (no bias)
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].wo,
+ cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_wo");
+ }
+
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+ offload_func(inpFF);
+ ggml_set_name(inpFF, "inpFF");
+
+ // feed-forward network
+ {
+ // norm
+ {
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_1");
+
+ // cur = cur*ffn_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+ offload_func(cur);
+ ggml_set_name(cur, "ffn_norm");
+ }
+
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+ model.layers[il].w3,
+ cur);
+ offload_func(tmp);
+ ggml_set_name(tmp, "result_w3");
+
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].w1,
+ cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w1");
+
+ // SILU activation
+ cur = ggml_silu(ctx0, cur);
+ offload_func(cur);
+ ggml_set_name(cur, "silu");
+
+ cur = ggml_mul(ctx0, cur, tmp);
+ offload_func(cur);
+ ggml_set_name(cur, "silu_x_result_w3");
+
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].w2,
+ cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w2");
+ }
+
+ cur = ggml_add(ctx0, cur, inpFF);
+ offload_func(cur);
+ ggml_set_name(cur, "inpFF_+_result_w2");
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ // norm
+ {
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+ offload_func_nr(cur);
+ ggml_set_name(cur, "rms_norm_2");
+
+ // cur = cur*norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.output_norm);
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+ ggml_set_name(cur, "result_norm");
+ }
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ ggml_set_name(cur, "result_output");
+
+ ggml_build_forward_expand(gf, cur);
+
+ ggml_free(ctx0);
+
+ return gf;
+ }
+
  static struct ggml_cgraph * llm_build_falcon(
  llama_context & lctx,
  const llama_token * tokens,
@@ -2866,6 +3314,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
  } break;
+ case LLM_ARCH_BAICHUAN:
+ {
+ result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+ } break;
  case LLM_ARCH_FALCON:
  {
  result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -3123,10 +3575,9 @@ struct llm_tokenizer_spm {
  while (offs < text.size()) {
  llm_symbol sym;
  size_t len = utf8_len(text[offs]);
- GGML_ASSERT(offs + len <= text.size());
  sym.text = text.c_str() + offs;
- sym.n = len;
- offs += len;
+ sym.n = std::min(len, text.size() - offs);
+ offs += sym.n;
  sym.prev = index - 1;
  sym.next = offs == text.size() ? -1 : index + 1;
  index++;
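Editorial note on the hunk above: the tokenizer no longer asserts when the input ends in the middle of a multi-byte UTF-8 character; the symbol length is clamped to the bytes that actually remain. A minimal standalone sketch of that behaviour (not part of the package; utf8_len_stub is a hypothetical stand-in for llama.cpp's utf8_len):

    // Editorial sketch: clamping a declared UTF-8 length to the bytes left in the buffer.
    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Stand-in for llama.cpp's utf8_len(): byte count implied by the lead byte.
    static size_t utf8_len_stub(char src) {
        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
        return lookup[static_cast<uint8_t>(src) >> 4];
    }

    int main() {
        const std::string text("abc\xC3", 4);                  // "é" (0xC3 0xA9) cut off after its lead byte
        const size_t offs = 3;
        const size_t len  = utf8_len_stub(text[offs]);          // lead byte promises 2 bytes
        const size_t n    = std::min(len, text.size() - offs);  // clamped to the 1 byte actually present
        std::printf("declared=%zu clamped=%zu\n", len, n);      // declared=2 clamped=1
        return 0;
    }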
@@ -4642,7 +5093,16 @@ void llama_beam_search(llama_context * ctx,
  // quantization
  //

- static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+ template <typename T>
+ struct no_init {
+ T value;
+ no_init() { /* do nothing */ }
+ };
+
+ static void llama_convert_tensor_internal(
+ struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+ const size_t nelements, const int nthread
+ ) {
  if (output.size() < nelements) {
  output.resize(nelements);
  }
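Editorial note on the no_init<T> wrapper introduced above: because its default constructor is user-provided and empty, std::vector<no_init<T>>::resize() default-constructs new elements without zero-filling them, which lets the large conversion and quantization scratch buffers used later grow without a redundant memset. A small self-contained sketch (editorial, not part of the package):

    #include <cstdint>
    #include <vector>

    template <typename T>
    struct no_init {
        T value;
        no_init() { /* intentionally empty: leaves value uninitialized */ }
    };

    int main() {
        std::vector<uint8_t>          zeroed;
        std::vector<no_init<uint8_t>> raw;
        zeroed.resize(1 << 20); // value-initializes: every byte is written to 0
        raw.resize(1 << 20);    // runs the empty constructor: no per-element zero-fill
        return 0;
    }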
@@ -4677,7 +5137,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
  auto blocks_per_thread = nblocks / nthread;
  auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

- std::vector<std::thread> workers;
  for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
  auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
  auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4690,14 +5149,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
  qtype.to_float(inbuf, outbuf, nels);
  }
  };
- workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+ workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
  in_buff_offs += thr_block_bytes;
  out_buff_offs += thr_elems;
  }
- for (auto & worker : workers) {
- worker.join();
+ for (auto & w : workers) { w.join(); }
+ workers.clear();
+ }
+
+ #ifdef GGML_USE_K_QUANTS
+ static ggml_type get_k_quant_type(
+ ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+ int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+ ) {
+ const std::string name = ggml_get_name(tensor);
+ // TODO: avoid hardcoded tensor names - use the TN_* constants
+ const auto tn = LLM_TN(model.arch);
+
+ auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+ };
+
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+ int nx = tensor->ne[0];
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (new_type != GGML_TYPE_Q8_0) {
+ new_type = GGML_TYPE_Q6_K;
+ }
+ } else if (name.find("attn_v.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+ new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+ use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+ (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+ if (model.type == MODEL_70B) {
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+ }
+ ++*i_attention_wv;
+ } else if (name.find("ffn_down.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+ : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+ if (model.arch == LLM_ARCH_FALCON) {
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+ use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ } else {
+ if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+ new_type = GGML_TYPE_Q5_K;
+ }
+ ++*i_feed_forward_w2;
+ } else if (name.find("attn_output.weight") != std::string::npos) {
+ if (model.arch != LLM_ARCH_FALCON) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ }
+ }
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+ }
+ else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ }
+ // This can be used to reduce the size of the Q5_K_S model.
+ // The associated PPL increase is fully in line with the size reduction
+ //else {
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+ //}
+ bool convert_incompatible_tensor = false;
+ if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+ int nx = tensor->ne[0];
+ int ny = tensor->ne[1];
+ if (nx % QK_K != 0) {
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+ convert_incompatible_tensor = true;
+ }
+ }
+ if (convert_incompatible_tensor) {
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+ LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+ } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+ } else {
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
  }
+
+ return new_type;
  }
+ #endif

  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
  ggml_type quantized_type;
@@ -4782,18 +5350,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<int64_t> hist_all(1 << 4, 0);

  std::vector<std::thread> workers;
+ workers.reserve(nthread);
  std::mutex mutex;

- #ifdef GGML_USE_K_QUANTS
- auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
- return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
- };
- #endif
-
  int idx = 0;

- std::vector<uint8_t> read_data;
- std::vector<uint8_t> work;
+ std::vector<no_init<uint8_t>> read_data;
+ std::vector<no_init<uint8_t>> work;
+ std::vector<no_init<float>> f32_conv_buf;

  // populate the original tensors so we get an initial meta data
  for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4815,7 +5379,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  const std::string name = ggml_get_name(tensor);

- read_data.resize(ggml_nbytes(tensor));
+ if (read_data.size() < ggml_nbytes(tensor)) {
+ read_data.resize(ggml_nbytes(tensor));
+ }
  tensor->data = read_data.data();
  ml->load_data_for(tensor);

@@ -4840,101 +5406,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (quantize) {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
- // TODO: avoid hardcoded tensor names - use the TN_* constants
- const auto tn = LLM_TN(ml->get_arch());
-
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
- int nx = tensor->ne[0];
- if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
- new_type = GGML_TYPE_Q8_0;
- }
- else if (new_type != GGML_TYPE_Q8_0) {
- new_type = GGML_TYPE_Q6_K;
- }
- } else if (name.find("attn_v.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
- }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
- if (model.type == MODEL_70B) {
- // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
- // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
- // nearly negligible increase in model size by quantizing this tensor with more bits:
- if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
- }
- ++i_attention_wv;
- } else if (name.find("ffn_down.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
- : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
- : GGML_TYPE_Q3_K;
- }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
- new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
- }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
- if (model.arch == LLM_ARCH_FALCON) {
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
- } else {
- if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- }
- }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
- new_type = GGML_TYPE_Q5_K;
- }
- ++i_feed_forward_w2;
- } else if (name.find("attn_output.weight") != std::string::npos) {
- if (model.arch != LLM_ARCH_FALCON) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
- } else {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
- }
- }
- else if (name.find("attn_qkv.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
- }
- else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- }
- // This can be used to reduce the size of the Q5_K_S model.
- // The associated PPL increase is fully in line with the size reduction
- //else {
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
- //}
- bool convert_incompatible_tensor = false;
- if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
- int nx = tensor->ne[0];
- int ny = tensor->ne[1];
- if (nx % QK_K != 0) {
- LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
- convert_incompatible_tensor = true;
- }
- }
- if (convert_incompatible_tensor) {
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
- new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
- LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
- } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
- new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
- LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
- } else {
- throw std::runtime_error("Unsupported tensor size encountered\n");
- }
- }
+ new_type = get_k_quant_type(
+ new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+ );
  #endif
  // If we've decided to quantize to the same type the tensor is already
  // in then there's nothing to do.
@@ -4949,23 +5423,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  const size_t nelements = ggml_nelements(tensor);

  float * f32_data;
- std::vector<float> f32_conv_buf;

  if (tensor->type == GGML_TYPE_F32) {
  f32_data = (float *) tensor->data;
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
  } else {
- llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+ llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
  f32_data = (float *) f32_conv_buf.data();
  }

  LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
  fflush(stdout);

- work.resize(nelements * 4); // upper bound on size
+ if (work.size() < nelements * 4) {
+ work.resize(nelements * 4); // upper bound on size
+ }
  new_data = work.data();
- std::vector<int64_t> hist_cur(1 << 4, 0);
+ std::array<int64_t, 1 << 4> hist_cur = {};

  static const int chunk_size = 32 * 512;
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4976,13 +5451,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  size_t counter = 0;
  new_size = 0;
  auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
- std::vector<int64_t> local_hist;
+ std::array<int64_t, 1 << 4> local_hist = {};
  size_t local_size = 0;
  while (true) {
  std::unique_lock<std::mutex> lock(mutex);
  size_t first = counter; counter += chunk_size;
  if (first >= nelements) {
- if (!local_hist.empty()) {
+ if (local_size > 0) {
  for (int j=0; j<int(local_hist.size()); ++j) {
  hist_cur[j] += local_hist[j];
  }
@@ -4992,22 +5467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  lock.unlock();
  size_t last = std::min(nelements, first + chunk_size);
- if (local_hist.empty()) {
- local_hist.resize(hist_cur.size(), 0);
- }
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
  }
  };
- if ((int) workers.size() < nthread_use - 1) {
- workers.resize(nthread_use - 1);
- }
  for (int it = 0; it < nthread_use - 1; ++it) {
- workers[it] = std::thread(compute);
+ workers.emplace_back(compute);
  }
  compute();
- for (int it = 0; it < nthread_use - 1; ++it) {
- workers[it].join();
- }
+ for (auto & w : workers) { w.join(); }
+ workers.clear();
  }

  LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
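Editorial note: the quantization loop above now keeps a single workers vector alive across tensors, filling it with emplace_back, joining, and clearing it per batch instead of recreating the vector each time. A minimal sketch of that reuse pattern (run_batches and the lambda body are illustrative only, not from the package):

    #include <thread>
    #include <vector>

    void run_batches(int nthread, int nbatches) {
        std::vector<std::thread> workers;
        workers.reserve(nthread);                   // allocate the slots once
        for (int b = 0; b < nbatches; ++b) {
            for (int t = 0; t < nthread - 1; ++t) {
                workers.emplace_back([] { /* one chunk of work */ });
            }
            /* the calling thread does its own share here */
            for (auto & w : workers) { w.join(); }  // wait for this batch
            workers.clear();                        // destroys the joined threads; capacity is kept for the next batch
        }
    }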
@@ -5635,15 +6103,19 @@ void llama_free(struct llama_context * ctx) {
  }

  int llama_n_vocab(const struct llama_context * ctx) {
- return ctx->model.vocab.id_to_token.size();
+ return llama_model_n_vocab(&ctx->model);
  }

  int llama_n_ctx(const struct llama_context * ctx) {
- return ctx->model.hparams.n_ctx;
+ return llama_model_n_ctx(&ctx->model);
+ }
+
+ int llama_n_ctx_train(const struct llama_context * ctx) {
+ return llama_model_n_ctx_train(&ctx->model);
  }

  int llama_n_embd(const struct llama_context * ctx) {
- return ctx->model.hparams.n_embd;
+ return llama_model_n_embd(&ctx->model);
  }

  enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5658,6 +6130,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
  return model->hparams.n_ctx;
  }

+ int llama_model_n_ctx_train(const struct llama_model * model) {
+ return model->hparams.n_ctx_train;
+ }
+
  int llama_model_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }
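Editorial note: together with llama_n_ctx_train in the previous hunk, this exposes the context length the model was trained with alongside the runtime n_ctx. A hypothetical usage sketch (print_train_ctx is not part of the package; it assumes a model and context already created through the existing llama.h entry points):

    #include "llama.h"
    #include <cstdio>

    // Hypothetical helper: compare the configured context size with the trained one.
    static void print_train_ctx(const struct llama_model * model, const struct llama_context * ctx) {
        std::printf("n_ctx (runtime)  = %d\n", llama_model_n_ctx(model));
        std::printf("n_ctx_train      = %d\n", llama_model_n_ctx_train(model));
        std::printf("n_ctx_train(ctx) = %d\n", llama_n_ctx_train(ctx));
    }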
@@ -6212,7 +6688,7 @@ int llama_tokenize_with_model(
  auto res = llama_tokenize_internal(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
- LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+ // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
  return -((int) res.size());
  }