llama_cpp 0.12.0 → 0.12.1

@@ -990,14 +990,14 @@ struct llama_mmap {
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch > 0) {
-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1898,6 +1898,28 @@ static void llama_kv_cache_seq_shift(
     cache.head = new_head != cache.size ? new_head : 0;
 }
 
+static void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                          int   d) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+
+            {
+                llama_pos p_old = cache.cells[i].pos;
+                cache.cells[i].pos   /= d;
+                cache.cells[i].delta += cache.cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
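The new static helper walks the cache cells that belong to `seq_id` and integer-divides their positions by `d`, recording the movement in each cell's `delta` so a RoPE shift can be applied later. A minimal standalone sketch of what that does to a run of cached positions (toy types only, not the library's actual cache structures):

#include <cstdio>
#include <vector>

// Toy stand-in for a KV cache cell: just the two fields the division touches.
struct toy_cell { int pos; int delta; };

int main() {
    std::vector<toy_cell> cells;
    for (int p = 0; p < 8; ++p) cells.push_back({p, 0});

    const int d = 2; // division factor, analogous to the `d` argument above
    for (auto & c : cells) {
        const int p_old = c.pos;
        c.pos   /= d;              // positions 0..7 collapse to 0,0,1,1,2,2,3,3
        c.delta += c.pos - p_old;  // delta records how far each cell moved
    }

    for (const auto & c : cells) std::printf("pos=%d delta=%d\n", c.pos, c.delta);
    return 0;
}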
@@ -2175,7 +2197,11 @@ struct llama_model_loader {
                 type_max = type;
             }
 
-            // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+            // TODO: make runtime configurable
+#if 0
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+#endif
         }
 
         switch (type_max) {
@@ -2191,6 +2217,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+            case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2553,7 +2581,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2562,6 +2591,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XSS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return "IQ2_XS - 2.3125 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2796,6 +2827,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -3112,7 +3144,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
+    } else if (ml.n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    } else if (ml.n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
+    } else {
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
+    }
     if (ml.n_bytes < GiB) {
         LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
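The replaced single-format log line always printed billions, so a small model would show up as something like "0.16 B"; the new branches pick a unit by magnitude. A minimal sketch of that selection as a free function, purely for illustration (the helper name is hypothetical; llama.cpp keeps the branches inline):

#include <cstdio>

// Illustrative only: choose a scale suffix the way the new logging branches do.
static void print_model_params(double n_elements) {
    if      (n_elements >= 1e12) std::printf("model params = %.2f T\n", n_elements*1e-12);
    else if (n_elements >= 1e9 ) std::printf("model params = %.2f B\n", n_elements*1e-9);
    else if (n_elements >= 1e6 ) std::printf("model params = %.2f M\n", n_elements*1e-6);
    else                         std::printf("model params = %.2f K\n", n_elements*1e-3);
}

int main() {
    print_model_params(1.1e9);  // prints "model params = 1.10 B"
    print_model_params(1.6e8);  // prints "model params = 160.00 M"
    return 0;
}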
@@ -4767,7 +4807,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4891,7 +4930,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4990,9 +5028,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         const int64_t n_rot = n_embd_head_k / 2;
 
@@ -5204,9 +5240,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5299,7 +5333,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5395,7 +5428,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5722,7 +5754,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
@@ -5946,7 +5977,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -8921,10 +8951,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down.weight") != std::string::npos) {
+    } else if (name.find("ffn_down") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8933,14 +8966,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
            }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        ++qs.i_feed_forward_w2;
@@ -8958,9 +8991,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
-    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    }
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // This can be used to reduce the size of the Q5_K_S model.
     // The associated PPL increase is fully in line with the size reduction
     //else {
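The common thread in these hunks is that hard-coded layer counts (`i_feed_forward_w2 < 2`, `< 4`) become fractions of the total number of `ffn_down` tensors, so the "spend more bits on the earliest layers" heuristic scales with model depth, and the match is widened from `ffn_down.weight` to any name containing `ffn_down` (so per-expert tensors in MoE models are counted too). A hedged, self-contained sketch of the fraction-based idea; the helper and the type names are illustrative, not the actual `get_k_quant_type` logic:

#include <cstdio>

// Illustrative fraction-based bump, mirroring the `< n_feed_forward_w2/16` and
// `< n_feed_forward_w2/8` tests above (not the real quantization heuristic).
enum toy_type { TOY_Q3_K, TOY_Q4_K, TOY_Q5_K };

static toy_type pick_ffn_down_type(int i_layer, int n_layers) {
    if (i_layer < n_layers/16) return TOY_Q5_K; // earliest layers: most extra bits
    if (i_layer < n_layers/8)  return TOY_Q4_K; // next slice: a smaller bump
    return TOY_Q3_K;                            // the rest: baseline
}

int main() {
    const int n_layers = 32; // e.g. a 7B-class model
    for (int i = 0; i < n_layers; ++i) {
        std::printf("layer %2d -> %d\n", i, (int) pick_ffn_down_type(i, n_layers));
    }
    return 0;
}

For a 32-layer model this reproduces the old `< 2` and `< 4` cut-offs; for deeper models the cut-offs now grow with the layer count instead of staying fixed.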
@@ -9009,6 +9043,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9017,6 +9052,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
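With the new ftype cases mapped to `GGML_TYPE_IQ2_XXS` / `GGML_TYPE_IQ2_XS`, the 2-bit formats can be requested through the existing quantization entry point. A hedged sketch against the llama.h API (file names are placeholders; only `ftype` and `nthread` are set, everything else is left at the defaults):

#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ2_XXS; // the new 2.0625 bpw format
    params.nthread = 4;

    // llama_model_quantize returns 0 on success
    return (int) llama_model_quantize("model-f16.gguf", "model-iq2_xxs.gguf", &params);
}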
@@ -9065,7 +9102,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down.weight") != std::string::npos) {
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
@@ -10141,9 +10178,21 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
 }
 
 void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
     llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
+}
+
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -10876,7 +10925,7 @@ void llama_print_timings(struct llama_context * ctx) {
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
@@ -103,6 +103,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -484,6 +487,17 @@ extern "C" {
             llama_pos p1,
             llama_pos delta);
 
+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d);
+
     //
     // State / sessions
     //
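The comment block above fixes the semantics of the new call: positions of `seq_id` in the given range are integer-divided by `d`, and negative bounds mean open-ended ranges. A short usage sketch against this declaration (assumes a `llama_context` that already holds `n_past` decoded tokens; the factor is illustrative):

#include "llama.h"

// Hedged sketch: compress the cached positions of sequence 0 by a factor of 4,
// e.g. for grouped / "self-extend" style handling of long contexts.
static void compress_cached_positions(struct llama_context * ctx, llama_pos n_past) {
    const llama_seq_id seq_id = 0;
    const int          d      = 4; // must be > 1 to have any effect

    llama_kv_cache_seq_div(ctx, seq_id, 0, n_past, d);
}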
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 0.12.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-11 00:00:00.000000000 Z
+date: 2024-01-13 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: