llama_cpp 0.12.0 → 0.12.1

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -990,14 +990,14 @@ struct llama_mmap {
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch > 0) {
-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1898,6 +1898,28 @@ static void llama_kv_cache_seq_shift(
     cache.head = new_head != cache.size ? new_head : 0;
 }
 
+static void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                          int   d) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+
+            {
+                llama_pos p_old = cache.cells[i].pos;
+                cache.cells[i].pos   /= d;
+                cache.cells[i].delta += cache.cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
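The new static helper integer-divides the RoPE position of every KV cell in [p0, p1) that belongs to seq_id, and accumulates the move into the cell's delta, with has_shift flagging that the cached K data must be re-rotated on the next graph build. A standalone sketch of the per-cell arithmetic follows (illustrative values only, not code from the gem):

// Illustrative only: mirrors what llama_kv_cache_seq_div(cache, seq_id, 0, -1, 4)
// does to each cell's position. With d = 4, positions 0..15 collapse onto 0..3,
// and delta records how far each position moved (always <= 0 here).
#include <cstdio>

int main() {
    const int d = 4;
    for (int pos = 0; pos < 16; ++pos) {
        const int p_old = pos;
        const int p_new = p_old / d;       // cache.cells[i].pos   /= d;
        const int delta = p_new - p_old;   // cache.cells[i].delta += p_new - p_old;
        std::printf("pos %2d -> %2d (delta %3d)\n", p_old, p_new, delta);
    }
    return 0;
}

Dividing positions like this keeps a long history inside the position range the model was trained on; upstream llama.cpp uses the same primitive in its group-attention ("self-extend") examples, although those examples are not part of this gem diff.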
@@ -2175,7 +2197,11 @@ struct llama_model_loader {
                 type_max = type;
             }
 
-            // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+            // TODO: make runtime configurable
+#if 0
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+#endif
         }
 
         switch (type_max) {
@@ -2191,6 +2217,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+            case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2553,7 +2581,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2562,6 +2591,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2796,6 +2827,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -3112,7 +3144,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
+    } else if (ml.n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    } else if (ml.n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
+    } else {
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
+    }
     if (ml.n_bytes < GiB) {
         LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
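The unit in the "model params" line is now chosen from the element count instead of always printing billions. With illustrative counts: 6.74e9 parameters still logs "model params = 6.74 B", a 160e6-parameter model logs "160.00 M" rather than "0.16 B", and a 1.2e12-parameter model would log "1.20 T".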
@@ -4767,7 +4807,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4891,7 +4930,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4990,9 +5028,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         const int64_t n_rot = n_embd_head_k / 2;
 
@@ -5204,9 +5240,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5299,7 +5333,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5395,7 +5428,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5722,7 +5754,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
@@ -5946,7 +5977,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -8921,10 +8951,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down.weight") != std::string::npos) {
+    } else if (name.find("ffn_down") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8933,14 +8966,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
-               new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+               new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
            }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        ++qs.i_feed_forward_w2;
@@ -8958,9 +8991,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
-    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    }
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
    // This can be used to reduce the size of the Q5_K_S model.
    // The associated PPL increase is fully in line with the size reduction
    //else {
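The replaced hard-coded cut-offs make the k-quant mixing thresholds scale with model depth rather than being tuned for roughly 32-layer models. As a worked example, assuming one ffn_down tensor per layer (an assumption, not stated in the diff): a 32-layer model gives n_feed_forward_w2/16 = 2, so Q3_K_M picks exactly the same two Q5_K layers as the old "< 2" rule, while an 80-layer model now gets 80/16 = 5 such layers. Matching on "ffn_down" instead of "ffn_down.weight" also lets per-expert tensors (e.g. a hypothetical ffn_down.0.weight in a MoE model) participate in the same mixing rules.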
@@ -9009,6 +9043,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9017,6 +9052,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -9065,7 +9102,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down.weight") != std::string::npos) {
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
@@ -10141,9 +10178,21 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
 }
 
 void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
     llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
+}
+
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -10876,7 +10925,7 @@ void llama_print_timings(struct llama_context * ctx) {
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
@@ -103,6 +103,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -484,6 +487,17 @@ extern "C" {
                        llama_pos   p1,
                        llama_pos   delta);
 
+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d);
+
     //
     // State / sessions
     //
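A hedged usage sketch of the new public call, written as hypothetical application code rather than anything in this package (the compression ratio and the n_past bookkeeping are assumptions for illustration):

#include "llama.h"

// Sketch: halve the RoPE positions of the first half of sequence 0's cached
// history, then shift the rest down so the positions stay contiguous.
// Assumes `ctx` already holds `n_past` tokens for sequence 0 and that
// n_past is divisible by 4 so the arithmetic stays exact.
static void compress_history(struct llama_context * ctx, llama_pos n_past) {
    const llama_seq_id seq = 0;

    // Positions [0, n_past/2) are integer-divided by 2; with d == 1 the call
    // would return immediately thanks to the early-out added above.
    llama_kv_cache_seq_div(ctx, seq, 0, n_past/2, 2);

    // The untouched tail [n_past/2, n_past) is shifted left so it starts
    // right after the compressed block, which now ends at n_past/4.
    llama_kv_cache_seq_shift(ctx, seq, n_past/2, n_past, -(n_past/4));
}

Both calls only mark the affected cells; the actual re-roping of the cached K data happens the next time a batch is decoded, the same deferred mechanism the existing llama_kv_cache_seq_shift already relies on.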
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 0.12.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-11 00:00:00.000000000 Z
+date: 2024-01-13 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: