llama_cpp 0.12.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.cpp
CHANGED
@@ -990,14 +990,14 @@ struct llama_mmap {
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch > 0) {
-
-
-
-
-
-
-
-
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1898,6 +1898,28 @@ static void llama_kv_cache_seq_shift(
     cache.head = new_head != cache.size ? new_head : 0;
 }
 
+static void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                          int   d) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+
+            {
+                llama_pos p_old = cache.cells[i].pos;
+                cache.cells[i].pos /= d;
+                cache.cells[i].delta += cache.cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
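For orientation: the new helper visits every cell in the KV cache and, for cells that belong to `seq_id` with a position in `[p0, p1)`, integer-divides the position by `d`, accumulating the difference in `delta` and setting `has_shift` so the RoPE'd K data can be corrected on the next graph build. A minimal standalone sketch of just that arithmetic (toy cell struct, not the real llama.cpp internals):

```cpp
#include <cstdio>
#include <vector>

// Toy stand-in for a KV cache cell; the real struct lives inside llama.cpp.
struct toy_cell {
    int pos;    // current position of the cached token
    int delta;  // accumulated shift, later consumed by the K-shift step
};

int main() {
    std::vector<toy_cell> cells;
    for (int i = 0; i < 8; ++i) cells.push_back({i, 0});

    const int d = 2; // divide positions by 2, as llama_kv_cache_seq_div would
    for (auto & c : cells) {
        const int p_old = c.pos;
        c.pos   /= d;
        c.delta += c.pos - p_old;
    }

    // positions 0..7 collapse to 0 0 1 1 2 2 3 3 (grouped positions)
    for (const auto & c : cells) printf("pos=%d delta=%d\n", c.pos, c.delta);
    return 0;
}
```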
@@ -2175,7 +2197,11 @@ struct llama_model_loader {
                 type_max = type;
             }
 
-            //
+            // TODO: make runtime configurable
+#if 0
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+#endif
         }
 
         switch (type_max) {
@@ -2191,6 +2217,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+            case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2553,7 +2581,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2562,6 +2591,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2796,6 +2827,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -3112,7 +3144,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-
+    if (ml.n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
+    } else if (ml.n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    } else if (ml.n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
+    } else {
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
+    }
     if (ml.n_bytes < GiB) {
         LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
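For scale: a 7B model has roughly 6.7e9 elements, so it lands in the `>= 1e9` branch and is reported as, say, `model params = 6.74 B`. A small sketch of the same threshold logic as a standalone helper (illustrative only, not part of llama.cpp's API):

```cpp
#include <cstdio>
#include <cstdint>

// Mirror of the new reporting rule: pick the largest suffix that fits.
static void print_params(uint64_t n_elements) {
    if      (n_elements >= 1000000000000ULL) printf("model params = %.2f T\n", n_elements*1e-12);
    else if (n_elements >= 1000000000ULL)    printf("model params = %.2f B\n", n_elements*1e-9);
    else if (n_elements >= 1000000ULL)       printf("model params = %.2f M\n", n_elements*1e-6);
    else                                     printf("model params = %.2f K\n", n_elements*1e-3);
}

int main() {
    print_params(6738415616ULL); // a LLaMA-7B-sized model -> "6.74 B"
    print_params(1100000000ULL); // a ~1.1B model          -> "1.10 B"
    return 0;
}
```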
@@ -4767,7 +4807,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4891,7 +4930,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4990,9 +5028,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         const int64_t n_rot = n_embd_head_k / 2;
 
@@ -5204,9 +5240,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5299,7 +5333,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5395,7 +5428,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5722,7 +5754,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
@@ -5946,7 +5977,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -8921,10 +8951,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down
+    } else if (name.find("ffn_down") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 <
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8933,14 +8966,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 <
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
                 if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 <
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
@@ -8958,9 +8991,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
-
-
-
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // This can be used to reduce the size of the Q5_K_S model.
     // The associated PPL increase is fully in line with the size reduction
     //else {
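In the new `Q2_K_S` branch only the first eighth of the `ffn_down` tensors (in quantization order) is promoted to Q4_K; the rest keep the base type selected for that ftype (Q2_K, per the mapping added further below). A toy illustration of the threshold, assuming one `ffn_down` tensor per layer in a 32-layer model:

```cpp
#include <cstdio>

int main() {
    const int n_feed_forward_w2 = 32; // e.g. one ffn_down tensor per layer

    for (int i_feed_forward_w2 = 0; i_feed_forward_w2 < n_feed_forward_w2; ++i_feed_forward_w2) {
        // Same condition as the Q2_K_S branch in get_k_quant_type
        const bool upgrade = i_feed_forward_w2 < n_feed_forward_w2/8;
        // Tensors past the threshold keep the ftype's base type (Q2_K here).
        printf("ffn_down #%2d -> %s\n", i_feed_forward_w2, upgrade ? "Q4_K" : "Q2_K");
    }
    return 0;
}
```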
@@ -9009,6 +9043,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9017,6 +9052,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -9065,7 +9102,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
@@ -10141,9 +10178,21 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
 }
 
 void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
     llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
+}
+
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -10876,7 +10925,7 @@ void llama_print_timings(struct llama_context * ctx) {
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
data/vendor/tmp/llama.cpp/llama.h
CHANGED
@@ -103,6 +103,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
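A hedged sketch of requesting one of the new file types through the C quantization API; it assumes `llama_model_quantize` and `llama_model_quantize_default_params` keep their existing llama.h signatures, and the file names are placeholders:

```cpp
#include "llama.h"

int main() {
    // Start from the library defaults and override only the target file type.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ2_XS; // new 2.3125 bpw type added in this release
    params.nthread = 4;

    // Returns 0 on success. Note: the IQ2 types may additionally expect an
    // importance matrix upstream; consult llama.cpp for the exact requirements.
    return (int) llama_model_quantize("model-f16.gguf", "model-iq2_xs.gguf", &params);
}
```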
@@ -484,6 +487,17 @@ extern "C" {
                    llama_pos   p1,
                    llama_pos   delta);
 
+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d);
+
     //
     // State / sessions
     //
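As a usage sketch (hypothetical helper and values, not from the gem): dividing the positions of an already-evaluated prefix and then shifting the tail keeps the cache positions contiguous, which is the building block for grouped-position tricks such as self-extend-style context extension.

```cpp
#include "llama.h"

// Hypothetical helper: compress the positions of the first 1024 cached tokens of
// sequence 0 by a factor of 4, then move the remaining tokens back so positions
// stay contiguous. Assumes `ctx` already holds an evaluated sequence 0.
static void compress_prefix(struct llama_context * ctx) {
    const llama_seq_id seq = 0;

    // [0, 1024) : positions 0..1023 become 0..255
    llama_kv_cache_seq_div(ctx, seq, 0, 1024, 4);

    // [1024, inf) : shift the tail back by the 768 positions freed above
    llama_kv_cache_seq_shift(ctx, seq, 1024, -1, -768);
}
```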
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 0.12.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-
+date: 2024-01-13 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: