llama_cpp 0.12.0 → 0.12.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.cpp
CHANGED
@@ -990,14 +990,14 @@ struct llama_mmap {

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch > 0) {
-
-
-
-
-
-
-
-
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1898,6 +1898,28 @@ static void llama_kv_cache_seq_shift(
     cache.head = new_head != cache.size ? new_head : 0;
 }
 
+static void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        int d) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+
+            {
+                llama_pos p_old = cache.cells[i].pos;
+                cache.cells[i].pos /= d;
+                cache.cells[i].delta += cache.cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
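The loop above is the core of the new llama_kv_cache_seq_div helper: every cached position in [p0, p1) for the given sequence is integer-divided by d, and the per-cell delta records how far the position moved so the RoPEd K data can be updated accordingly. A small standalone sketch of that arithmetic (plain arrays stand in for the cache cells; d = 4 is an arbitrary example factor, not a value from the library):

    #include <cstdio>

    // Standalone illustration of the update inside llama_kv_cache_seq_div:
    // each affected position is divided by d, and delta accumulates how far
    // the position moved. The pos/delta arrays stand in for the cache cells.
    int main() {
        int pos[8]   = {0, 1, 2, 3, 4, 5, 6, 7};
        int delta[8] = {0, 0, 0, 0, 0, 0, 0, 0};
        const int d  = 4; // arbitrary example factor

        for (int i = 0; i < 8; ++i) {
            const int p_old = pos[i];
            pos[i]   /= d;               // 0,1,2,3,4,5,6,7 -> 0,0,0,0,1,1,1,1
            delta[i] += pos[i] - p_old;  // recorded so the cached K data can be shifted later
            printf("cell %d: pos %d -> %d, delta %d\n", i, p_old, pos[i], delta[i]);
        }
        return 0;
    }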
@@ -2175,7 +2197,11 @@ struct llama_model_loader {
                 type_max = type;
             }
 
-            //
+            // TODO: make runtime configurable
+#if 0
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+#endif
         }
 
         switch (type_max) {
@@ -2191,6 +2217,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+            case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2553,7 +2581,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2562,6 +2591,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2796,6 +2827,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -3112,7 +3144,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-
+    if (ml.n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
+    } else if (ml.n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    } else if (ml.n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
+    } else {
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
+    }
     if (ml.n_bytes < GiB) {
         LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
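For example, with the new branches above a model of 7,240,000,000 elements falls into the 1e9 case and is logged as "model params = 7.24 B", while one of 124,000,000 elements is logged as "model params = 124.00 M".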
@@ -4767,7 +4807,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4891,7 +4930,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4990,9 +5028,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         const int64_t n_rot = n_embd_head_k / 2;
 
@@ -5204,9 +5240,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5299,7 +5333,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5395,7 +5428,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5722,7 +5754,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
@@ -5946,7 +5977,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -8921,10 +8951,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down
+    } else if (name.find("ffn_down") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 <
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8933,14 +8966,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 <
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
                 if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 <
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
@@ -8958,9 +8991,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
-
-
-
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // This can be used to reduce the size of the Q5_K_S model.
     // The associated PPL increase is fully in line with the size reduction
     //else {
@@ -9009,6 +9043,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9017,6 +9052,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -9065,7 +9102,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
@@ -10141,9 +10178,21 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
 }
 
 void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
     llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
+}
+
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -10876,7 +10925,7 @@ void llama_print_timings(struct llama_context * ctx) {
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
data/vendor/tmp/llama.cpp/llama.h
CHANGED
@@ -103,6 +103,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
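The new enum values plug into the existing quantization entry point. A minimal sketch (the input and output file names are placeholders, and depending on the llama.cpp revision the IQ2 types may additionally require importance-matrix data that this sketch does not supply):

    #include "llama.h"   // the vendored header diffed above
    #include <cstdio>

    int main() {
        // Start from the library defaults and request the new IQ2_XS file type.
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype   = LLAMA_FTYPE_MOSTLY_IQ2_XS; // new in this release (enum value 20)
        params.nthread = 4;

        // Paths are placeholders for illustration only.
        if (llama_model_quantize("model-f16.gguf", "model-iq2_xs.gguf", &params) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }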
@@ -484,6 +487,17 @@ extern "C" {
             llama_pos p1,
             llama_pos delta);
 
+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d);
+
     //
     // State / sessions
     //
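For C API callers, the new declaration sits next to llama_kv_cache_seq_shift. A hedged sketch of a helper that compresses a span of cached positions for sequence 0 (n_keep, n_past, and the grouping factor are hypothetical caller-supplied values, not names from the library):

    #include "llama.h"

    // Divide the cached positions of sequence 0 in [n_keep, n_past) by `factor`.
    // Per the header comment above, factor should be > 1; the llama.cpp
    // implementation simply returns when d == 1.
    static void compress_cache_positions(struct llama_context * ctx,
                                         llama_pos n_keep,
                                         llama_pos n_past,
                                         int factor) {
        llama_kv_cache_seq_div(ctx, /*seq_id=*/0, n_keep, n_past, factor);
    }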
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 0.12.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-
+date: 2024-01-13 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: