llama_cpp 0.12.1 → 0.12.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-quants.h CHANGED
@@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
 void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
-void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
-void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
 
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
 
 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -246,3 +242,18 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
+//
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+//
+size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
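For context: these row-oriented quantize_* entry points supersede the per-type ggml_quantize_* helpers for the listed types, and optionally accept an importance matrix. A minimal usage sketch, assuming caller-allocated src/dst buffers (the helper name is hypothetical; only the iq2 types require a non-NULL imatrix):

#include <assert.h>
#include "ggml.h"
#include "ggml-quants.h"

// Hypothetical helper: quantize nrows rows of f32 data to Q4_K, row by row.
static size_t quantize_rows_q4_K(const float * src, void * dst, int nrows, int n_per_row) {
    int64_t hist[16] = {0};  // histogram buckets, as in the older ggml_quantize_* API
    const size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
    // dst must provide nrows * row_size bytes; NULL imatrix = no importance weighting
    const size_t written = quantize_q4_K(src, dst, nrows, n_per_row, hist, NULL);
    assert(written == (size_t) nrows * row_size);
    return written;
}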
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xxs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
-        .from_float               = quantize_row_iq2_xxs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
         .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
@@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
-        .from_float               = quantize_row_iq2_xs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xs_reference,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
         .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
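Since the iq2 entries now leave from_float and from_float_reference unset, code that dispatches through type_traits has to treat a NULL converter as "cannot quantize directly" (direct f32 conversion without an importance matrix is no longer supported for these types). A minimal sketch of such a guard, using the public accessor for the traits table:

// Sketch: iq2_xxs/iq2_xs now report no direct f32 -> quant converter.
static bool can_quantize_directly(enum ggml_type type) {
    return ggml_internal_get_type_traits(type).from_float != NULL;
}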
@@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
     GGML_PRINT("%s: --- end ---\n", __func__);
 }
 
-int64_t ggml_nelements(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-int64_t ggml_nrows(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     size_t nbytes;
     size_t blck_size = ggml_blck_size(tensor->type);
     if (blck_size == 1) {
@@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
     return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
-int ggml_blck_size(enum ggml_type type) {
+GGML_CALL int ggml_blck_size(enum ggml_type type) {
     return type_traits[type].blck_size;
 }
 
-size_t ggml_type_size(enum ggml_type type) {
+GGML_CALL size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }
 
-size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
     assert(ne % ggml_blck_size(type) == 0);
     return ggml_type_size(type)*ne/ggml_blck_size(type);
 }
@@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
-const char * ggml_type_name(enum ggml_type type) {
+GGML_CALL const char * ggml_type_name(enum ggml_type type) {
     return type_traits[type].type_name;
 }
 
-bool ggml_is_quantized(enum ggml_type type) {
+GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
     return type_traits[type].is_quantized;
 }
 
-const char * ggml_op_name(enum ggml_op op) {
+GGML_CALL const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }
 
@@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
     return GGML_UNARY_OP_NAME[op];
 }
 
-const char * ggml_op_desc(const struct ggml_tensor * t) {
+GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
     if (t->op == GGML_OP_UNARY) {
         enum ggml_unary_op uop = ggml_get_unary_op(t);
         return ggml_unary_op_name(uop);
@@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
     }
 }
 
-size_t ggml_element_size(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
 
@@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
     return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
 }
 
-bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_is_permuted(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
@@ -2354,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 }
 
 void ggml_free(struct ggml_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
     // make this function thread safe
     ggml_critical_section_start();
 
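This makes ggml_free(NULL) a safe no-op, in line with free(3), so error paths can release a context unconditionally; a minimal sketch:

// Sketch: cleanup no longer needs a NULL check around ggml_free.
struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
struct ggml_context * ctx = ggml_init(params);
// ... use ctx, which may legitimately be NULL on allocation failure ...
ggml_free(ctx); // no-op when ctx == NULL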
@@ -3075,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }
 
-enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->op == GGML_OP_UNARY);
     return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
 }
@@ -4362,6 +4366,23 @@ struct ggml_tensor * ggml_cpy(
     return ggml_cpy_impl(ctx, a, b);
 }
 
+struct ggml_tensor * ggml_cast(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum   ggml_type      type) {
+    bool is_node = false;
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
+    ggml_format_name(result, "%s (copy)", a->name);
+
+    result->op   = GGML_OP_CPY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = result;
+
+    return result;
+}
+
 // ggml_cont
 
 static struct ggml_tensor * ggml_cont_impl(
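ggml_cast is essentially ggml_cpy into a freshly allocated destination of the requested type (note that it reuses GGML_OP_CPY and records the result tensor as its own src[1]). A minimal usage sketch:

// Sketch: cast an f32 tensor to f16 within a graph.
struct ggml_tensor * a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32);
struct ggml_tensor * a16 = ggml_cast(ctx, a, GGML_TYPE_F16); // same shape, f16 storage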
@@ -11617,7 +11638,22 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
     return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
 }
 
-void ggml_rope_yarn_corr_dims(
+static void ggml_rope_cache_init(
+     float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale
+) {
+    float theta = theta_base;
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        rope_yarn(
+            theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta *= theta_scale;
+    }
+}
+
+GGML_CALL void ggml_rope_yarn_corr_dims(
     int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
 ) {
     // start and end correction dims
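The new helper precomputes one interleaved cos/sin table per worker thread so the inner rotation loops can index it instead of re-deriving the angles per element; roughly, given the code above:

// cache layout produced by ggml_rope_cache_init (ne0 floats, one pair per rotation):
//   cache[i0 + 0] = cos(theta) scaled by rope_yarn's mscale
//   cache[i0 + 1] = sin(theta) scaled by mscale, then multiplied by sin_sign
// with theta starting at theta_base and multiplied by theta_scale every two slots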
@@ -11699,6 +11735,12 @@ static void ggml_compute_forward_rope_f32(
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
@@ -11732,18 +11774,13 @@ static void ggml_compute_forward_rope_f32(
                 }
             } else if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                    float cos_theta, sin_theta;
-                    rope_yarn(
-                        theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                    );
-                    sin_theta *= sin_sign;
+                    const float cos_theta = cache[i0 + 0];
+                    const float sin_theta = cache[i0 + 1];
 
                     // zeta scaling for xPos only:
                     float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
                     if (xpos_down) zeta = 1.0f / zeta;
 
-                    theta_base *= theta_scale;
-
                     const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                     float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -11867,6 +11904,12 @@ static void ggml_compute_forward_rope_f16(
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
@@ -11900,13 +11943,8 @@ static void ggml_compute_forward_rope_f16(
                 }
             } else if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                    float cos_theta, sin_theta;
-                    rope_yarn(
-                        theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                    );
-                    sin_theta *= sin_sign;
-
-                    theta_base *= theta_scale;
+                    const float cos_theta = cache[i0 + 0];
+                    const float sin_theta = cache[i0 + 1];
 
                     const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                     ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -14871,7 +14909,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
     return i;
 }
 
-static struct ggml_hash_set ggml_hash_set_new(size_t size) {
+struct ggml_hash_set ggml_hash_set_new(size_t size) {
     size = ggml_hash_size(size);
     struct ggml_hash_set result;
     result.size = size;
@@ -16620,7 +16658,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     return GGML_EXIT_SUCCESS;
 }
 
-struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
+struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
     if (n_threads <= 0) {
         n_threads = GGML_DEFAULT_N_THREADS;
     }
@@ -16682,14 +16720,15 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_MUL_MAT_ID:
                 {
+                    cur = 0;
                     const struct ggml_tensor * src0 = node->src[2];
                     const struct ggml_tensor * src1 = node->src[1];
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                     }
                     const int n_as = ggml_get_op_params_i32(node, 1);
-                    cur = GGML_PAD(cur, sizeof(int64_t)); // align
+                    cur += GGML_PAD(cur, sizeof(int64_t)); // align
                     cur += n_as * sizeof(int64_t); // matrix_row_counts
                     cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
                 } break;
@@ -16700,6 +16739,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
+            case GGML_OP_ROPE:
                 {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;
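With the const qualifier on the graph and the new ROPE work-size entry, the planning sequence itself is unchanged; a minimal sketch of the usual call order (malloc used for brevity):

// Sketch: plan first, allocate plan.work_data if needed, then compute.
struct ggml_cplan plan = ggml_graph_plan(graph, /*n_threads =*/ 4);
if (plan.work_size > 0) {
    plan.work_data = malloc(plan.work_size); // caller-owned, per the header comment
}
ggml_graph_compute(graph, &plan);
free(plan.work_data);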
@@ -18625,32 +18665,47 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
+        int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
+    (void)imatrix;
     size_t result = 0;
+    int n = nrows * n_per_row;
     switch (type) {
         case GGML_TYPE_Q4_0:
             {
                 GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_0:
            {
                 GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_1:
             {
                 GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q8_0:
             {
@@ -18661,44 +18716,67 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
         case GGML_TYPE_Q2_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q2_K * block = (block_q2_K*)dst + start / QK_K;
-                result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q3_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q3_K * block = (block_q3_K*)dst + start / QK_K;
-                result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q4_K * block = (block_q4_K*)dst + start / QK_K;
-                result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q5_K * block = (block_q5_K*)dst + start / QK_K;
-                result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q6_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
-                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_IQ2_XXS:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                GGML_ASSERT(imatrix);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_IQ2_XS:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                GGML_ASSERT(imatrix);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_F16:
             {
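Under the new signature a whole tensor is quantized by passing its row geometry explicitly; start must be a multiple of n_per_row, as the asserts above enforce. A minimal sketch (data and buf are hypothetical caller-allocated buffers, t a 2D f32 tensor):

// Sketch: quantize an entire tensor t in a single chunk.
int64_t hist[16]    = {0};
const int nrows     = (int) ggml_nrows(t);
const int n_per_row = (int) t->ne[0];
const size_t size   = ggml_quantize_chunk(GGML_TYPE_Q4_K, data, buf,
                                          /*start =*/ 0, nrows, n_per_row, hist, /*imatrix =*/ NULL);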
@@ -19162,7 +19240,7 @@ void gguf_free(struct gguf_context * ctx) {
 
     if (ctx->kv) {
         // free string memory - not great..
-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             if (kv->key.data) {
@@ -19178,7 +19256,7 @@ void gguf_free(struct gguf_context * ctx) {
             if (kv->type == GGUF_TYPE_ARRAY) {
                 if (kv->value.arr.data) {
                     if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                             struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
                             if (str->data) {
                                 free(str->data);
@@ -19194,7 +19272,7 @@ void gguf_free(struct gguf_context * ctx) {
     }
 
     if (ctx->infos) {
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
 
             if (info->name.data) {
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -187,6 +187,16 @@
 #    define GGML_API
 #endif
 
+#ifdef GGML_MULTIPLATFORM
+#    if defined(_WIN32)
+#        define GGML_CALL
+#    else
+#        define GGML_CALL __attribute__((__ms_abi__))
+#    endif
+#else
+#    define GGML_CALL
+#endif
+
 // TODO: support for clang
 #ifdef __GNUC__
 #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
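For context: GGML_CALL pins a single calling convention (__ms_abi__ on non-Windows targets when GGML_MULTIPLATFORM is set) so that functions such as ggml_nbytes can be invoked safely across objects built by different compilers, e.g. host code and nvcc-compiled CUDA code sharing function pointers. A hedged sketch of how user code would adopt the same convention (my_nbytes is a hypothetical wrapper):

// Sketch: declare interface functions with GGML_CALL on both sides of the boundary
// so caller and callee agree on the ABI regardless of which compiler built them.
GGML_CALL size_t my_nbytes(const struct ggml_tensor * t) {
    return ggml_nbytes(t); // ggml_nbytes itself is declared GGML_CALL
}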
@@ -649,41 +659,41 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+    GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API           size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
 
-    GGML_API int     ggml_blck_size(enum ggml_type type);
-    GGML_API size_t  ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API size_t  ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
+    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
 
     GGML_DEPRECATED(
     GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
     "use ggml_row_size() instead");
 
-    GGML_API const char * ggml_type_name(enum ggml_type type);
-    GGML_API const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
+    GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
+    GGML_API GGML_CALL const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API           const char * ggml_op_symbol(enum ggml_op   op);
 
-    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+    GGML_API           const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
 
-    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_element_size(const struct ggml_tensor * tensor);
 
-    GGML_API bool    ggml_is_quantized(enum ggml_type type);
+    GGML_API GGML_CALL bool    ggml_is_quantized(enum ggml_type type);
 
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
-    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
-    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
+    GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
     GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -770,7 +780,7 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
 
     GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
@@ -1165,6 +1175,11 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    GGML_API struct ggml_tensor * ggml_cast(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum   ggml_type      type);
+
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,
@@ -1408,7 +1423,7 @@ extern "C" {
             float                 beta_slow);
 
     // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
+    GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // xPos RoPE, in-place, returns view(a)
@@ -1842,8 +1857,8 @@ extern "C" {
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
@@ -2062,10 +2077,13 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
 
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
+            int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
+    // These are needed for IQ2_XS and IQ2_XXS quantizations
+    GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
+    GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
 
     //
     // Importance matrix