llama_cpp 0.12.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-quants.h
CHANGED
@@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
 void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
-void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
-void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
 
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
 
 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -246,3 +242,18 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
+//
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+//
+size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
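The removed `quantize_row_iq2_*` entry points are superseded by the row-oriented `quantize_*` family above, which takes `nrows`/`n_per_row` instead of a flat element count and accepts an optional importance matrix. A minimal usage sketch; the buffer names and sizes here are illustrative, not from the gem:

```c
#include <stdint.h>
#include <stdlib.h>

// Quantize a small f32 matrix to Q4_K with the new row-based API.
// The imatrix argument may be NULL for the k-quants; the IQ2 types
// require a real importance matrix.
void quantize_example(const float * src /* nrows * n_per_row floats */) {
    const int nrows = 2, n_per_row = 4096;
    int64_t hist[16] = {0};                          // histogram bins filled by the quantizer
    void * dst = malloc((size_t) nrows * n_per_row); // generous upper bound: < 1 byte/weight here
    size_t written = quantize_q4_K(src, dst, nrows, n_per_row, hist, NULL);
    (void) written;                                  // quantized row size in bytes * nrows
    free(dst);
}
```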
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xxs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
-        .from_float               = quantize_row_iq2_xxs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
         .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
@@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
-        .from_float               = quantize_row_iq2_xs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xs_reference,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
         .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
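Clearing `from_float` means ggml's generic row-conversion path can no longer produce IQ2 tensors directly; quantization to these types has to go through the new imatrix-aware `quantize_iq2_*` functions. An illustrative guard (our helper, not code from the gem), using the public trait accessor:

```c
#include <stddef.h>
#include "ggml.h"

// Returns non-zero when a type still supports direct row quantization.
static int can_quantize_directly(enum ggml_type type) {
    ggml_type_traits_t tt = ggml_internal_get_type_traits(type);
    return tt.from_float != NULL;  // false for GGML_TYPE_IQ2_XXS / IQ2_XS after this change
}
```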
@@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
     GGML_PRINT("%s: --- end ---\n", __func__);
 }
 
-int64_t ggml_nelements(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-int64_t ggml_nrows(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     size_t nbytes;
     size_t blck_size = ggml_blck_size(tensor->type);
     if (blck_size == 1) {
@@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
     return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
-int ggml_blck_size(enum ggml_type type) {
+GGML_CALL int ggml_blck_size(enum ggml_type type) {
     return type_traits[type].blck_size;
 }
 
-size_t ggml_type_size(enum ggml_type type) {
+GGML_CALL size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }
 
-size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
     assert(ne % ggml_blck_size(type) == 0);
     return ggml_type_size(type)*ne/ggml_blck_size(type);
 }
@@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
-const char * ggml_type_name(enum ggml_type type) {
+GGML_CALL const char * ggml_type_name(enum ggml_type type) {
     return type_traits[type].type_name;
 }
 
-bool ggml_is_quantized(enum ggml_type type) {
+GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
     return type_traits[type].is_quantized;
 }
 
-const char * ggml_op_name(enum ggml_op op) {
+GGML_CALL const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }
 
@@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
     return GGML_UNARY_OP_NAME[op];
 }
 
-const char * ggml_op_desc(const struct ggml_tensor * t) {
+GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
     if (t->op == GGML_OP_UNARY) {
         enum ggml_unary_op uop = ggml_get_unary_op(t);
         return ggml_unary_op_name(uop);
@@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
     }
 }
 
-size_t ggml_element_size(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
 
@@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
     return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
 }
 
-bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_is_permuted(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
@@ -2354,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 }
 
 void ggml_free(struct ggml_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
     // make this function thread safe
     ggml_critical_section_start();
 
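`ggml_free` now treats a NULL context as an explicit no-op, mirroring `free(NULL)` semantics, so teardown paths no longer need their own guard:

```c
// Safe even when initialization never happened or failed part-way.
struct ggml_context * ctx = NULL;
ggml_free(ctx); // returns immediately
```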
@@ -3075,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }
 
-enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->op == GGML_OP_UNARY);
     return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
 }
@@ -4362,6 +4366,23 @@ struct ggml_tensor * ggml_cpy(
     return ggml_cpy_impl(ctx, a, b);
 }
 
+struct ggml_tensor * ggml_cast(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum   ggml_type      type) {
+    bool is_node = false;
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
+    ggml_format_name(result, "%s (copy)", a->name);
+
+    result->op   = GGML_OP_CPY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = result;
+
+    return result;
+}
+
 // ggml_cont
 
 static struct ggml_tensor * ggml_cont_impl(
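`ggml_cast` is a convenience wrapper that builds a `GGML_OP_CPY` node into a freshly allocated tensor of the target type, so callers no longer have to pre-create the destination themselves. A hedged usage sketch; the context size is arbitrary:

```c
#include "ggml.h"

void cast_example(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * a16 = ggml_cast(ctx, a, GGML_TYPE_F16); // lazy copy node

    // a16 holds the converted data once a graph containing it is computed.
    ggml_free(ctx);
}
```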
@@ -11617,7 +11638,22 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl
     return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
 }
 
-void ggml_rope_yarn_corr_dims(
+static void ggml_rope_cache_init(
+     float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale
+) {
+    float theta = theta_base;
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        rope_yarn(
+            theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta *= theta_scale;
+    }
+}
+
+GGML_CALL void ggml_rope_yarn_corr_dims(
     int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
 ) {
     // start and end correction dims
@@ -11699,6 +11735,12 @@ static void ggml_compute_forward_rope_f32(
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
@@ -11732,18 +11774,13 @@ static void ggml_compute_forward_rope_f32(
                 }
             } else if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                    float cos_theta, sin_theta;
-                    rope_yarn(
-                        theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                    );
-                    sin_theta *= sin_sign;
+                    const float cos_theta = cache[i0 + 0];
+                    const float sin_theta = cache[i0 + 1];
 
                     // zeta scaling for xPos only:
                     float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
                     if (xpos_down) zeta = 1.0f / zeta;
 
-                    theta_base *= theta_scale;
-
                     const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                     float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -11867,6 +11904,12 @@ static void ggml_compute_forward_rope_f16(
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
@@ -11900,13 +11943,8 @@ static void ggml_compute_forward_rope_f16(
                 }
             } else if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                    float cos_theta, sin_theta;
-                    rope_yarn(
-                        theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                    );
-                    sin_theta *= sin_sign;
-
-                    theta_base *= theta_scale;
+                    const float cos_theta = cache[i0 + 0];
+                    const float sin_theta = cache[i0 + 1];
 
                     const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                     ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
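Both the f32 and f16 RoPE kernels now precompute the cos/sin pairs once per (batch, position) into a per-thread scratch region instead of recomputing them for every row; the matching scratch is reserved in `ggml_graph_plan` (see the new `GGML_OP_ROPE` case below). A simplified sketch of the idea, leaving out the YaRN correction that `rope_yarn` applies in the real code:

```c
#include <math.h>
#include <stdint.h>

// Simplified stand-in for ggml_rope_cache_init: fill cache[0..ne0) with
// interleaved cos/sin values so the inner row loops can just index into it.
static void rope_cache_fill(float theta_base, float theta_scale,
                            int64_t ne0, float * cache) {
    float theta = theta_base;
    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
        cache[i0 + 0] = cosf(theta);
        cache[i0 + 1] = sinf(theta);
        theta *= theta_scale; // in ggml: theta_scale = powf(freq_base, -2.0f/n_dims)
    }
}
```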
@@ -14871,7 +14909,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso
     return i;
 }
 
-static struct ggml_hash_set ggml_hash_set_new(size_t size) {
+struct ggml_hash_set ggml_hash_set_new(size_t size) {
     size = ggml_hash_size(size);
     struct ggml_hash_set result;
     result.size = size;
@@ -16620,7 +16658,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     return GGML_EXIT_SUCCESS;
 }
 
-struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
+struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
     if (n_threads <= 0) {
         n_threads = GGML_DEFAULT_N_THREADS;
     }
@@ -16682,14 +16720,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_MUL_MAT_ID:
                 {
+                    cur = 0;
                     const struct ggml_tensor * src0 = node->src[2];
                     const struct ggml_tensor * src1 = node->src[1];
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                     }
                     const int n_as = ggml_get_op_params_i32(node, 1);
-                    cur = GGML_PAD(cur, sizeof(int64_t)); // align
+                    cur += GGML_PAD(cur, sizeof(int64_t)); // align
                     cur += n_as * sizeof(int64_t); // matrix_row_counts
                     cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
                 } break;
@@ -16700,6 +16739,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
+            case GGML_OP_ROPE:
                 {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;
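`ggml_graph_plan` now takes the graph as `const` and reserves per-thread scratch for `GGML_OP_ROPE` alongside `GGML_OP_SOFT_MAX`. The calling pattern itself is unchanged; a hedged sketch following the header's own comment that the caller owns `work_data`:

```c
#include <stdlib.h>
#include "ggml.h"

// Plan, allocate the work buffer the plan asks for, then compute.
void run_graph(struct ggml_cgraph * graph) {
    struct ggml_cplan plan = ggml_graph_plan(graph, /*n_threads=*/4);
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size); // caller-owned, per the header comment
    }
    ggml_graph_compute(graph, &plan);
    free(plan.work_data);
}
```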
@@ -18625,32 +18665,47 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
+        int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
+    (void)imatrix;
     size_t result = 0;
+    int n = nrows * n_per_row;
     switch (type) {
         case GGML_TYPE_Q4_0:
             {
                 GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_0:
             {
                 GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_1:
             {
                 GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q8_0:
             {
@@ -18661,44 +18716,67 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
         case GGML_TYPE_Q2_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q2_K * block = (block_q2_K*)dst + start / QK_K;
-                result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q3_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q3_K * block = (block_q3_K*)dst + start / QK_K;
-                result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q4_K * block = (block_q4_K*)dst + start / QK_K;
-                result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q5_K * block = (block_q5_K*)dst + start / QK_K;
-                result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q6_K:
            {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
-                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_IQ2_XXS:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                GGML_ASSERT(imatrix);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_IQ2_XS:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                GGML_ASSERT(imatrix);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_F16:
             {
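`ggml_quantize_chunk` keeps its chunked interface, but `start` is now an element offset that must fall on a row boundary, and the destination is addressed in whole quantized rows. A hedged sketch of quantizing an entire tensor in one call; the names here are ours:

```c
#include <stdint.h>
#include "ggml.h"

size_t quantize_all_rows(const float * src, void * dst,
                         int nrows, int n_per_row, int64_t * hist) {
    // start = 0 quantizes from the first row; a nonzero start must be a
    // multiple of n_per_row so it maps to a whole starting row.
    size_t written = ggml_quantize_chunk(GGML_TYPE_Q4_K, src, dst,
                                         /*start=*/0, nrows, n_per_row,
                                         hist, /*imatrix=*/NULL);
    // ggml asserts written == ggml_row_size(GGML_TYPE_Q4_K, n_per_row) * nrows
    return written;
}
```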
@@ -19162,7 +19240,7 @@ void gguf_free(struct gguf_context * ctx) {
 
     if (ctx->kv) {
         // free string memory - not great..
-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             if (kv->key.data) {
@@ -19178,7 +19256,7 @@ void gguf_free(struct gguf_context * ctx) {
             if (kv->type == GGUF_TYPE_ARRAY) {
                 if (kv->value.arr.data) {
                     if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                             struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
                             if (str->data) {
                                 free(str->data);
@@ -19194,7 +19272,7 @@ void gguf_free(struct gguf_context * ctx) {
     }
 
     if (ctx->infos) {
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
 
             if (info->name.data) {
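The widened loop indices in `gguf_free` matter because GGUF v2 stores `n_kv`, `n_tensors`, and array lengths as 64-bit values; a 32-bit counter can never exceed `UINT32_MAX`, so a corrupt or hostile file declaring a larger count would make the old loops spin forever:

```c
#include <stdint.h>

// Illustration of the hazard (not code from the gem): i is promoted to
// 64 bits for the comparison but wraps back to 0 after UINT32_MAX, so
// the condition below is always true.
void spins_forever(void) {
    uint64_t n_kv = (uint64_t) UINT32_MAX + 2;
    for (uint32_t i = 0; i < n_kv; ++i) { /* never terminates */ }
}
```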
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -187,6 +187,16 @@
 #    define GGML_API
 #endif
 
+#ifdef GGML_MULTIPLATFORM
+#    if defined(_WIN32)
+#        define GGML_CALL
+#    else
+#        define GGML_CALL __attribute__((__ms_abi__))
+#    endif
+#else
+#    define GGML_CALL
+#endif
+
 // TODO: support for clang
 #ifdef __GNUC__
 #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
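`GGML_CALL` pins the marked functions to a single calling convention: with `GGML_MULTIPLATFORM` defined, non-Windows builds force `__ms_abi__` (x86-64 specific) so the ABI matches Windows builds. The point, presumably, is that function pointers to these entry points can be shared between objects produced by different toolchains, such as a GPU backend compiled separately from the host code. A small sketch of tagging one's own helper the same way (hypothetical function, not from the gem):

```c
// A helper that can sit in the same function-pointer tables as ggml's own
// GGML_CALL entry points, whichever compiler built each side.
GGML_CALL static size_t tensor_bytes(const struct ggml_tensor * t) {
    return ggml_nbytes(t); // ggml_nbytes is itself GGML_CALL as of this release
}
```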
@@ -649,41 +659,41 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+    GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API           size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
 
-    GGML_API int     ggml_blck_size(enum ggml_type type);
-    GGML_API size_t  ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API size_t  ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
+    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
 
     GGML_DEPRECATED(
     GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
     "use ggml_row_size() instead");
 
-    GGML_API const char * ggml_type_name(enum ggml_type type);
-    GGML_API const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
+    GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
+    GGML_API GGML_CALL const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API           const char * ggml_op_symbol(enum ggml_op   op);
 
-    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+    GGML_API           const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
 
-    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_element_size(const struct ggml_tensor * tensor);
 
-    GGML_API bool    ggml_is_quantized(enum ggml_type type);
+    GGML_API GGML_CALL bool    ggml_is_quantized(enum ggml_type type);
 
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
-    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
-    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
+    GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
     GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -770,7 +780,7 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
 
     GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
@@ -1165,6 +1175,11 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    GGML_API struct ggml_tensor * ggml_cast(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum   ggml_type      type);
+
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,
@@ -1408,7 +1423,7 @@ extern "C" {
             float                 beta_slow);
 
     // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
+    GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // xPos RoPE, in-place, returns view(a)
@@ -1842,8 +1857,8 @@ extern "C" {
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
@@ -2062,10 +2077,13 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
 
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
+            int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
+    // These are needed for IQ2_XS and IQ2_XXS quantizations
+    GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
+    GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
 
     //
     // Importance matrix