llama_cpp 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +78 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +11 -0
- data/vendor/tmp/llama.cpp/Makefile +7 -10
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +512 -261
- data/vendor/tmp/llama.cpp/ggml-backend.h +43 -33
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1494 -559
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1868 -2002
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +2182 -44
- data/vendor/tmp/llama.cpp/ggml-quants.h +36 -1
- data/vendor/tmp/llama.cpp/ggml.c +222 -105
- data/vendor/tmp/llama.cpp/ggml.h +56 -35
- data/vendor/tmp/llama.cpp/llama.cpp +1271 -1618
- data/vendor/tmp/llama.cpp/llama.h +44 -8
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
|
@@ -132,7 +132,7 @@ void ggml_print_backtrace(void) {
|
|
|
132
132
|
"-ex", "bt -frame-info source-and-location",
|
|
133
133
|
"-ex", "detach",
|
|
134
134
|
"-ex", "quit",
|
|
135
|
-
NULL);
|
|
135
|
+
(char *) NULL);
|
|
136
136
|
} else {
|
|
137
137
|
waitpid(pid, NULL, 0);
|
|
138
138
|
}
|
|
@@ -394,6 +394,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
|
|
394
394
|
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
|
395
395
|
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
|
396
396
|
|
|
397
|
+
ggml_collect_imatrix_t g_imatrix_collect = NULL;
|
|
398
|
+
|
|
399
|
+
void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
|
|
400
|
+
g_imatrix_collect = imatrix_collect;
|
|
401
|
+
}
|
|
402
|
+
|
|
397
403
|
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
398
404
|
[GGML_TYPE_I8] = {
|
|
399
405
|
.type_name = "i8",
|
|
@@ -573,6 +579,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
|
573
579
|
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
|
574
580
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
|
575
581
|
},
|
|
582
|
+
[GGML_TYPE_IQ2_XXS] = {
|
|
583
|
+
.type_name = "iq2_xxs",
|
|
584
|
+
.blck_size = QK_K,
|
|
585
|
+
.type_size = sizeof(block_iq2_xxs),
|
|
586
|
+
.is_quantized = true,
|
|
587
|
+
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
|
588
|
+
.from_float = NULL,
|
|
589
|
+
.from_float_reference = NULL,
|
|
590
|
+
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
|
591
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
|
592
|
+
},
|
|
593
|
+
[GGML_TYPE_IQ2_XS] = {
|
|
594
|
+
.type_name = "iq2_xs",
|
|
595
|
+
.blck_size = QK_K,
|
|
596
|
+
.type_size = sizeof(block_iq2_xs),
|
|
597
|
+
.is_quantized = true,
|
|
598
|
+
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
|
599
|
+
.from_float = NULL,
|
|
600
|
+
.from_float_reference = NULL,
|
|
601
|
+
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
|
602
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
|
603
|
+
},
|
|
576
604
|
[GGML_TYPE_Q8_K] = {
|
|
577
605
|
.type_name = "q8_K",
|
|
578
606
|
.blck_size = QK_K,
|
|
@@ -1962,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
|
|
|
1962
1990
|
GGML_PRINT("%s: --- end ---\n", __func__);
|
|
1963
1991
|
}
|
|
1964
1992
|
|
|
1965
|
-
int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
|
1993
|
+
GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
|
1966
1994
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
1967
1995
|
|
|
1968
1996
|
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
|
1969
1997
|
}
|
|
1970
1998
|
|
|
1971
|
-
int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
|
1999
|
+
GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
|
1972
2000
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
1973
2001
|
|
|
1974
2002
|
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
|
1975
2003
|
}
|
|
1976
2004
|
|
|
1977
|
-
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
|
2005
|
+
GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
|
1978
2006
|
size_t nbytes;
|
|
1979
2007
|
size_t blck_size = ggml_blck_size(tensor->type);
|
|
1980
2008
|
if (blck_size == 1) {
|
|
@@ -1997,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
|
|
1997
2025
|
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
|
1998
2026
|
}
|
|
1999
2027
|
|
|
2000
|
-
int ggml_blck_size(enum ggml_type type) {
|
|
2028
|
+
GGML_CALL int ggml_blck_size(enum ggml_type type) {
|
|
2001
2029
|
return type_traits[type].blck_size;
|
|
2002
2030
|
}
|
|
2003
2031
|
|
|
2004
|
-
size_t ggml_type_size(enum ggml_type type) {
|
|
2032
|
+
GGML_CALL size_t ggml_type_size(enum ggml_type type) {
|
|
2005
2033
|
return type_traits[type].type_size;
|
|
2006
2034
|
}
|
|
2007
2035
|
|
|
2008
|
-
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
|
2036
|
+
GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
|
2009
2037
|
assert(ne % ggml_blck_size(type) == 0);
|
|
2010
2038
|
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
|
2011
2039
|
}
|
|
@@ -2014,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
|
|
|
2014
2042
|
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
|
2015
2043
|
}
|
|
2016
2044
|
|
|
2017
|
-
const char * ggml_type_name(enum ggml_type type) {
|
|
2045
|
+
GGML_CALL const char * ggml_type_name(enum ggml_type type) {
|
|
2018
2046
|
return type_traits[type].type_name;
|
|
2019
2047
|
}
|
|
2020
2048
|
|
|
2021
|
-
bool ggml_is_quantized(enum ggml_type type) {
|
|
2049
|
+
GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
|
|
2022
2050
|
return type_traits[type].is_quantized;
|
|
2023
2051
|
}
|
|
2024
2052
|
|
|
2025
|
-
const char * ggml_op_name(enum ggml_op op) {
|
|
2053
|
+
GGML_CALL const char * ggml_op_name(enum ggml_op op) {
|
|
2026
2054
|
return GGML_OP_NAME[op];
|
|
2027
2055
|
}
|
|
2028
2056
|
|
|
@@ -2034,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
|
|
|
2034
2062
|
return GGML_UNARY_OP_NAME[op];
|
|
2035
2063
|
}
|
|
2036
2064
|
|
|
2037
|
-
const char * ggml_op_desc(const struct ggml_tensor * t) {
|
|
2065
|
+
GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
|
|
2038
2066
|
if (t->op == GGML_OP_UNARY) {
|
|
2039
2067
|
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
|
2040
2068
|
return ggml_unary_op_name(uop);
|
|
@@ -2044,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
|
|
|
2044
2072
|
}
|
|
2045
2073
|
}
|
|
2046
2074
|
|
|
2047
|
-
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
|
2075
|
+
GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
|
2048
2076
|
return ggml_type_size(tensor->type);
|
|
2049
2077
|
}
|
|
2050
2078
|
|
|
@@ -2111,6 +2139,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
|
2111
2139
|
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
|
|
2112
2140
|
case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
|
|
2113
2141
|
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
|
|
2142
|
+
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
|
|
2143
|
+
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
|
|
2114
2144
|
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
|
2115
2145
|
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
|
2116
2146
|
}
|
|
@@ -2124,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
|
|
|
2124
2154
|
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
|
|
2125
2155
|
}
|
|
2126
2156
|
|
|
2127
|
-
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
|
2157
|
+
GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
|
2128
2158
|
return tensor->nb[0] > tensor->nb[1];
|
|
2129
2159
|
}
|
|
2130
2160
|
|
|
2131
|
-
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
|
2161
|
+
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
|
2132
2162
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
2133
2163
|
|
|
2134
2164
|
return
|
|
@@ -2147,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
|
|
|
2147
2177
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
|
2148
2178
|
}
|
|
2149
2179
|
|
|
2150
|
-
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
|
2180
|
+
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
|
2151
2181
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
2152
2182
|
|
|
2153
2183
|
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
|
|
@@ -2324,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
|
2324
2354
|
}
|
|
2325
2355
|
|
|
2326
2356
|
void ggml_free(struct ggml_context * ctx) {
|
|
2357
|
+
if (ctx == NULL) {
|
|
2358
|
+
return;
|
|
2359
|
+
}
|
|
2360
|
+
|
|
2327
2361
|
// make this function thread safe
|
|
2328
2362
|
ggml_critical_section_start();
|
|
2329
2363
|
|
|
@@ -3045,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
|
|
|
3045
3079
|
return (float *)(tensor->data);
|
|
3046
3080
|
}
|
|
3047
3081
|
|
|
3048
|
-
enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
|
3082
|
+
GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
|
3049
3083
|
GGML_ASSERT(tensor->op == GGML_OP_UNARY);
|
|
3050
3084
|
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
|
|
3051
3085
|
}
|
|
@@ -4299,13 +4333,13 @@ struct ggml_tensor * ggml_set_2d_inplace(
|
|
|
4299
4333
|
static struct ggml_tensor * ggml_cpy_impl(
|
|
4300
4334
|
struct ggml_context * ctx,
|
|
4301
4335
|
struct ggml_tensor * a,
|
|
4302
|
-
struct ggml_tensor * b,
|
|
4303
|
-
bool inplace) {
|
|
4336
|
+
struct ggml_tensor * b) {
|
|
4304
4337
|
GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
|
|
4305
4338
|
|
|
4306
4339
|
bool is_node = false;
|
|
4307
4340
|
|
|
4308
|
-
if (!inplace && (a->grad || b->grad)) {
|
|
4341
|
+
if (a->grad || b->grad) {
|
|
4342
|
+
// inplace is false and either one have a grad
|
|
4309
4343
|
is_node = true;
|
|
4310
4344
|
}
|
|
4311
4345
|
|
|
@@ -4329,29 +4363,38 @@ struct ggml_tensor * ggml_cpy(
|
|
|
4329
4363
|
struct ggml_context * ctx,
|
|
4330
4364
|
struct ggml_tensor * a,
|
|
4331
4365
|
struct ggml_tensor * b) {
|
|
4332
|
-
return ggml_cpy_impl(ctx, a, b, false);
|
|
4366
|
+
return ggml_cpy_impl(ctx, a, b);
|
|
4333
4367
|
}
|
|
4334
4368
|
|
|
4335
|
-
struct ggml_tensor * ggml_cpy_inplace(
|
|
4369
|
+
struct ggml_tensor * ggml_cast(
|
|
4336
4370
|
struct ggml_context * ctx,
|
|
4337
|
-
struct ggml_tensor * a,
|
|
4338
|
-
|
|
4339
|
-
|
|
4371
|
+
struct ggml_tensor * a,
|
|
4372
|
+
enum ggml_type type) {
|
|
4373
|
+
bool is_node = false;
|
|
4374
|
+
|
|
4375
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
|
|
4376
|
+
ggml_format_name(result, "%s (copy)", a->name);
|
|
4377
|
+
|
|
4378
|
+
result->op = GGML_OP_CPY;
|
|
4379
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
4380
|
+
result->src[0] = a;
|
|
4381
|
+
result->src[1] = result;
|
|
4382
|
+
|
|
4383
|
+
return result;
|
|
4340
4384
|
}
|
|
4341
4385
|
|
|
4342
4386
|
// ggml_cont
|
|
4343
4387
|
|
|
4344
4388
|
static struct ggml_tensor * ggml_cont_impl(
|
|
4345
4389
|
struct ggml_context * ctx,
|
|
4346
|
-
struct ggml_tensor * a,
|
|
4347
|
-
bool inplace) {
|
|
4390
|
+
struct ggml_tensor * a) {
|
|
4348
4391
|
bool is_node = false;
|
|
4349
4392
|
|
|
4350
|
-
if (!inplace && a->grad) {
|
|
4393
|
+
if (a->grad) {
|
|
4351
4394
|
is_node = true;
|
|
4352
4395
|
}
|
|
4353
4396
|
|
|
4354
|
-
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
|
4397
|
+
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
|
4355
4398
|
ggml_format_name(result, "%s (cont)", a->name);
|
|
4356
4399
|
|
|
4357
4400
|
result->op = GGML_OP_CONT;
|
|
@@ -4364,13 +4407,7 @@ static struct ggml_tensor * ggml_cont_impl(
|
|
|
4364
4407
|
struct ggml_tensor * ggml_cont(
|
|
4365
4408
|
struct ggml_context * ctx,
|
|
4366
4409
|
struct ggml_tensor * a) {
|
|
4367
|
-
return ggml_cont_impl(ctx, a, false);
|
|
4368
|
-
}
|
|
4369
|
-
|
|
4370
|
-
struct ggml_tensor * ggml_cont_inplace(
|
|
4371
|
-
struct ggml_context * ctx,
|
|
4372
|
-
struct ggml_tensor * a) {
|
|
4373
|
-
return ggml_cont_impl(ctx, a, true);
|
|
4410
|
+
return ggml_cont_impl(ctx, a);
|
|
4374
4411
|
}
|
|
4375
4412
|
|
|
4376
4413
|
// make contiguous, with new shape
|
|
@@ -7436,6 +7473,8 @@ static void ggml_compute_forward_add(
|
|
|
7436
7473
|
case GGML_TYPE_Q4_K:
|
|
7437
7474
|
case GGML_TYPE_Q5_K:
|
|
7438
7475
|
case GGML_TYPE_Q6_K:
|
|
7476
|
+
case GGML_TYPE_IQ2_XXS:
|
|
7477
|
+
case GGML_TYPE_IQ2_XS:
|
|
7439
7478
|
{
|
|
7440
7479
|
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
|
|
7441
7480
|
} break;
|
|
@@ -7700,6 +7739,8 @@ static void ggml_compute_forward_add1(
|
|
|
7700
7739
|
case GGML_TYPE_Q4_K:
|
|
7701
7740
|
case GGML_TYPE_Q5_K:
|
|
7702
7741
|
case GGML_TYPE_Q6_K:
|
|
7742
|
+
case GGML_TYPE_IQ2_XXS:
|
|
7743
|
+
case GGML_TYPE_IQ2_XS:
|
|
7703
7744
|
{
|
|
7704
7745
|
ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
|
|
7705
7746
|
} break;
|
|
@@ -7814,6 +7855,8 @@ static void ggml_compute_forward_acc(
|
|
|
7814
7855
|
case GGML_TYPE_Q4_K:
|
|
7815
7856
|
case GGML_TYPE_Q5_K:
|
|
7816
7857
|
case GGML_TYPE_Q6_K:
|
|
7858
|
+
case GGML_TYPE_IQ2_XXS:
|
|
7859
|
+
case GGML_TYPE_IQ2_XS:
|
|
7817
7860
|
default:
|
|
7818
7861
|
{
|
|
7819
7862
|
GGML_ASSERT(false);
|
|
@@ -9704,10 +9747,10 @@ static void ggml_compute_forward_group_norm(
|
|
|
9704
9747
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
9705
9748
|
// helper function to determine if it is better to use BLAS or not
|
|
9706
9749
|
// for large matrices, BLAS is faster
|
|
9707
|
-
static bool ggml_compute_forward_mul_mat_use_blas(
|
|
9708
|
-
|
|
9709
|
-
|
|
9710
|
-
|
|
9750
|
+
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
|
9751
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
9752
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
|
9753
|
+
|
|
9711
9754
|
//const int64_t ne00 = src0->ne[0];
|
|
9712
9755
|
//const int64_t ne01 = src0->ne[1];
|
|
9713
9756
|
|
|
@@ -9747,6 +9790,10 @@ static void ggml_compute_forward_mul_mat(
|
|
|
9747
9790
|
const int ith = params->ith;
|
|
9748
9791
|
const int nth = params->nth;
|
|
9749
9792
|
|
|
9793
|
+
if (ith == 1 && g_imatrix_collect) {
|
|
9794
|
+
g_imatrix_collect(src0, src1);
|
|
9795
|
+
}
|
|
9796
|
+
|
|
9750
9797
|
const enum ggml_type type = src0->type;
|
|
9751
9798
|
|
|
9752
9799
|
const bool src1_cont = ggml_is_contiguous(src1);
|
|
@@ -9787,7 +9834,7 @@ static void ggml_compute_forward_mul_mat(
|
|
|
9787
9834
|
#endif
|
|
9788
9835
|
|
|
9789
9836
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
9790
|
-
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
|
9837
|
+
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
|
9791
9838
|
if (params->ith != 0) {
|
|
9792
9839
|
return;
|
|
9793
9840
|
}
|
|
@@ -10050,6 +10097,10 @@ static void ggml_compute_forward_mul_mat_id(
|
|
|
10050
10097
|
|
|
10051
10098
|
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
|
|
10052
10099
|
|
|
10100
|
+
if (ith == 1 && g_imatrix_collect) {
|
|
10101
|
+
g_imatrix_collect(src0_cur, src1);
|
|
10102
|
+
}
|
|
10103
|
+
|
|
10053
10104
|
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
|
10054
10105
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
|
10055
10106
|
|
|
@@ -10455,6 +10506,8 @@ static void ggml_compute_forward_out_prod(
|
|
|
10455
10506
|
case GGML_TYPE_Q4_K:
|
|
10456
10507
|
case GGML_TYPE_Q5_K:
|
|
10457
10508
|
case GGML_TYPE_Q6_K:
|
|
10509
|
+
case GGML_TYPE_IQ2_XXS:
|
|
10510
|
+
case GGML_TYPE_IQ2_XS:
|
|
10458
10511
|
{
|
|
10459
10512
|
ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
|
|
10460
10513
|
} break;
|
|
@@ -10629,6 +10682,8 @@ static void ggml_compute_forward_set(
|
|
|
10629
10682
|
case GGML_TYPE_Q4_K:
|
|
10630
10683
|
case GGML_TYPE_Q5_K:
|
|
10631
10684
|
case GGML_TYPE_Q6_K:
|
|
10685
|
+
case GGML_TYPE_IQ2_XXS:
|
|
10686
|
+
case GGML_TYPE_IQ2_XS:
|
|
10632
10687
|
default:
|
|
10633
10688
|
{
|
|
10634
10689
|
GGML_ASSERT(false);
|
|
@@ -10823,6 +10878,8 @@ static void ggml_compute_forward_get_rows(
|
|
|
10823
10878
|
case GGML_TYPE_Q4_K:
|
|
10824
10879
|
case GGML_TYPE_Q5_K:
|
|
10825
10880
|
case GGML_TYPE_Q6_K:
|
|
10881
|
+
case GGML_TYPE_IQ2_XXS:
|
|
10882
|
+
case GGML_TYPE_IQ2_XS:
|
|
10826
10883
|
{
|
|
10827
10884
|
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
|
|
10828
10885
|
} break;
|
|
@@ -11459,6 +11516,8 @@ static void ggml_compute_forward_alibi(
|
|
|
11459
11516
|
case GGML_TYPE_Q4_K:
|
|
11460
11517
|
case GGML_TYPE_Q5_K:
|
|
11461
11518
|
case GGML_TYPE_Q6_K:
|
|
11519
|
+
case GGML_TYPE_IQ2_XXS:
|
|
11520
|
+
case GGML_TYPE_IQ2_XS:
|
|
11462
11521
|
case GGML_TYPE_Q8_K:
|
|
11463
11522
|
case GGML_TYPE_I8:
|
|
11464
11523
|
case GGML_TYPE_I16:
|
|
@@ -11533,6 +11592,8 @@ static void ggml_compute_forward_clamp(
|
|
|
11533
11592
|
case GGML_TYPE_Q4_K:
|
|
11534
11593
|
case GGML_TYPE_Q5_K:
|
|
11535
11594
|
case GGML_TYPE_Q6_K:
|
|
11595
|
+
case GGML_TYPE_IQ2_XXS:
|
|
11596
|
+
case GGML_TYPE_IQ2_XS:
|
|
11536
11597
|
case GGML_TYPE_Q8_K:
|
|
11537
11598
|
case GGML_TYPE_I8:
|
|
11538
11599
|
case GGML_TYPE_I16:
|
|
@@ -11577,7 +11638,22 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl
|
|
|
11577
11638
|
return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
|
11578
11639
|
}
|
|
11579
11640
|
|
|
11580
|
-
void ggml_rope_yarn_corr_dims(
|
|
11641
|
+
static void ggml_rope_cache_init(
|
|
11642
|
+
float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
|
11643
|
+
float * cache, float sin_sign, float theta_scale
|
|
11644
|
+
) {
|
|
11645
|
+
float theta = theta_base;
|
|
11646
|
+
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
11647
|
+
rope_yarn(
|
|
11648
|
+
theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
|
11649
|
+
);
|
|
11650
|
+
cache[i0 + 1] *= sin_sign;
|
|
11651
|
+
|
|
11652
|
+
theta *= theta_scale;
|
|
11653
|
+
}
|
|
11654
|
+
}
|
|
11655
|
+
|
|
11656
|
+
GGML_CALL void ggml_rope_yarn_corr_dims(
|
|
11581
11657
|
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
|
11582
11658
|
) {
|
|
11583
11659
|
// start and end correction dims
|
|
@@ -11659,6 +11735,12 @@ static void ggml_compute_forward_rope_f32(
|
|
|
11659
11735
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
|
11660
11736
|
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
|
11661
11737
|
const int64_t p = pos[i2];
|
|
11738
|
+
|
|
11739
|
+
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
|
11740
|
+
if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
|
|
11741
|
+
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
11742
|
+
}
|
|
11743
|
+
|
|
11662
11744
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
|
11663
11745
|
if (ir++ < ir0) continue;
|
|
11664
11746
|
if (ir > ir1) break;
|
|
@@ -11692,18 +11774,13 @@ static void ggml_compute_forward_rope_f32(
|
|
|
11692
11774
|
}
|
|
11693
11775
|
} else if (!is_neox) {
|
|
11694
11776
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
11695
|
-
float cos_theta, sin_theta;
|
|
11696
|
-
rope_yarn(
|
|
11697
|
-
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
|
11698
|
-
);
|
|
11699
|
-
sin_theta *= sin_sign;
|
|
11777
|
+
const float cos_theta = cache[i0 + 0];
|
|
11778
|
+
const float sin_theta = cache[i0 + 1];
|
|
11700
11779
|
|
|
11701
11780
|
// zeta scaling for xPos only:
|
|
11702
11781
|
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
|
11703
11782
|
if (xpos_down) zeta = 1.0f / zeta;
|
|
11704
11783
|
|
|
11705
|
-
theta_base *= theta_scale;
|
|
11706
|
-
|
|
11707
11784
|
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
11708
11785
|
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
11709
11786
|
|
|
@@ -11827,6 +11904,12 @@ static void ggml_compute_forward_rope_f16(
|
|
|
11827
11904
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
|
11828
11905
|
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
|
11829
11906
|
const int64_t p = pos[i2];
|
|
11907
|
+
|
|
11908
|
+
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
|
11909
|
+
if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
|
|
11910
|
+
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
11911
|
+
}
|
|
11912
|
+
|
|
11830
11913
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
|
11831
11914
|
if (ir++ < ir0) continue;
|
|
11832
11915
|
if (ir > ir1) break;
|
|
@@ -11860,13 +11943,8 @@ static void ggml_compute_forward_rope_f16(
|
|
|
11860
11943
|
}
|
|
11861
11944
|
} else if (!is_neox) {
|
|
11862
11945
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
11863
|
-
float cos_theta, sin_theta;
|
|
11864
|
-
rope_yarn(
|
|
11865
|
-
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
|
11866
|
-
);
|
|
11867
|
-
sin_theta *= sin_sign;
|
|
11868
|
-
|
|
11869
|
-
theta_base *= theta_scale;
|
|
11946
|
+
const float cos_theta = cache[i0 + 0];
|
|
11947
|
+
const float sin_theta = cache[i0 + 1];
|
|
11870
11948
|
|
|
11871
11949
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
11872
11950
|
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
@@ -14831,7 +14909,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso
|
|
|
14831
14909
|
return i;
|
|
14832
14910
|
}
|
|
14833
14911
|
|
|
14834
|
-
|
|
14912
|
+
struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
|
14835
14913
|
size = ggml_hash_size(size);
|
|
14836
14914
|
struct ggml_hash_set result;
|
|
14837
14915
|
result.size = size;
|
|
@@ -16301,24 +16379,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
|
16301
16379
|
|
|
16302
16380
|
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
|
|
16303
16381
|
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
|
|
16304
|
-
|
|
16305
|
-
#if defined(GGML_USE_CUBLAS)
|
|
16306
|
-
if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
|
|
16307
|
-
n_tasks = 1; // TODO: this actually is doing nothing
|
|
16308
|
-
// the threads are still spinning
|
|
16309
|
-
}
|
|
16310
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
16311
|
-
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
|
16312
|
-
n_tasks = 1; // TODO: this actually is doing nothing
|
|
16313
|
-
// the threads are still spinning
|
|
16314
|
-
}
|
|
16315
|
-
#endif
|
|
16316
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
16317
|
-
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
|
|
16318
|
-
n_tasks = 1; // TODO: this actually is doing nothing
|
|
16319
|
-
// the threads are still spinning
|
|
16320
|
-
}
|
|
16321
|
-
#endif
|
|
16322
16382
|
} break;
|
|
16323
16383
|
case GGML_OP_MUL_MAT_ID:
|
|
16324
16384
|
{
|
|
@@ -16491,6 +16551,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
|
16491
16551
|
state->shared->node_n += 1;
|
|
16492
16552
|
return (thread_ret_t) GGML_EXIT_ABORTED;
|
|
16493
16553
|
}
|
|
16554
|
+
|
|
16494
16555
|
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
|
16495
16556
|
// all other threads are finished and spinning
|
|
16496
16557
|
// do finalize and init here so we don't have synchronize again
|
|
@@ -16556,14 +16617,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
|
16556
16617
|
} else {
|
|
16557
16618
|
// wait for other threads to finish
|
|
16558
16619
|
const int last = node_n;
|
|
16620
|
+
|
|
16621
|
+
const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
|
|
16622
|
+
|
|
16559
16623
|
while (true) {
|
|
16560
16624
|
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
|
16561
16625
|
// depending on the workload and the operating system.
|
|
16562
16626
|
// since it is not clear what is the best approach, it should potentially become user-configurable
|
|
16563
16627
|
// ref: https://github.com/ggerganov/ggml/issues/291
|
|
16564
|
-
|
|
16565
|
-
|
|
16566
|
-
|
|
16628
|
+
// UPD: adding the do_yield flag seems to resolve the issue universally
|
|
16629
|
+
if (do_yield) {
|
|
16630
|
+
sched_yield();
|
|
16631
|
+
}
|
|
16567
16632
|
|
|
16568
16633
|
node_n = atomic_load(&state->shared->node_n);
|
|
16569
16634
|
if (node_n != last) break;
|
|
@@ -16593,7 +16658,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
|
16593
16658
|
return GGML_EXIT_SUCCESS;
|
|
16594
16659
|
}
|
|
16595
16660
|
|
|
16596
|
-
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
16661
|
+
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
|
|
16597
16662
|
if (n_threads <= 0) {
|
|
16598
16663
|
n_threads = GGML_DEFAULT_N_THREADS;
|
|
16599
16664
|
}
|
|
@@ -16642,7 +16707,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
|
16642
16707
|
} else
|
|
16643
16708
|
#endif
|
|
16644
16709
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
16645
|
-
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
|
|
16710
|
+
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
|
16646
16711
|
if (node->src[0]->type != GGML_TYPE_F32) {
|
|
16647
16712
|
// here we need memory just for single 2D matrix from src0
|
|
16648
16713
|
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
|
|
@@ -16655,14 +16720,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
|
16655
16720
|
} break;
|
|
16656
16721
|
case GGML_OP_MUL_MAT_ID:
|
|
16657
16722
|
{
|
|
16723
|
+
cur = 0;
|
|
16658
16724
|
const struct ggml_tensor * src0 = node->src[2];
|
|
16659
16725
|
const struct ggml_tensor * src1 = node->src[1];
|
|
16660
16726
|
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
|
|
16661
16727
|
if (src1->type != vec_dot_type) {
|
|
16662
|
-
cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
|
|
16728
|
+
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
|
|
16663
16729
|
}
|
|
16664
16730
|
const int n_as = ggml_get_op_params_i32(node, 1);
|
|
16665
|
-
cur = GGML_PAD(cur, sizeof(int64_t)); // align
|
|
16731
|
+
cur += GGML_PAD(cur, sizeof(int64_t)); // align
|
|
16666
16732
|
cur += n_as * sizeof(int64_t); // matrix_row_counts
|
|
16667
16733
|
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
|
|
16668
16734
|
} break;
|
|
@@ -16673,6 +16739,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
|
16673
16739
|
}
|
|
16674
16740
|
} break;
|
|
16675
16741
|
case GGML_OP_SOFT_MAX:
|
|
16742
|
+
case GGML_OP_ROPE:
|
|
16676
16743
|
{
|
|
16677
16744
|
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
|
16678
16745
|
} break;
|
|
@@ -18598,32 +18665,47 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
|
|
|
18598
18665
|
return (n/QK8_0*sizeof(block_q8_0));
|
|
18599
18666
|
}
|
|
18600
18667
|
|
|
18601
|
-
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
|
|
18668
|
+
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
|
18669
|
+
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
|
|
18670
|
+
(void)imatrix;
|
|
18602
18671
|
size_t result = 0;
|
|
18672
|
+
int n = nrows * n_per_row;
|
|
18603
18673
|
switch (type) {
|
|
18604
18674
|
case GGML_TYPE_Q4_0:
|
|
18605
18675
|
{
|
|
18606
18676
|
GGML_ASSERT(start % QK4_0 == 0);
|
|
18607
|
-
|
|
18608
|
-
|
|
18677
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18678
|
+
size_t start_row = start / n_per_row;
|
|
18679
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18680
|
+
result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18681
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18609
18682
|
} break;
|
|
18610
18683
|
case GGML_TYPE_Q4_1:
|
|
18611
18684
|
{
|
|
18612
18685
|
GGML_ASSERT(start % QK4_1 == 0);
|
|
18613
|
-
|
|
18614
|
-
|
|
18686
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18687
|
+
size_t start_row = start / n_per_row;
|
|
18688
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18689
|
+
result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18690
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18615
18691
|
} break;
|
|
18616
18692
|
case GGML_TYPE_Q5_0:
|
|
18617
18693
|
{
|
|
18618
18694
|
GGML_ASSERT(start % QK5_0 == 0);
|
|
18619
|
-
|
|
18620
|
-
|
|
18695
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18696
|
+
size_t start_row = start / n_per_row;
|
|
18697
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18698
|
+
result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18699
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18621
18700
|
} break;
|
|
18622
18701
|
case GGML_TYPE_Q5_1:
|
|
18623
18702
|
{
|
|
18624
18703
|
GGML_ASSERT(start % QK5_1 == 0);
|
|
18625
|
-
|
|
18626
|
-
|
|
18704
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18705
|
+
size_t start_row = start / n_per_row;
|
|
18706
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18707
|
+
result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18708
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18627
18709
|
} break;
|
|
18628
18710
|
case GGML_TYPE_Q8_0:
|
|
18629
18711
|
{
|
|
@@ -18634,32 +18716,67 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
|
18634
18716
|
case GGML_TYPE_Q2_K:
|
|
18635
18717
|
{
|
|
18636
18718
|
GGML_ASSERT(start % QK_K == 0);
|
|
18637
|
-
|
|
18638
|
-
|
|
18719
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18720
|
+
size_t start_row = start / n_per_row;
|
|
18721
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18722
|
+
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18723
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18639
18724
|
} break;
|
|
18640
18725
|
case GGML_TYPE_Q3_K:
|
|
18641
18726
|
{
|
|
18642
18727
|
GGML_ASSERT(start % QK_K == 0);
|
|
18643
|
-
|
|
18644
|
-
|
|
18728
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18729
|
+
size_t start_row = start / n_per_row;
|
|
18730
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18731
|
+
result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18732
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18645
18733
|
} break;
|
|
18646
18734
|
case GGML_TYPE_Q4_K:
|
|
18647
18735
|
{
|
|
18648
18736
|
GGML_ASSERT(start % QK_K == 0);
|
|
18649
|
-
|
|
18650
|
-
|
|
18737
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18738
|
+
size_t start_row = start / n_per_row;
|
|
18739
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18740
|
+
result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18741
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18651
18742
|
} break;
|
|
18652
18743
|
case GGML_TYPE_Q5_K:
|
|
18653
18744
|
{
|
|
18654
18745
|
GGML_ASSERT(start % QK_K == 0);
|
|
18655
|
-
|
|
18656
|
-
|
|
18746
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18747
|
+
size_t start_row = start / n_per_row;
|
|
18748
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18749
|
+
result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18750
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18657
18751
|
} break;
|
|
18658
18752
|
case GGML_TYPE_Q6_K:
|
|
18659
18753
|
{
|
|
18660
18754
|
GGML_ASSERT(start % QK_K == 0);
|
|
18661
|
-
|
|
18662
|
-
|
|
18755
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18756
|
+
size_t start_row = start / n_per_row;
|
|
18757
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18758
|
+
result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18759
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18760
|
+
} break;
|
|
18761
|
+
case GGML_TYPE_IQ2_XXS:
|
|
18762
|
+
{
|
|
18763
|
+
GGML_ASSERT(start % QK_K == 0);
|
|
18764
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18765
|
+
GGML_ASSERT(imatrix);
|
|
18766
|
+
size_t start_row = start / n_per_row;
|
|
18767
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18768
|
+
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18769
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18770
|
+
} break;
|
|
18771
|
+
case GGML_TYPE_IQ2_XS:
|
|
18772
|
+
{
|
|
18773
|
+
GGML_ASSERT(start % QK_K == 0);
|
|
18774
|
+
GGML_ASSERT(start % n_per_row == 0);
|
|
18775
|
+
GGML_ASSERT(imatrix);
|
|
18776
|
+
size_t start_row = start / n_per_row;
|
|
18777
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
|
18778
|
+
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
|
18779
|
+
GGML_ASSERT(result == row_size * nrows);
|
|
18663
18780
|
} break;
|
|
18664
18781
|
case GGML_TYPE_F16:
|
|
18665
18782
|
{
|
|
@@ -19016,8 +19133,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
|
19016
19133
|
(int64_t) info->ne[3];
|
|
19017
19134
|
|
|
19018
19135
|
if (ne % ggml_blck_size(info->type) != 0) {
|
|
19019
|
-
fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
|
|
19020
|
-
__func__, info->name.data, ne, ggml_blck_size(info->type));
|
|
19136
|
+
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
|
|
19137
|
+
__func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
|
|
19021
19138
|
fclose(file);
|
|
19022
19139
|
gguf_free(ctx);
|
|
19023
19140
|
return NULL;
|
|
@@ -19123,7 +19240,7 @@ void gguf_free(struct gguf_context * ctx) {
|
|
|
19123
19240
|
|
|
19124
19241
|
if (ctx->kv) {
|
|
19125
19242
|
// free string memory - not great..
|
|
19126
|
-
for (
|
|
19243
|
+
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
|
19127
19244
|
struct gguf_kv * kv = &ctx->kv[i];
|
|
19128
19245
|
|
|
19129
19246
|
if (kv->key.data) {
|
|
@@ -19139,7 +19256,7 @@ void gguf_free(struct gguf_context * ctx) {
|
|
|
19139
19256
|
if (kv->type == GGUF_TYPE_ARRAY) {
|
|
19140
19257
|
if (kv->value.arr.data) {
|
|
19141
19258
|
if (kv->value.arr.type == GGUF_TYPE_STRING) {
|
|
19142
|
-
for (
|
|
19259
|
+
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
|
19143
19260
|
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
|
19144
19261
|
if (str->data) {
|
|
19145
19262
|
free(str->data);
|
|
@@ -19155,7 +19272,7 @@ void gguf_free(struct gguf_context * ctx) {
|
|
|
19155
19272
|
}
|
|
19156
19273
|
|
|
19157
19274
|
if (ctx->infos) {
|
|
19158
|
-
for (
|
|
19275
|
+
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
|
19159
19276
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
|
19160
19277
|
|
|
19161
19278
|
if (info->name.data) {
|