llama_cpp 0.12.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-quants.h
CHANGED
@@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
-// Effectively 2.5625 bits per weight
+// Effectively 2.625 bits per weight
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
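Aside (not part of the diff): the corrected figure checks out. With QK_K = 256, a block_q2_K holds QK_K/16 = 16 scale bytes, QK_K/4 = 64 quant bytes, and two 2-byte ggml_fp16_t super-block scales (d and dmin, declared just after the shown context), 84 bytes per 256 weights:

    /* Sanity check for the q2_K bits-per-weight comment fixed above. */
    #include <stdio.h>
    int main(void) {
        const int QK_K  = 256;
        const int bytes = QK_K/16 + QK_K/4 + 2*2;     // scales + qs + d/dmin = 84
        printf("q2_K: %g bpw\n", 8.0 * bytes / QK_K); // prints 2.625
        return 0;
    }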
@@ -165,6 +165,22 @@ typedef struct {
 } block_q8_K;
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
 
+// (Almost) "true" 2-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 2.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_fp16_t d;
+    uint16_t qs[QK_K/8];
+} block_iq2_xxs;
+static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
+
+// 2.3125 bpw quants
+typedef struct {
+    ggml_fp16_t d;
+    uint16_t qs[QK_K/8];
+    uint8_t scales[QK_K/32];
+} block_iq2_xs;
+static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
 
 // Quantization
 void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
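The same arithmetic (again an illustration, not part of the diff) matches the bpw figures in the comments on the two new block types, taking QK_K = 256 and sizeof(ggml_fp16_t) = 2:

    #include <stdio.h>
    int main(void) {
        const int QK_K = 256;
        const int xxs  = 2 + (QK_K/8)*2;              // d + qs          = 66 bytes
        const int xs   = 2 + (QK_K/8)*2 + QK_K/32;    // d + qs + scales = 74 bytes
        printf("iq2_xxs: %g bpw\n", 8.0 * xxs / QK_K); // 2.0625
        printf("iq2_xs:  %g bpw\n", 8.0 * xs  / QK_K); // 2.3125
        return 0;
    }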
@@ -180,6 +196,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
 void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
+void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
 
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -194,6 +212,8 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
+void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
 
 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -209,6 +229,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
 void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
 void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
 void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
+void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
 
 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -222,3 +244,5 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
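Each type thus gets the usual ggml triad, which the diff extends to the iq2 formats: a reference quantizer, a fast row quantizer, a row dequantizer, plus a fused dot product against q8_K activations. A minimal round-trip sketch using the q4_0 pair from the surrounding context (illustrative only; k must be a multiple of the block size, QK4_0 = 32):

    #include "ggml-quants.h"

    /* Quantize 64 floats to q4_0 and back; the round trip is lossy. */
    void roundtrip_q4_0(const float src[64], float dst[64]) {
        block_q4_0 q[64 / QK4_0];          // two 32-element blocks
        quantize_row_q4_0(src, q, 64);     // float -> q4_0
        dequantize_row_q4_0(q, dst, 64);   // q4_0 -> float
    }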
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -132,7 +132,7 @@ void ggml_print_backtrace(void) {
             "-ex", "bt -frame-info source-and-location",
             "-ex", "detach",
             "-ex", "quit",
-            NULL);
+            (char *) NULL);
     } else {
         waitpid(pid, NULL, 0);
     }
@@ -394,6 +394,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
 
+ggml_collect_imatrix_t g_imatrix_collect = NULL;
+
+void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
+    g_imatrix_collect = imatrix_collect;
+}
+
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name                = "i8",
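This new hook lets an external tool observe the inputs of every matrix multiplication while a graph is evaluated (llama.cpp uses it for importance-matrix collection). A sketch of registering a collector; the callback body here is hypothetical, not llama.cpp's:

    #include <stdio.h>
    #include "ggml.h"

    /* Invoked from the mul_mat ops on thread ith == 1 (see the call sites
       added below): src0 is the weight tensor, src1 the activations. */
    static void log_imatrix(const struct ggml_tensor * src0,
                            const struct ggml_tensor * src1) {
        printf("mul_mat: weights '%s' (%lld x %lld), activations '%s'\n",
               src0->name, (long long) src0->ne[0], (long long) src0->ne[1],
               src1->name);
    }

    void install_collector(void) {
        ggml_set_imatrix_collection(log_imatrix); // before computing the graph
    }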
@@ -573,6 +579,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
+    [GGML_TYPE_IQ2_XXS] = {
+        .type_name                = "iq2_xxs",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq2_xxs),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
+        .from_float               = quantize_row_iq2_xxs,
+        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
+        .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_IQ2_XS] = {
+        .type_name                = "iq2_xs",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq2_xs),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
+        .from_float               = quantize_row_iq2_xs,
+        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xs_reference,
+        .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
     [GGML_TYPE_Q8_K] = {
         .type_name                = "q8_K",
         .blck_size                = QK_K,
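A traits entry wires a type into the generic compute paths: .to_float dequantizes, .from_float quantizes, and .vec_dot/.vec_dot_type implement the fused dot product (both iq2 types dot against q8_K activations). A sketch of dequantizing one iq2_xxs block via the table, assuming ggml_internal_get_type_traits() is exposed as in ggml of this period:

    #include "ggml.h"

    /* Dequantize a single iq2_xxs block (blck_size == QK_K == 256 weights). */
    void dequant_one_block_iq2_xxs(const void * blk, float out[256]) {
        const ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_IQ2_XXS);
        tt.to_float(blk, out, tt.blck_size);
    }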
@@ -2111,6 +2139,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;    break;
         case GGML_FTYPE_MOSTLY_Q5_K:          wtype = GGML_TYPE_Q5_K;    break;
         case GGML_FTYPE_MOSTLY_Q6_K:          wtype = GGML_TYPE_Q6_K;    break;
+        case GGML_FTYPE_MOSTLY_IQ2_XXS:       wtype = GGML_TYPE_IQ2_XXS; break;
+        case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;  break;
         case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT;   break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT;   break;
     }
@@ -4299,13 +4329,13 @@ struct ggml_tensor * ggml_set_2d_inplace(
 static struct ggml_tensor * ggml_cpy_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        bool inplace) {
+        struct ggml_tensor  * b) {
     GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
 
     bool is_node = false;
 
-    if (!inplace && (a->grad || b->grad)) {
+    if (a->grad || b->grad) {
+        // inplace is false and either one have a grad
         is_node = true;
     }
 
@@ -4329,29 +4359,21 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_cpy_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, true);
+    return ggml_cpy_impl(ctx, a, b);
 }
 
 // ggml_cont
 
 static struct ggml_tensor * ggml_cont_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
+        struct ggml_tensor * a) {
     bool is_node = false;
 
-    if (!inplace && a->grad) {
+    if (a->grad) {
         is_node = true;
     }
 
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
     ggml_format_name(result, "%s (cont)", a->name);
 
     result->op = GGML_OP_CONT;
@@ -4364,13 +4386,7 @@ static struct ggml_tensor * ggml_cont_impl(
 struct ggml_tensor * ggml_cont(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_cont_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a, true);
+    return ggml_cont_impl(ctx, a);
 }
 
 // make contiguous, with new shape
@@ -7436,6 +7452,8 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_K:
        	case GGML_TYPE_Q5_K:
        	case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
            	{
                	ggml_compute_forward_add_q_f32(params, src0, src1, dst);
            	} break;
@@ -7700,6 +7718,8 @@ static void ggml_compute_forward_add1(
        	case GGML_TYPE_Q4_K:
        	case GGML_TYPE_Q5_K:
        	case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
            	{
                	ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
            	} break;
@@ -7814,6 +7834,8 @@ static void ggml_compute_forward_acc(
        	case GGML_TYPE_Q4_K:
        	case GGML_TYPE_Q5_K:
        	case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
        	default:
            	{
                	GGML_ASSERT(false);
@@ -9704,10 +9726,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
 
@@ -9747,6 +9769,10 @@ static void ggml_compute_forward_mul_mat(
     const int ith = params->ith;
     const int nth = params->nth;
 
+    if (ith == 1 && g_imatrix_collect) {
+        g_imatrix_collect(src0, src1);
+    }
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -9787,7 +9813,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         if (params->ith != 0) {
             return;
         }
@@ -10050,6 +10076,10 @@ static void ggml_compute_forward_mul_mat_id(
 
         const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
 
+        if (ith == 1 && g_imatrix_collect) {
+            g_imatrix_collect(src0_cur, src1);
+        }
+
         const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
@@ -10455,6 +10485,8 @@ static void ggml_compute_forward_out_prod(
        	case GGML_TYPE_Q4_K:
        	case GGML_TYPE_Q5_K:
        	case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
            	{
                	ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
            	} break;
@@ -10629,6 +10661,8 @@ static void ggml_compute_forward_set(
        	case GGML_TYPE_Q4_K:
        	case GGML_TYPE_Q5_K:
        	case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
        	default:
            	{
                	GGML_ASSERT(false);
@@ -10823,6 +10857,8 @@ static void ggml_compute_forward_get_rows(
        	case GGML_TYPE_Q4_K:
        	case GGML_TYPE_Q5_K:
        	case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
            	{
                	ggml_compute_forward_get_rows_q(params, src0, src1, dst);
            	} break;
@@ -11459,6 +11495,8 @@ static void ggml_compute_forward_alibi(
        	case GGML_TYPE_Q4_K:
        	case GGML_TYPE_Q5_K:
        	case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
        	case GGML_TYPE_Q8_K:
        	case GGML_TYPE_I8:
        	case GGML_TYPE_I16:
@@ -11533,6 +11571,8 @@ static void ggml_compute_forward_clamp(
        	case GGML_TYPE_Q4_K:
        	case GGML_TYPE_Q5_K:
        	case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
        	case GGML_TYPE_Q8_K:
        	case GGML_TYPE_I8:
        	case GGML_TYPE_I16:
@@ -16301,24 +16341,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -16491,6 +16513,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             state->shared->node_n += 1;
             return (thread_ret_t) GGML_EXIT_ABORTED;
         }
+
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
             // all other threads are finished and spinning
             // do finalize and init here so we don't have synchronize again
@@ -16556,14 +16579,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
+
+            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
             while (true) {
                 // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                 //       depending on the workload and the operating system.
                 //       since it is not clear what is the best approach, it should potentially become user-configurable
                 //       ref: https://github.com/ggerganov/ggml/issues/291
-                sched_yield();
-
-
+                // UPD:  adding the do_yield flag seems to resolve the issue universally
+                if (do_yield) {
+                    sched_yield();
+                }
 
                 node_n = atomic_load(&state->shared->node_n);
                 if (node_n != last) break;
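For context (an illustration, not llama.cpp code): the change above is a spin-wait on an atomic counter where yielding is made conditional, since sched_yield() helps when the last node was a long-running MUL_MAT but costs latency on short ops. A standalone version of the same pattern:

    #include <sched.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    /* Spin until *counter moves past 'last'; yield the time slice while
       waiting only when a long wait is expected (the do_yield idea above). */
    static void spin_wait(atomic_int * counter, int last, bool expect_long_wait) {
        while (atomic_load(counter) == last) {
            if (expect_long_wait) {
                sched_yield(); // let other workers run instead of burning CPU
            }
        }
    }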
@@ -16642,7 +16669,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
                         // here we need memory just for single 2D matrix from src0
                         cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -18661,6 +18688,18 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
                 block_q6_K * block = (block_q6_K*)dst + start / QK_K;
                 result = ggml_quantize_q6_K(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_IQ2_XXS:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
+                result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_IQ2_XS:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
+                result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
+            } break;
         case GGML_TYPE_F16:
             {
                 int elemsize = sizeof(ggml_fp16_t);
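ggml_quantize_chunk() dispatches on the type tag; callers pass a float source, a destination buffer sized for the quantized blocks, and a 16-entry histogram (the bin count used elsewhere in ggml). A hypothetical call through the q6_K path from the shown context (the new iq2 paths additionally assert that start is QK_K-aligned):

    #include "ggml.h"

    /* Quantize n floats (n a multiple of QK_K = 256) starting at element 0;
       dst must be large enough for the resulting blocks. Returns bytes written. */
    size_t quantize_buffer_q6_K(const float * src, void * dst, int n) {
        int64_t hist[16] = {0};
        return ggml_quantize_chunk(GGML_TYPE_Q6_K, src, dst, /*start=*/0, n, hist);
    }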
@@ -19016,8 +19055,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
             (int64_t) info->ne[3];
 
         if (ne % ggml_blck_size(info->type) != 0) {
-            fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
-                    __func__, info->name.data, ne, ggml_blck_size(info->type));
+            fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                    __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
             fclose(file);
             gguf_free(ctx);
             return NULL;
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -218,7 +218,9 @@
 #define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
+#ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
+#endif
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
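The new #ifndef guard makes the tensor-name buffer size overridable at build time, for example:

    cc -DGGML_MAX_NAME=128 -c ggml.c

Since struct ggml_tensor embeds a char name[GGML_MAX_NAME] array, such an override must be applied consistently to every translation unit that includes ggml.h, or struct layouts will disagree across the build.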
@@ -339,6 +341,8 @@ extern "C" {
|
|
339
341
|
GGML_TYPE_Q5_K = 13,
|
340
342
|
GGML_TYPE_Q6_K = 14,
|
341
343
|
GGML_TYPE_Q8_K = 15,
|
344
|
+
GGML_TYPE_IQ2_XXS = 16,
|
345
|
+
GGML_TYPE_IQ2_XS = 17,
|
342
346
|
GGML_TYPE_I8,
|
343
347
|
GGML_TYPE_I16,
|
344
348
|
GGML_TYPE_I32,
|
@@ -373,6 +377,8 @@ extern "C" {
|
|
373
377
|
GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
|
374
378
|
GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
|
375
379
|
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
380
|
+
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
|
381
|
+
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
376
382
|
};
|
377
383
|
|
378
384
|
// available tensor operations:
|
@@ -1159,22 +1165,11 @@ extern "C" {
|
|
1159
1165
|
struct ggml_tensor * a,
|
1160
1166
|
struct ggml_tensor * b);
|
1161
1167
|
|
1162
|
-
// a -> b, in-place, return view(b)
|
1163
|
-
GGML_API struct ggml_tensor * ggml_cpy_inplace(
|
1164
|
-
struct ggml_context * ctx,
|
1165
|
-
struct ggml_tensor * a,
|
1166
|
-
struct ggml_tensor * b);
|
1167
|
-
|
1168
1168
|
// make contiguous
|
1169
1169
|
GGML_API struct ggml_tensor * ggml_cont(
|
1170
1170
|
struct ggml_context * ctx,
|
1171
1171
|
struct ggml_tensor * a);
|
1172
1172
|
|
1173
|
-
// make contiguous, in-place
|
1174
|
-
GGML_API struct ggml_tensor * ggml_cont_inplace(
|
1175
|
-
struct ggml_context * ctx,
|
1176
|
-
struct ggml_tensor * a);
|
1177
|
-
|
1178
1173
|
// make contiguous, with new shape
|
1179
1174
|
GGML_API struct ggml_tensor * ggml_cont_1d(
|
1180
1175
|
struct ggml_context * ctx,
|
@@ -2067,9 +2062,17 @@ extern "C" {
|
|
2067
2062
|
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2068
2063
|
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2069
2064
|
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2065
|
+
GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
|
2066
|
+
GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
|
2070
2067
|
|
2071
2068
|
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
2072
2069
|
|
2070
|
+
//
|
2071
|
+
// Importance matrix
|
2072
|
+
//
|
2073
|
+
typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
|
2074
|
+
GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
|
2075
|
+
|
2073
2076
|
//
|
2074
2077
|
// gguf
|
2075
2078
|
//
|