llama_cpp 0.12.0 → 0.12.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2

data/vendor/tmp/llama.cpp/ggml-quants.h
CHANGED
@@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
-// Effectively 2.5625 bits per weight
+// Effectively 2.625 bits per weight
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
@@ -165,6 +165,22 @@ typedef struct {
 } block_q8_K;
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
 
+// (Almost) "true" 2-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 2.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_fp16_t d;
+    uint16_t qs[QK_K/8];
+} block_iq2_xxs;
+static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
+
+// 2.3125 bpw quants
+typedef struct {
+    ggml_fp16_t d;
+    uint16_t qs[QK_K/8];
+    uint8_t  scales[QK_K/32];
+} block_iq2_xs;
+static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
 
 // Quantization
 void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
@@ -180,6 +196,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
 void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
+void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
 
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -194,6 +212,8 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
+void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
 
 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -209,6 +229,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
 void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
 void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
 void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
+void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
 
 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -222,3 +244,5 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
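
The bits-per-weight figures quoted in the comments above follow directly from the struct layouts: with QK_K = 256, block_iq2_xxs occupies 2 + 2*(256/8) = 66 bytes per 256 weights (66*8/256 = 2.0625 bpw), and block_iq2_xs adds 256/32 = 8 scale bytes for 74 bytes total (74*8/256 = 2.3125 bpw). A minimal standalone check of that arithmetic, mirroring the structs locally instead of including the ggml headers:

    #include <stdint.h>
    #include <stdio.h>

    #define QK_K 256
    typedef uint16_t fp16_t; // stands in for ggml_fp16_t (2 bytes)

    typedef struct { fp16_t d; uint16_t qs[QK_K/8]; } my_iq2_xxs;
    typedef struct { fp16_t d; uint16_t qs[QK_K/8]; uint8_t scales[QK_K/32]; } my_iq2_xs;

    int main(void) {
        // bits per weight = 8 * bytes-per-block / weights-per-block
        printf("iq2_xxs: %.4f bpw\n", 8.0 * sizeof(my_iq2_xxs) / QK_K); // 2.0625
        printf("iq2_xs:  %.4f bpw\n", 8.0 * sizeof(my_iq2_xs)  / QK_K); // 2.3125
        return 0;
    }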
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -132,7 +132,7 @@ void ggml_print_backtrace(void) {
             "-ex", "bt -frame-info source-and-location",
             "-ex", "detach",
             "-ex", "quit",
-            NULL);
+            (char *) NULL);
     } else {
         waitpid(pid, NULL, 0);
     }
@@ -394,6 +394,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
 
+ggml_collect_imatrix_t g_imatrix_collect = NULL;
+
+void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
+    g_imatrix_collect = imatrix_collect;
+}
+
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
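
The importance-matrix hook is deliberately minimal: ggml stores one global function pointer and invokes it from the matrix-multiplication paths (see the ggml_compute_forward_mul_mat and ggml_compute_forward_mul_mat_id hunks below). A minimal sketch of registering a collector; the callback body here is a hypothetical illustration, not the actual llama.cpp imatrix implementation:

    #include <stdio.h>
    #include "ggml.h"

    // Hypothetical collector: log which weight tensor met which activation
    // tensor. A real collector would accumulate per-column statistics of
    // the activations (src1) keyed by the weight tensor (src0).
    static void my_collect_imatrix(const struct ggml_tensor * src0,
                                   const struct ggml_tensor * src1) {
        fprintf(stderr, "imatrix: %s x %s\n", src0->name, src1->name);
    }

    void install_collector(void) {
        // From now on ggml calls the hook (from the thread with ith == 1)
        // on every mul_mat / mul_mat_id evaluation.
        ggml_set_imatrix_collection(my_collect_imatrix);
    }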
@@ -573,6 +579,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
+    [GGML_TYPE_IQ2_XXS] = {
+        .type_name                = "iq2_xxs",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq2_xxs),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
+        .from_float               = quantize_row_iq2_xxs,
+        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
+        .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_IQ2_XS] = {
+        .type_name                = "iq2_xs",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq2_xs),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
+        .from_float               = quantize_row_iq2_xs,
+        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xs_reference,
+        .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+    },
     [GGML_TYPE_Q8_K] = {
         .type_name                = "q8_K",
         .blck_size                = QK_K,
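
These table entries are what make the new types usable by generic code, which looks up a row instead of switching on the type. A sketch of a type-agnostic dequantization helper built on the public ggml_internal_get_type_traits accessor (in this vendored revision, to_float takes the row length as an int):

    #include "ggml.h"

    // Dequantize one row of k elements of any type that has a to_float
    // handler, using the type_traits table instead of a per-type switch.
    static void dequantize_any(enum ggml_type type, const void * src, float * dst, int k) {
        ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
        GGML_ASSERT(traits.to_float != NULL);
        traits.to_float(src, dst, k); // e.g. dequantize_row_iq2_xxs for GGML_TYPE_IQ2_XXS
    }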
@@ -2111,6 +2139,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;    break;
         case GGML_FTYPE_MOSTLY_Q5_K:          wtype = GGML_TYPE_Q5_K;    break;
         case GGML_FTYPE_MOSTLY_Q6_K:          wtype = GGML_TYPE_Q6_K;    break;
+        case GGML_FTYPE_MOSTLY_IQ2_XXS:       wtype = GGML_TYPE_IQ2_XXS; break;
+        case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;  break;
         case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT;   break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT;   break;
     }
@@ -4299,13 +4329,13 @@ struct ggml_tensor * ggml_set_2d_inplace(
 static struct ggml_tensor * ggml_cpy_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        bool inplace) {
+        struct ggml_tensor  * b) {
     GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
 
     bool is_node = false;
 
-    if (!inplace && (a->grad || b->grad)) {
+    if (a->grad || b->grad) {
+        // inplace is false and either one has a grad
         is_node = true;
     }
 
@@ -4329,29 +4359,21 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
-    return ggml_cpy_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_cpy_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_cpy_impl(ctx, a, b, true);
+    return ggml_cpy_impl(ctx, a, b);
 }
 
 // ggml_cont
 
 static struct ggml_tensor * ggml_cont_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        bool inplace) {
+        struct ggml_tensor  * a) {
     bool is_node = false;
 
-    if (!inplace && a->grad) {
+    if (a->grad) {
         is_node = true;
     }
 
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
     ggml_format_name(result, "%s (cont)", a->name);
@@ -4364,13 +4386,7 @@ static struct ggml_tensor * ggml_cont_impl(
 struct ggml_tensor * ggml_cont(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_cont_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_cont_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_cont_impl(ctx, a, true);
+    return ggml_cont_impl(ctx, a);
 }
 
 // make contiguous, with new shape
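
Dropping ggml_cpy_inplace and ggml_cont_inplace is a small API break for code that calls the vendored ggml directly. Migration is mechanical; a sketch with placeholder tensors a and b:

    #include "ggml.h"

    // Hypothetical migration helper; a and b are placeholder tensors.
    struct ggml_tensor * copy_then_cont(struct ggml_context * ctx,
                                        struct ggml_tensor * a,
                                        struct ggml_tensor * b) {
        // 0.12.0 vendored ggml:  ggml_cpy_inplace(ctx, a, b); ggml_cont_inplace(ctx, t);
        // 0.12.1 vendored ggml:  the plain entry points cover both uses.
        struct ggml_tensor * t = ggml_cpy(ctx, a, b); // a -> b, returns a view of b
        return ggml_cont(ctx, t);                     // contiguous copy of t
    }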
@@ -7436,6 +7452,8 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
@@ -7700,6 +7718,8 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
             } break;
@@ -7814,6 +7834,8 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         default:
             {
                 GGML_ASSERT(false);
@@ -9704,10 +9726,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
 
@@ -9747,6 +9769,10 @@ static void ggml_compute_forward_mul_mat(
     const int ith = params->ith;
     const int nth = params->nth;
 
+    if (ith == 1 && g_imatrix_collect) {
+        g_imatrix_collect(src0, src1);
+    }
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -9787,7 +9813,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         if (params->ith != 0) {
             return;
         }
@@ -10050,6 +10076,10 @@ static void ggml_compute_forward_mul_mat_id(
 
         const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
 
+        if (ith == 1 && g_imatrix_collect) {
+            g_imatrix_collect(src0_cur, src1);
+        }
+
         const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
@@ -10455,6 +10485,8 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
             } break;
@@ -10629,6 +10661,8 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         default:
             {
                 GGML_ASSERT(false);
@@ -10823,6 +10857,8 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
             } break;
@@ -11459,6 +11495,8 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
@@ -11533,6 +11571,8 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
@@ -16301,24 +16341,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -16491,6 +16513,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             state->shared->node_n += 1;
             return (thread_ret_t) GGML_EXIT_ABORTED;
         }
+
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
             // all other threads are finished and spinning
             // do finalize and init here so we don't have synchronize again
@@ -16556,14 +16579,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
+
+            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
             while (true) {
                 // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                 //       depending on the workload and the operating system.
                 //       since it is not clear what is the best approach, it should potentially become user-configurable
                 //       ref: https://github.com/ggerganov/ggml/issues/291
-                sched_yield();
+                // UPD:  adding the do_yield flag seems to resolve the issue universally
+                if (do_yield) {
+                    sched_yield();
+                }
 
                 node_n = atomic_load(&state->shared->node_n);
                 if (node_n != last) break;
@@ -16642,7 +16669,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
                         // here we need memory just for single 2D matrix from src0
                         cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -18661,6 +18688,18 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
                 block_q6_K * block = (block_q6_K*)dst + start / QK_K;
                 result = ggml_quantize_q6_K(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_IQ2_XXS:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
+                result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_IQ2_XS:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
+                result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
+            } break;
         case GGML_TYPE_F16:
             {
                 int elemsize = sizeof(ggml_fp16_t);
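
With these cases added, the generic ggml_quantize_chunk entry point covers the IQ2 types too. A hedged usage sketch (quantize_iq2_xxs_buffer is a hypothetical helper; n must be a multiple of QK_K = 256, and hist is the 16-bin histogram the quantize functions fill):

    #include <stdint.h>
    #include <stdlib.h>
    #include "ggml.h"

    // Quantize n f32 values to IQ2_XXS; the caller owns the returned buffer.
    void * quantize_iq2_xxs_buffer(const float * src, int n, size_t * out_size) {
        int64_t hist[16] = {0}; // histogram buckets, filled by the quantizer
        // ggml_row_size gives the exact byte count for n elements of a type
        void * dst = malloc(ggml_row_size(GGML_TYPE_IQ2_XXS, n));
        *out_size = ggml_quantize_chunk(GGML_TYPE_IQ2_XXS, src, dst, /*start=*/0, n, hist);
        return dst;
    }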
@@ -19016,8 +19055,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
             (int64_t) info->ne[3];
 
         if (ne % ggml_blck_size(info->type) != 0) {
-            fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
-                    __func__, info->name.data, ne, ggml_blck_size(info->type));
+            fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                    __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
             fclose(file);
             gguf_free(ctx);
             return NULL;
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -218,7 +218,9 @@
 #define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
+#ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
+#endif
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
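
Guarding the definition with #ifndef lets an embedding project raise the tensor-name limit at build time without patching the header, for example:

    // Either pass -DGGML_MAX_NAME=128 to the compiler, or define it
    // before ggml.h is first included:
    #define GGML_MAX_NAME 128
    #include "ggml.h"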
@@ -339,6 +341,8 @@ extern "C" {
         GGML_TYPE_Q5_K = 13,
         GGML_TYPE_Q6_K = 14,
         GGML_TYPE_Q8_K = 15,
+        GGML_TYPE_IQ2_XXS = 16,
+        GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -373,6 +377,8 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
     };
 
     // available tensor operations:
@@ -1159,22 +1165,11 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // a -> b, in-place, return view(b)
-    GGML_API struct ggml_tensor * ggml_cpy_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // make contiguous, in-place
-    GGML_API struct ggml_tensor * ggml_cont_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
     // make contiguous, with new shape
     GGML_API struct ggml_tensor * ggml_cont_1d(
             struct ggml_context * ctx,
@@ -2067,9 +2062,17 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
 
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
+    //
+    // Importance matrix
+    //
+    typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
+    GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
+
     //
     // gguf
     //