llama_cpp 0.3.5 → 0.3.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2090 -438
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +17 -16
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +49 -26
- data/ext/llama_cpp/src/ggml.h +12 -1
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama.cpp +199 -68
- data/ext/llama_cpp/src/llama.h +1 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml-cuda.h
CHANGED
@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
+void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
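Note: the only change to ggml-cuda.h is the new ggml_cuda_set_mul_mat_q toggle. A minimal sketch of how a host application might call it, assuming the bundled CUDA backend is compiled in; only the declaration is taken from this diff, the helper name and the comment about its effect are illustrative:

#include <stdbool.h>

// Declaration added in this release (see hunk above).
void ggml_cuda_set_mul_mat_q(bool mul_mat_q);

// Illustrative helper: flip the flag once, before any graph is evaluated.
// In upstream llama.cpp of this era the flag appears to select between the
// custom quantized mat-mul kernels (true) and the dequantize + cuBLAS path (false).
static void configure_cuda_backend(bool use_mul_mat_q) {
    ggml_cuda_set_mul_mat_q(use_mul_mat_q);
}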
data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
 // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
 
 GGML_ASSERT(ne00 == ne10);
-GGML_ASSERT(ne02 == ne12);
+// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+GGML_ASSERT(ne03 == ne13);
 
 if (ggml_is_contiguous(src0) &&
 ggml_is_contiguous(src1) &&
@@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
 initWithDevice:ctx->device transposeLeft:false transposeRight:true
 resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
 
-// we need to do
+// we need to do ne12 multiplications
 // TODO: is there a way to do this in parallel - currently very slow ..
 // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-for (int64_t i02 = 0; i02 <
-size_t offs_src0_cur = offs_src0 + i02*nb02;
+for (int64_t i02 = 0; i02 < ne12; ++i02) {
+size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
 size_t offs_src1_cur = offs_src1 + i02*nb12;
 size_t offs_dst_cur = offs_dst + i02*nb2;
 
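The new loop iterates over src1's ne12 batch slices while reusing each src0 slice ne12/ne02 times, which is how broadcast (e.g. grouped-query-attention shapes) is handled. A small stand-alone sketch of that index arithmetic; the sizes below are illustrative and not taken from the diff:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne02 = 2;  // src0 batch slices (illustrative)
    const int64_t ne12 = 6;  // src1 batch slices (illustrative, a multiple of ne02)
    for (int64_t i02 = 0; i02 < ne12; ++i02) {
        // Same mapping as offs_src0 + i02/(ne12/ne02)*nb02 in the hunk above.
        printf("src1 slice %lld uses src0 slice %lld\n",
               (long long) i02, (long long) (i02 / (ne12 / ne02)));
    }
    return 0;
}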
@@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
 switch (src0t) {
 case GGML_TYPE_F16:
 {
-GGML_ASSERT(ne02 == ne12);
-
 nth0 = 64;
 nth1 = 1;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
+[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
+[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
 
 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
 src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
data/ext/llama_cpp/src/ggml-metal.metal
CHANGED
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
 device float * dst,
 constant int64_t & ne00,
 constant int64_t & ne01,
+constant int64_t & ne02,
 constant uint64_t & nb00,
 constant uint64_t & nb01,
 constant uint64_t & nb02,
 constant int64_t & ne10,
 constant int64_t & ne11,
+constant int64_t & ne12,
 constant uint64_t & nb10,
 constant uint64_t & nb11,
 constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
 const int64_t r1 = tgpig.y;
 const int64_t im = tgpig.z;
 
-device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
 device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
 sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
 }
 }
 
+
 kernel void kernel_alibi_f32(
 device const float * src0,
 device float * dst,
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
 
 static struct ggml_tensor * ggml_new_tensor_impl(
 struct ggml_context * ctx,
-enum ggml_type
-int
-const int64_t* ne,
-void*
+enum ggml_type type,
+int n_dims,
+const int64_t * ne,
+void * data) {
+
+assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
 
 size_t data_size = 0;
 
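The rewritten ggml_new_tensor_impl now asserts that n_dims lies in [1, GGML_MAX_DIMS]. A minimal caller sketch, assuming the gem's bundled ggml.h; the wrapper name and sizes are illustrative:

#include "ggml.h"

struct ggml_tensor * new_f32_matrix(struct ggml_context * ctx) {
    const int64_t ne[2] = { 4, 8 };
    // n_dims must satisfy 1 <= n_dims <= GGML_MAX_DIMS (4); anything else now
    // trips the new assert in ggml_new_tensor_impl in debug builds.
    return ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
}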
@@ -4648,22 +4650,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3
 
 struct ggml_tensor * ggml_new_tensor(
 struct ggml_context * ctx,
-enum ggml_type
-int
-const int64_t
+enum ggml_type type,
+int n_dims,
+const int64_t * ne) {
 return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
 }
 
 struct ggml_tensor * ggml_new_tensor_1d(
 struct ggml_context * ctx,
-enum ggml_type
+enum ggml_type type,
 int64_t ne0) {
 return ggml_new_tensor(ctx, type, 1, &ne0);
 }
 
 struct ggml_tensor * ggml_new_tensor_2d(
 struct ggml_context * ctx,
-enum ggml_type
+enum ggml_type type,
 int64_t ne0,
 int64_t ne1) {
 const int64_t ne[2] = { ne0, ne1 };
@@ -4672,7 +4674,7 @@ struct ggml_tensor * ggml_new_tensor_2d(
 
 struct ggml_tensor * ggml_new_tensor_3d(
 struct ggml_context * ctx,
-enum ggml_type
+enum ggml_type type,
 int64_t ne0,
 int64_t ne1,
 int64_t ne2) {
@@ -6238,6 +6240,27 @@ struct ggml_tensor * ggml_reshape_4d(
 
 // ggml_view_1d
 
+static struct ggml_tensor * ggml_view_tensor_offset(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int n_dims,
+const int64_t * ne,
+size_t offset) {
+// don't calculate an offset from an unallocated tensor
+void * data = NULL;
+if (a->data != NULL) {
+data = (char *) a->data + offset;
+}
+
+struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+
+ggml_format_name(result, "%s (view)", a->name);
+
+ggml_set_op_params(result, &offset, sizeof(offset));
+
+return result;
+}
+
 struct ggml_tensor * ggml_view_1d(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
@@ -6250,10 +6273,7 @@ struct ggml_tensor * ggml_view_1d(
 is_node = true;
 }
 
-struct ggml_tensor * result =
-ggml_format_name(result, "%s (view)", a->name);
-
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
 
 result->op = GGML_OP_VIEW;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6280,10 +6300,7 @@ struct ggml_tensor * ggml_view_2d(
 
 const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
-struct ggml_tensor * result =
-ggml_format_name(result, "%s (view)", a->name);
-
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
 
 result->nb[1] = nb1;
 result->nb[2] = result->nb[1]*ne1;
@@ -6316,10 +6333,7 @@ struct ggml_tensor * ggml_view_3d(
 
 const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
 
-struct ggml_tensor * result =
-ggml_format_name(result, "%s (view)", a->name);
-
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
 
 result->nb[1] = nb1;
 result->nb[2] = nb2;
@@ -6354,10 +6368,7 @@ struct ggml_tensor * ggml_view_4d(
 
 const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
 
-struct ggml_tensor * result =
-ggml_format_name(result, "%s (view)", a->name);
-
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
 
 result->nb[1] = nb1;
 result->nb[2] = nb2;
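The four hunks above fold the shared view-construction boilerplate into the new internal ggml_view_tensor_offset helper, which also skips the data-pointer arithmetic for tensors that have no allocation yet. The public view API is unchanged. A minimal usage sketch, assuming the gem's bundled ggml.h; the wrapper name make_row_view is illustrative:

#include "ggml.h"

// Illustrative: view one row of a 2-D tensor as a 1-D tensor.
// The byte offset of row `row` is row * mat->nb[1].
struct ggml_tensor * make_row_view(struct ggml_context * ctx,
                                   struct ggml_tensor  * mat,
                                   int64_t               row) {
    return ggml_view_1d(ctx, mat, mat->ne[0], row * mat->nb[1]);
}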
@@ -6741,6 +6752,18 @@ struct ggml_tensor * ggml_rope_inplace(
 return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
 }
 
+struct ggml_tensor * ggml_rope_custom(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int n_past,
+int n_dims,
+int mode,
+int n_ctx,
+float freq_base,
+float freq_scale) {
+return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+}
+
 struct ggml_tensor * ggml_rope_custom_inplace(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -1170,7 +1170,18 @@ extern "C" {
 int mode,
 int n_ctx);
 
-// custom RoPE
+// custom RoPE
+GGML_API struct ggml_tensor * ggml_rope_custom(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int n_past,
+int n_dims,
+int mode,
+int n_ctx,
+float freq_base,
+float freq_scale);
+
+// in-place, returns view(a)
 GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
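ggml_rope_custom is the out-of-place counterpart of the existing ggml_rope_custom_inplace, with the same freq_base/freq_scale parameters. A minimal usage sketch, assuming the gem's bundled ggml.h; the wrapper name and parameter values are illustrative only:

#include "ggml.h"

struct ggml_tensor * rope_with_custom_freq(struct ggml_context * ctx,
                                           struct ggml_tensor  * cur,
                                           int n_past, int n_rot, int n_ctx) {
    // Same arguments as ggml_rope_custom_inplace, but a new tensor is returned
    // instead of a view of `cur`.
    return ggml_rope_custom(ctx, cur, n_past, n_rot, /*mode=*/0, n_ctx,
                            /*freq_base=*/10000.0f, /*freq_scale=*/1.0f);
}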
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -39,6 +39,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 //
 // 2-6 bit quantization in super-blocks
 //
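All of the k_quants.c hunks that follow route 128-to-256-bit packing through this new MM256_SET_M128I macro, which builds a __m256i from two __m128i halves (b in the low lane, a in the high lane), matching what _mm256_set_m128i does on compilers that provide it. A small stand-alone illustration (assumes an AVX-capable build; the values are illustrative):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const __m128i lo = _mm_set1_epi32(1);
    const __m128i hi = _mm_set1_epi32(2);
    const __m256i v  = MM256_SET_M128I(hi, lo);  // low lane = lo, high lane = hi

    int32_t out[8];
    _mm256_storeu_si256((__m256i *) out, v);
    for (int i = 0; i < 8; ++i) {
        printf("%d ", out[i]);                   // prints: 1 1 1 1 2 2 2 2
    }
    printf("\n");
    return 0;
}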
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
 const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
 const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-const __m256i scales[2] = {
+const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
 __m256i sumi = _mm256_setzero_si256();
 
@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
 
 // sumf += -dmin * summs in 32bits*8
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
 
 const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
 const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 }
 
 // sumf += dall * isum - dmin * summs in 32bits
-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
 }
 
@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 summs += dmin * smin;
 
 const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-const __m256i q2_0 = _mm256_and_si256(
-const __m256i q2_1 = _mm256_and_si256(
+const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
 
 const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
 const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
 const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
 
-const __m256i p_0 =
-const __m256i p_1 =
-const __m256i p_2 =
-const __m256i p_3 =
+const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
 
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
 const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
 const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-const __m256i scales[2] = {
+const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
 // high bit
 const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 }
 
 // multiply with block scale and accumulate
-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 
 }
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 aux16[0] = a & 0x0f0f;
 aux16[1] = (a >> 4) & 0x0f0f;
 
-const __m256i scale_0 =
-const __m256i scale_1 =
+const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
 
 memcpy(&aux64, x[i].hmask, 8);
 
 const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-__m256i q3h_0 =
+__m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
 __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
 q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
 q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
|
|
2318
2320
|
const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
|
2319
2321
|
|
2320
2322
|
// prepare low and high bits
|
2321
|
-
const __m256i q3aux =
|
2323
|
+
const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
|
2322
2324
|
const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
|
2323
2325
|
const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
|
2324
2326
|
|
@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|
2429
2431
|
|
2430
2432
|
p16_0 = _mm_add_epi32(p16_0, p16_2);
|
2431
2433
|
p16_1 = _mm_add_epi32(p16_1, p16_3);
|
2432
|
-
__m256i p16 =
|
2434
|
+
__m256i p16 = MM256_SET_M128I(p16_1, p16_0);
|
2433
2435
|
|
2434
2436
|
// multiply with block scale and accumulate
|
2435
2437
|
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
|
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
 
 const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-const __m256i scales =
+const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
 __m256i sumi = _mm256_setzero_si256();
 
@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 }
 
 __m256 vd = _mm256_set1_ps(d);
-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
 }
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
 const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
 const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
 
 const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
 const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
 
 }
 
@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 summs += dmin * _mm_extract_epi32(hsum, 0);
 
 const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-const __m256i scales =
+const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
 const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
 __m256i hmask = mone;
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 }
 
 __m256 vd = _mm256_set1_ps(d);
-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
 }
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
 const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
 
-const __m256i scale_l =
-const __m256i scale_h =
+const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
 
 int64_t aux64;
 memcpy(&aux64, x[i].qh, 8);
 const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-const __m256i haux256 =
+const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
 
 const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
 const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
 const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
 
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
 
 }
 
@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
 }
 
-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 }
 
@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
 const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
 
-const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(
-const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(
+const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
 
 const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
 const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
 sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
 
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
 }
 
 *s = hsum_float_8(acc);