llama_cpp 0.3.5 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2090 -438
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +17 -16
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +49 -26
- data/ext/llama_cpp/src/ggml.h +12 -1
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama.cpp +199 -68
- data/ext/llama_cpp/src/llama.h +1 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml-cuda.h
CHANGED
@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
+void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
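Note: the only change to ggml-cuda.h is the new ggml_cuda_set_mul_mat_q toggle, which appears to switch the CUDA backend between its quantized mat-mul ("mul_mat_q") kernels and the previous dequantize-then-cuBLAS path. A minimal caller-side sketch, assuming only the declarations above (the wrapper function itself is illustrative, not part of the diff):

    #include <stdbool.h>
    #include "ggml-cuda.h"

    // Illustrative setup helper, not part of the diff.
    static void configure_cuda_backend(void) {
        ggml_cuda_set_mul_mat_q(true);  // opt in to the quantized mat-mul kernels
        ggml_cuda_set_main_device(0);   // existing API, shown for context
    }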
data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
 // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224

 GGML_ASSERT(ne00 == ne10);
-GGML_ASSERT(ne02 == ne12);
+// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+GGML_ASSERT(ne03 == ne13);

 if (ggml_is_contiguous(src0) &&
 ggml_is_contiguous(src1) &&
@@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
 initWithDevice:ctx->device transposeLeft:false transposeRight:true
 resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];

-// we need to do
+// we need to do ne12 multiplications
 // TODO: is there a way to do this in parallel - currently very slow ..
 // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-for (int64_t i02 = 0; i02 <
-size_t offs_src0_cur = offs_src0 + i02*nb02;
+for (int64_t i02 = 0; i02 < ne12; ++i02) {
+size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
 size_t offs_src1_cur = offs_src1 + i02*nb12;
 size_t offs_dst_cur = offs_dst + i02*nb2;

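Note: the loop now iterates over ne12 (the number of src1 matrices), and the expression i02/(ne12/ne02) maps each src1 matrix back onto a src0 matrix, so several src1 batches can share one src0 slice whenever ne12 is a whole multiple of ne02 (the grouped-query case the "gqa" comment alludes to). A small C sketch of that index mapping, with an illustrative helper name:

    #include <stdint.h>

    // Which src0 batch serves src1 batch i12, assuming ne12 % ne02 == 0.
    static int64_t src0_batch_for(int64_t i12, int64_t ne02, int64_t ne12) {
        return i12 / (ne12 / ne02);  // same expression as i02/(ne12/ne02) above
    }
    // e.g. ne02 = 8, ne12 = 32: src1 batches 0..3 map to src0 batch 0, 4..7 to batch 1, and so on.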
@@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
 switch (src0t) {
 case GGML_TYPE_F16:
 {
-GGML_ASSERT(ne02 == ne12);
-
 nth0 = 64;
 nth1 = 1;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
-[encoder setBytes:&
+[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
+[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];

 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
 src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
data/ext/llama_cpp/src/ggml-metal.metal
CHANGED
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
 device float * dst,
 constant int64_t & ne00,
 constant int64_t & ne01,
+constant int64_t & ne02,
 constant uint64_t & nb00,
 constant uint64_t & nb01,
 constant uint64_t & nb02,
 constant int64_t & ne10,
 constant int64_t & ne11,
+constant int64_t & ne12,
 constant uint64_t & nb10,
 constant uint64_t & nb11,
 constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
 const int64_t r1 = tgpig.y;
 const int64_t im = tgpig.z;

-device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
 device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);

 sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
 }
 }

+
 kernel void kernel_alibi_f32(
 device const float * src0,
 device float * dst,
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml

 static struct ggml_tensor * ggml_new_tensor_impl(
 struct ggml_context * ctx,
-enum ggml_type
-int
-const int64_t* ne,
-void*
+enum ggml_type type,
+int n_dims,
+const int64_t * ne,
+void * data) {
+
+assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

 size_t data_size = 0;

@@ -4648,22 +4650,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3

 struct ggml_tensor * ggml_new_tensor(
 struct ggml_context * ctx,
-enum ggml_type
-int
-const int64_t
+enum ggml_type type,
+int n_dims,
+const int64_t * ne) {
 return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
 }

 struct ggml_tensor * ggml_new_tensor_1d(
 struct ggml_context * ctx,
-enum ggml_type
+enum ggml_type type,
 int64_t ne0) {
 return ggml_new_tensor(ctx, type, 1, &ne0);
 }

 struct ggml_tensor * ggml_new_tensor_2d(
 struct ggml_context * ctx,
-enum ggml_type
+enum ggml_type type,
 int64_t ne0,
 int64_t ne1) {
 const int64_t ne[2] = { ne0, ne1 };
@@ -4672,7 +4674,7 @@ struct ggml_tensor * ggml_new_tensor_2d(

 struct ggml_tensor * ggml_new_tensor_3d(
 struct ggml_context * ctx,
-enum ggml_type
+enum ggml_type type,
 int64_t ne0,
 int64_t ne1,
 int64_t ne2) {
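Note: besides re-aligning the parameter lists, ggml_new_tensor_impl now asserts that n_dims stays within [1, GGML_MAX_DIMS]. A minimal sketch of a call that satisfies the new assert (the context size and tensor shape are illustrative, not taken from this diff):

    #include "ggml.h"

    // Illustrative helper: n_dims = 2 is inside the asserted range.
    static struct ggml_tensor * make_4x8_f32(void) {
        struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(ip);
        const int64_t ne[2] = { 4, 8 };
        return ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
    }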
@@ -6238,6 +6240,27 @@ struct ggml_tensor * ggml_reshape_4d(

 // ggml_view_1d

+static struct ggml_tensor * ggml_view_tensor_offset(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int n_dims,
+const int64_t * ne,
+size_t offset) {
+// don't calculate an offset from an unallocated tensor
+void * data = NULL;
+if (a->data != NULL) {
+data = (char *) a->data + offset;
+}
+
+struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+
+ggml_format_name(result, "%s (view)", a->name);
+
+ggml_set_op_params(result, &offset, sizeof(offset));
+
+return result;
+}
+
 struct ggml_tensor * ggml_view_1d(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
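Note: ggml_view_tensor_offset factors out the boilerplate that the four ggml_view_*d functions repeated (computing the data pointer, naming the view, storing the byte offset in op_params), and it skips the pointer arithmetic while a->data is still NULL, i.e. for tensors that have not been allocated yet, which is likely relevant to the new ggml-alloc.c listed above. A usage sketch of the public 1-D view it now backs, assuming an existing ggml_context:

    // Illustrative fragment: view the second half of an 8-element f32 tensor (offset is in bytes).
    static struct ggml_tensor * second_half(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        return ggml_view_1d(ctx, a, 4, 4*ggml_element_size(a));
    }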
@@ -6250,10 +6273,7 @@ struct ggml_tensor * ggml_view_1d(
 is_node = true;
 }

-struct ggml_tensor * result =
-ggml_format_name(result, "%s (view)", a->name);
-
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);

 result->op = GGML_OP_VIEW;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6280,10 +6300,7 @@ struct ggml_tensor * ggml_view_2d(

 const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };

-struct ggml_tensor * result =
-ggml_format_name(result, "%s (view)", a->name);
-
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);

 result->nb[1] = nb1;
 result->nb[2] = result->nb[1]*ne1;
@@ -6316,10 +6333,7 @@ struct ggml_tensor * ggml_view_3d(

 const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };

-struct ggml_tensor * result =
-ggml_format_name(result, "%s (view)", a->name);
-
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);

 result->nb[1] = nb1;
 result->nb[2] = nb2;
@@ -6354,10 +6368,7 @@ struct ggml_tensor * ggml_view_4d(

 const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };

-struct ggml_tensor * result =
-ggml_format_name(result, "%s (view)", a->name);
-
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);

 result->nb[1] = nb1;
 result->nb[2] = nb2;
@@ -6741,6 +6752,18 @@ struct ggml_tensor * ggml_rope_inplace(
 return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
 }

+struct ggml_tensor * ggml_rope_custom(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int n_past,
+int n_dims,
+int mode,
+int n_ctx,
+float freq_base,
+float freq_scale) {
+return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+}
+
 struct ggml_tensor * ggml_rope_custom_inplace(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -1170,7 +1170,18 @@ extern "C" {
 int mode,
 int n_ctx);

-// custom RoPE
+// custom RoPE
+GGML_API struct ggml_tensor * ggml_rope_custom(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int n_past,
+int n_dims,
+int mode,
+int n_ctx,
+float freq_base,
+float freq_scale);
+
+// in-place, returns view(a)
 GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
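Note: ggml_rope_custom is the non-inplace counterpart of the existing ggml_rope_custom_inplace; freq_base and freq_scale expose the RoPE frequency parameters that plain ggml_rope fixes at 10000.0 and 1.0 (see the ggml.c hunk above). A hedged usage sketch; ctx, q, n_past, n_rot and n_ctx are placeholders rather than values taken from this diff:

    // Illustrative: RoPE with linear scaling (freq_scale = 0.5f) applied to a query tensor q.
    struct ggml_tensor * q_rope = ggml_rope_custom(ctx, q, n_past, n_rot, /*mode =*/ 0, n_ctx, 10000.0f, 0.5f);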
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -39,6 +39,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))

+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 //
 // 2-6 bit quantization in super-blocks
 //
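Note: MM256_SET_M128I builds a 256-bit vector from two 128-bit halves (first argument in the high lane, second in the low lane), presumably as a portable stand-in for _mm256_set_m128i, which some older compilers do not provide; the hunks below rewrite the AVX2 dot-product code in terms of this macro. A self-contained sketch of what it does:

    #include <immintrin.h>

    #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

    // hi ends up in the upper 128-bit lane, lo in the lower lane,
    // matching _mm256_set_m128i(hi, lo) where that intrinsic exists.
    static inline __m256i combine128(__m128i hi, __m128i lo) {
        return MM256_SET_M128I(hi, lo);
    }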
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
 const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
 const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-const __m256i scales[2] = {
+const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

 __m256i sumi = _mm256_setzero_si256();

@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));

 // sumf += -dmin * summs in 32bits*8
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);

 const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
 const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 }

 // sumf += dall * isum - dmin * summs in 32bits
-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
 }

@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 summs += dmin * smin;

 const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-const __m256i q2_0 = _mm256_and_si256(
-const __m256i q2_1 = _mm256_and_si256(
+const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);

 const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
 const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
 const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));

-const __m256i p_0 =
-const __m256i p_1 =
-const __m256i p_2 =
-const __m256i p_3 =
+const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));

 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
 const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
 const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-const __m256i scales[2] = {
+const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

 // high bit
 const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 }

 // multiply with block scale and accumulate
-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);

 }
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 aux16[0] = a & 0x0f0f;
 aux16[1] = (a >> 4) & 0x0f0f;

-const __m256i scale_0 =
-const __m256i scale_1 =
+const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));

 memcpy(&aux64, x[i].hmask, 8);

 const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-__m256i q3h_0 =
+__m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
 __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
 q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
 q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);

 // prepare low and high bits
-const __m256i q3aux =
+const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
 const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
 const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);

@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

 p16_0 = _mm_add_epi32(p16_0, p16_2);
 p16_1 = _mm_add_epi32(p16_1, p16_3);
-__m256i p16 =
+__m256i p16 = MM256_SET_M128I(p16_1, p16_0);

 // multiply with block scale and accumulate
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);

 const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-const __m256i scales =
+const __m256i scales = MM256_SET_M128I(sc128, sc128);

 __m256i sumi = _mm256_setzero_si256();

@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 }

 __m256 vd = _mm256_set1_ps(d);
-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

 }
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

 const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
 const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);

 const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
 const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);

 }

@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 summs += dmin * _mm_extract_epi32(hsum, 0);

 const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-const __m256i scales =
+const __m256i scales = MM256_SET_M128I(sc128, sc128);

 const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
 __m256i hmask = mone;
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 }

 __m256 vd = _mm256_set1_ps(d);
-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

 }
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

 const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);

-const __m256i scale_l =
-const __m256i scale_h =
+const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));

 int64_t aux64;
 memcpy(&aux64, x[i].qh, 8);
 const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-const __m256i haux256 =
+const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);

 const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
 const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
 const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));

-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);

 }

@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

 }

-__m256i sumi =
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 }

@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
 const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);

-const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(
-const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(
+const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);

 const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
 const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
 sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));

-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
 }

 *s = hsum_float_8(acc);