llama_cpp 0.3.5 → 0.3.6

@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
  void ggml_cuda_set_main_device(int main_device);
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
  void ggml_cuda_set_scratch_size(size_t scratch_size);
  void ggml_cuda_free_scratch(void);
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
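
The CUDA header gains a runtime switch for the quantized matrix-multiplication ("mul_mat_q") kernels. A minimal sketch of how a caller might use it, assuming only the declaration shown above (the wrapper function below is illustrative, not part of the library):

```c
#include <stdbool.h>

// Declaration copied from the hunk above; normally it comes from the CUDA header.
void ggml_cuda_set_mul_mat_q(bool mul_mat_q);

// Hypothetical call site: choose whether quantized matrix multiplications are
// routed through the mul_mat_q kernels before evaluating a graph on CUDA.
static void configure_cuda_mul_mat_q(bool use_mul_mat_q) {
    ggml_cuda_set_mul_mat_q(use_mul_mat_q);
}
```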
@@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
  // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224

  GGML_ASSERT(ne00 == ne10);
- GGML_ASSERT(ne02 == ne12);
+ // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+ GGML_ASSERT(ne03 == ne13);

  if (ggml_is_contiguous(src0) &&
  ggml_is_contiguous(src1) &&
@@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
  initWithDevice:ctx->device transposeLeft:false transposeRight:true
  resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];

- // we need to do ne02 multiplications
+ // we need to do ne12 multiplications
  // TODO: is there a way to do this in parallel - currently very slow ..
  // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
- for (int64_t i02 = 0; i02 < ne02; ++i02) {
- size_t offs_src0_cur = offs_src0 + i02*nb02;
+ for (int64_t i02 = 0; i02 < ne12; ++i02) {
+ size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
  size_t offs_src1_cur = offs_src1 + i02*nb12;
  size_t offs_dst_cur = offs_dst + i02*nb2;

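The loop above now iterates over src1's batch dimension (ne12) and maps each batch back to a src0 batch with integer division, which is what lets a smaller set of src0 matrices be broadcast across more src1 batches (as needed for grouped-query attention). A standalone sketch of that index mapping, with purely illustrative batch counts:

```c
#include <stdio.h>

int main(void) {
    // Illustrative values only: ne02 = number of src0 batches, ne12 = number of src1 batches.
    const long ne02 = 2;
    const long ne12 = 8;

    // Same mapping as i02/(ne12/ne02)*nb02 in the hunk above, shown per batch.
    for (long i02 = 0; i02 < ne12; ++i02) {
        printf("src1 batch %ld uses src0 batch %ld\n", i02, i02/(ne12/ne02));
    }
    // src1 batches 0..3 reuse src0 batch 0, batches 4..7 reuse src0 batch 1
    return 0;
}
```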
@@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
  switch (src0t) {
  case GGML_TYPE_F16:
  {
- GGML_ASSERT(ne02 == ne12);
-
  nth0 = 64;
  nth1 = 1;
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
- [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
- [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
- [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
- [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
- [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
- [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
- [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
- [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
- [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
- [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+ [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+ [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+ [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+ [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+ [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+ [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];

  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
  src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
  device float * dst,
  constant int64_t & ne00,
  constant int64_t & ne01,
+ constant int64_t & ne02,
  constant uint64_t & nb00,
  constant uint64_t & nb01,
  constant uint64_t & nb02,
  constant int64_t & ne10,
  constant int64_t & ne11,
+ constant int64_t & ne12,
  constant uint64_t & nb10,
  constant uint64_t & nb11,
  constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
  const int64_t r1 = tgpig.y;
  const int64_t im = tgpig.z;

- device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+ device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
  device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);

  sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
  }
  }

+
  kernel void kernel_alibi_f32(
  device const float * src0,
  device float * dst,
@@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml

  static struct ggml_tensor * ggml_new_tensor_impl(
  struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t* ne,
- void* data) {
+ enum ggml_type type,
+ int n_dims,
+ const int64_t * ne,
+ void * data) {
+
+ assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

  size_t data_size = 0;

@@ -4648,22 +4650,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3

  struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t * ne) {
+ enum ggml_type type,
+ int n_dims,
+ const int64_t * ne) {
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
  }

  struct ggml_tensor * ggml_new_tensor_1d(
  struct ggml_context * ctx,
- enum ggml_type type,
+ enum ggml_type type,
  int64_t ne0) {
  return ggml_new_tensor(ctx, type, 1, &ne0);
  }

  struct ggml_tensor * ggml_new_tensor_2d(
  struct ggml_context * ctx,
- enum ggml_type type,
+ enum ggml_type type,
  int64_t ne0,
  int64_t ne1) {
  const int64_t ne[2] = { ne0, ne1 };
@@ -4672,7 +4674,7 @@ struct ggml_tensor * ggml_new_tensor_2d(

  struct ggml_tensor * ggml_new_tensor_3d(
  struct ggml_context * ctx,
- enum ggml_type type,
+ enum ggml_type type,
  int64_t ne0,
  int64_t ne1,
  int64_t ne2) {
@@ -6238,6 +6240,27 @@ struct ggml_tensor * ggml_reshape_4d(

  // ggml_view_1d

+ static struct ggml_tensor * ggml_view_tensor_offset(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_dims,
+ const int64_t * ne,
+ size_t offset) {
+ // don't calculate an offset from an unallocated tensor
+ void * data = NULL;
+ if (a->data != NULL) {
+ data = (char *) a->data + offset;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+
+ ggml_format_name(result, "%s (view)", a->name);
+
+ ggml_set_op_params(result, &offset, sizeof(offset));
+
+ return result;
+ }
+
  struct ggml_tensor * ggml_view_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -6250,10 +6273,7 @@ struct ggml_tensor * ggml_view_1d(
  is_node = true;
  }

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
- ggml_format_name(result, "%s (view)", a->name);
-
- ggml_set_op_params(result, &offset, sizeof(offset));
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);

  result->op = GGML_OP_VIEW;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6280,10 +6300,7 @@ struct ggml_tensor * ggml_view_2d(

  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
- ggml_format_name(result, "%s (view)", a->name);
-
- ggml_set_op_params(result, &offset, sizeof(offset));
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = result->nb[1]*ne1;
@@ -6316,10 +6333,7 @@ struct ggml_tensor * ggml_view_3d(

  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
- ggml_format_name(result, "%s (view)", a->name);
-
- ggml_set_op_params(result, &offset, sizeof(offset));
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = nb2;
@@ -6354,10 +6368,7 @@ struct ggml_tensor * ggml_view_4d(

  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
- ggml_format_name(result, "%s (view)", a->name);
-
- ggml_set_op_params(result, &offset, sizeof(offset));
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = nb2;
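
All four ggml_view_*d constructors now delegate to the new static ggml_view_tensor_offset helper, which only adds the byte offset to a->data when the tensor is actually allocated. A minimal usage sketch of the public view API, assuming the standard ggml initialization calls (the buffer size and tensor shape below are illustrative):

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // 8-element f32 tensor, then a 1-D view of its last 4 elements (offset in bytes).
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * v = ggml_view_1d(ctx, a, 4, 4*ggml_element_size(a));

    // With the refactor, the byte offset is applied inside ggml_view_tensor_offset,
    // and skipped entirely when a->data is NULL (e.g. with no_alloc contexts).
    (void) v;

    ggml_free(ctx);
    return 0;
}
```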
@@ -6741,6 +6752,18 @@ struct ggml_tensor * ggml_rope_inplace(
  return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
  }

+ struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ float freq_base,
+ float freq_scale) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+ }
+
  struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1170,7 +1170,18 @@ extern "C" {
  int mode,
  int n_ctx);

- // custom RoPE, in-place, returns view(a)
+ // custom RoPE
+ GGML_API struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ float freq_base,
+ float freq_scale);
+
+ // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
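
ggml_rope_custom is the non-inplace counterpart of ggml_rope_custom_inplace: same parameters, but it returns a new tensor rather than a view of `a`. A minimal sketch of a call site during graph construction; the rotary mode and scale below are illustrative (10000.0f and 1.0f reproduce the defaults seen in ggml_rope_inplace above):

```c
#include "ggml.h"

// Hypothetical helper: apply custom RoPE with an explicit frequency base and a
// linear context scale of 0.5 (e.g. for extended-context experiments).
static struct ggml_tensor * apply_scaled_rope(struct ggml_context * ctx,
                                              struct ggml_tensor  * cur,
                                              int n_past, int n_rot, int n_ctx) {
    return ggml_rope_custom(ctx, cur, n_past, n_rot, /*mode=*/0, n_ctx,
                            /*freq_base=*/10000.0f, /*freq_scale=*/0.5f);
}
```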
@@ -39,6 +39,8 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
  //
  // 2-6 bit quantization in super-blocks
  //
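
The remaining k-quants hunks below simply replace every `_mm256_set_m128i(hi, lo)` call with this new `MM256_SET_M128I` macro, which builds the same 256-bit value from two 128-bit halves using only `_mm256_castsi128_si256` and `_mm256_insertf128_si256` (presumably to support compilers that do not provide `_mm256_set_m128i`). A standalone sketch of what the macro does, with illustrative values (compile with AVX enabled):

```c
#include <immintrin.h>
#include <stdio.h>

// Copied from the hunk above: pack low half `b` and high half `a` into one __m256i.
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const __m128i lo = _mm_set1_epi32(1);
    const __m128i hi = _mm_set1_epi32(2);
    const __m256i v  = MM256_SET_M128I(hi, lo);   // same lane order as _mm256_set_m128i(hi, lo)

    int out[8];
    _mm256_storeu_si256((__m256i *) out, v);
    for (int i = 0; i < 8; ++i) {
        printf("%d ", out[i]);                    // prints: 1 1 1 1 2 2 2 2
    }
    printf("\n");
    return 0;
}
```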
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
- const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

  __m256i sumi = _mm256_setzero_si256();

@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));

  // sumf += -dmin * summs in 32bits*8
- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);

  const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
  const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  }

  // sumf += dall * isum - dmin * summs in 32bits
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
  }

@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  summs += dmin * smin;

  const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
- const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
- const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+ const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+ const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);

  const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
  const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));

- const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
- const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
- const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
- const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+ const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+ const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+ const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+ const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));

  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
- const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

  // high bit
  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  }

  // multiply with block scale and accumulate
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);

  }
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  aux16[0] = a & 0x0f0f;
  aux16[1] = (a >> 4) & 0x0f0f;

- const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
- const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+ const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+ const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));

  memcpy(&aux64, x[i].hmask, 8);

  const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
- __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+ __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
  __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
  q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
  q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);

  // prepare low and high bits
- const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+ const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
  const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
  const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);

@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

  p16_0 = _mm_add_epi32(p16_0, p16_2);
  p16_1 = _mm_add_epi32(p16_1, p16_3);
- __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+ __m256i p16 = MM256_SET_M128I(p16_1, p16_0);

  // multiply with block scale and accumulate
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);

  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
- const __m256i scales = _mm256_set_m128i(sc128, sc128);
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);

  __m256i sumi = _mm256_setzero_si256();

@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  }

  __m256 vd = _mm256_set1_ps(d);
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

  }
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

  const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
  const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
- acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);

  const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
  const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
- acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);

  }

@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  summs += dmin * _mm_extract_epi32(hsum, 0);

  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
- const __m256i scales = _mm256_set_m128i(sc128, sc128);
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);

  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
  __m256i hmask = mone;
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  }

  __m256 vd = _mm256_set1_ps(d);
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

  }
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

  const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);

- const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
- const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+ const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+ const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));

  int64_t aux64;
  memcpy(&aux64, x[i].qh, 8);
  const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
- const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+ const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);

  const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
  const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
  const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));

- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);

  }

@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

  }

- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
  }

@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
  const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);

- const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
- const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+ const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+ const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);

  const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
  const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
  sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));

- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
  }

  *s = hsum_float_8(acc);