llama_cpp 0.3.5 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
  void ggml_cuda_set_main_device(int main_device);
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
  void ggml_cuda_set_scratch_size(size_t scratch_size);
  void ggml_cuda_free_scratch(void);
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
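The new `ggml_cuda_set_mul_mat_q` toggle lets a caller choose whether the CUDA backend uses the quantized matrix-multiplication kernels. A minimal sketch of how a host application might flip the switch before building a graph; the `configure_cuda_backend` helper and the `use_mmq` flag are assumptions for illustration, only the setter itself comes from this diff:

```c
#include <stdbool.h>
#include "ggml-cuda.h"

// Hypothetical host-side setup: prefer the mul_mat_q kernels when requested,
// otherwise fall back to the default matrix-multiplication path.
static void configure_cuda_backend(bool use_mmq) {
    ggml_cuda_set_main_device(0);      // pick GPU 0
    ggml_cuda_set_mul_mat_q(use_mmq);  // new in 0.3.6: enable/disable mul_mat_q kernels
}
```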
@@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
  // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224

  GGML_ASSERT(ne00 == ne10);
- GGML_ASSERT(ne02 == ne12);
+ // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+ GGML_ASSERT(ne03 == ne13);

  if (ggml_is_contiguous(src0) &&
  ggml_is_contiguous(src1) &&
@@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
  initWithDevice:ctx->device transposeLeft:false transposeRight:true
  resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];

- // we need to do ne02 multiplications
+ // we need to do ne12 multiplications
  // TODO: is there a way to do this in parallel - currently very slow ..
  // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
- for (int64_t i02 = 0; i02 < ne02; ++i02) {
- size_t offs_src0_cur = offs_src0 + i02*nb02;
+ for (int64_t i02 = 0; i02 < ne12; ++i02) {
+ size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
  size_t offs_src1_cur = offs_src1 + i02*nb12;
  size_t offs_dst_cur = offs_dst + i02*nb2;

@@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
  switch (src0t) {
  case GGML_TYPE_F16:
  {
- GGML_ASSERT(ne02 == ne12);
-
  nth0 = 64;
  nth1 = 1;
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
- [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
- [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
- [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
- [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
- [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
- [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
- [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
- [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
- [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
- [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+ [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+ [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+ [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+ [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+ [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+ [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];

  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
  src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
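The Metal changes relax the old `ne02 == ne12` assumption: the multiplication loop now runs over the `ne12` batches of `src1`, and each batch maps back to a `src0` batch by integer division, which is what allows grouped-query-style broadcasting along dimension 2. A small C sketch of just that index mapping; the function name and parameters are illustrative, not part of the diff:

```c
#include <stdint.h>

// Map a src1 batch index to the corresponding src0 batch index when src0 has
// fewer batches (ne02) than src1 (ne12) and ne12 is a multiple of ne02.
// This mirrors the `i02/(ne12/ne02)` expression used in the Metal path.
static int64_t src0_batch_index(int64_t i12, int64_t ne02, int64_t ne12) {
    return i12 / (ne12 / ne02);  // e.g. ne02 = 8, ne12 = 32 -> groups of 4 share one src0 batch
}
```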
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
  device float * dst,
  constant int64_t & ne00,
  constant int64_t & ne01,
+ constant int64_t & ne02,
  constant uint64_t & nb00,
  constant uint64_t & nb01,
  constant uint64_t & nb02,
  constant int64_t & ne10,
  constant int64_t & ne11,
+ constant int64_t & ne12,
  constant uint64_t & nb10,
  constant uint64_t & nb11,
  constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
  const int64_t r1 = tgpig.y;
  const int64_t im = tgpig.z;

- device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+ device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
  device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);

  sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
  }
  }

+
  kernel void kernel_alibi_f32(
  device const float * src0,
  device float * dst,
@@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml

  static struct ggml_tensor * ggml_new_tensor_impl(
  struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t* ne,
- void* data) {
+ enum ggml_type type,
+ int n_dims,
+ const int64_t * ne,
+ void * data) {
+
+ assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

  size_t data_size = 0;

@@ -4648,22 +4650,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3

  struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t * ne) {
+ enum ggml_type type,
+ int n_dims,
+ const int64_t * ne) {
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
  }

  struct ggml_tensor * ggml_new_tensor_1d(
  struct ggml_context * ctx,
- enum ggml_type type,
+ enum ggml_type type,
  int64_t ne0) {
  return ggml_new_tensor(ctx, type, 1, &ne0);
  }

  struct ggml_tensor * ggml_new_tensor_2d(
  struct ggml_context * ctx,
- enum ggml_type type,
+ enum ggml_type type,
  int64_t ne0,
  int64_t ne1) {
  const int64_t ne[2] = { ne0, ne1 };
@@ -4672,7 +4674,7 @@ struct ggml_tensor * ggml_new_tensor_2d(

  struct ggml_tensor * ggml_new_tensor_3d(
  struct ggml_context * ctx,
- enum ggml_type type,
+ enum ggml_type type,
  int64_t ne0,
  int64_t ne1,
  int64_t ne2) {
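Beyond the whitespace realignment, `ggml_new_tensor_impl` now asserts that `n_dims` lies in `[1, GGML_MAX_DIMS]` before sizing the tensor, so an out-of-range dimension count fails fast in debug builds. A hedged sketch of the kind of call the new check governs; `make_matrix` and the shapes are illustrative only:

```c
#include "ggml.h"

// Sketch: the added assert rejects dimension counts outside [1, GGML_MAX_DIMS].
// `ctx` is assumed to come from ggml_init(); only the n_dims check is the point here.
struct ggml_tensor * make_matrix(struct ggml_context * ctx) {
    const int64_t ne[2] = { 8, 8 };
    return ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);  // 1 <= 2 <= GGML_MAX_DIMS: passes
    // ggml_new_tensor(ctx, GGML_TYPE_F32, 5, ne) would now trip the assert in a debug build
}
```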
@@ -6238,6 +6240,27 @@ struct ggml_tensor * ggml_reshape_4d(

  // ggml_view_1d

+ static struct ggml_tensor * ggml_view_tensor_offset(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_dims,
+ const int64_t * ne,
+ size_t offset) {
+ // don't calculate an offset from an unallocated tensor
+ void * data = NULL;
+ if (a->data != NULL) {
+ data = (char *) a->data + offset;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+
+ ggml_format_name(result, "%s (view)", a->name);
+
+ ggml_set_op_params(result, &offset, sizeof(offset));
+
+ return result;
+ }
+
  struct ggml_tensor * ggml_view_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -6250,10 +6273,7 @@ struct ggml_tensor * ggml_view_1d(
  is_node = true;
  }

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
- ggml_format_name(result, "%s (view)", a->name);
-
- ggml_set_op_params(result, &offset, sizeof(offset));
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);

  result->op = GGML_OP_VIEW;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6280,10 +6300,7 @@ struct ggml_tensor * ggml_view_2d(

  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
- ggml_format_name(result, "%s (view)", a->name);
-
- ggml_set_op_params(result, &offset, sizeof(offset));
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = result->nb[1]*ne1;
@@ -6316,10 +6333,7 @@ struct ggml_tensor * ggml_view_3d(

  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
- ggml_format_name(result, "%s (view)", a->name);
-
- ggml_set_op_params(result, &offset, sizeof(offset));
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = nb2;
@@ -6354,10 +6368,7 @@ struct ggml_tensor * ggml_view_4d(

  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };

- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
- ggml_format_name(result, "%s (view)", a->name);
-
- ggml_set_op_params(result, &offset, sizeof(offset));
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);

  result->nb[1] = nb1;
  result->nb[2] = nb2;
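`ggml_view_1d` through `ggml_view_4d` now share the new `ggml_view_tensor_offset` helper, which also guards against computing a data pointer from a tensor whose storage has not been allocated yet. The public view API is unchanged; a minimal sketch of a caller, with the `second_row_view` helper being an illustrative assumption:

```c
#include "ggml.h"

// Take a 1-D view of the second row of a 2-D tensor. Internally this now goes
// through ggml_view_tensor_offset; behaviour for callers is the same as before.
struct ggml_tensor * second_row_view(struct ggml_context * ctx, struct ggml_tensor * t2d) {
    const size_t offset = t2d->nb[1];                   // byte offset of row 1
    return ggml_view_1d(ctx, t2d, t2d->ne[0], offset);
}
```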
@@ -6741,6 +6752,18 @@ struct ggml_tensor * ggml_rope_inplace(
  return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
  }

+ struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ float freq_base,
+ float freq_scale) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+ }
+
  struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1170,7 +1170,18 @@ extern "C" {
  int mode,
  int n_ctx);

- // custom RoPE, in-place, returns view(a)
+ // custom RoPE
+ GGML_API struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ float freq_base,
+ float freq_scale);
+
+ // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
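The header now exposes a non-in-place `ggml_rope_custom` next to the existing `ggml_rope_custom_inplace`; both take a frequency base and scale, the only difference being whether a new tensor or a view of `a` is returned. A minimal sketch of calling the new entry point; the wrapper name and the parameter values are placeholders, not part of the diff:

```c
#include "ggml.h"

// Apply RoPE with a custom frequency base/scale without modifying `cur` in place.
// The 10000.0f base and 0.5f scale are example values for the sketch.
struct ggml_tensor * apply_custom_rope(struct ggml_context * ctx, struct ggml_tensor * cur,
                                       int n_past, int n_rot, int n_ctx) {
    const float freq_base  = 10000.0f;  // conventional default base
    const float freq_scale = 0.5f;      // e.g. linear scaling for extended context
    return ggml_rope_custom(ctx, cur, n_past, n_rot, /*mode=*/0, n_ctx, freq_base, freq_scale);
}
```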
@@ -39,6 +39,8 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
  //
  // 2-6 bit quantization in super-blocks
  //
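`MM256_SET_M128I(a, b)` builds a 256-bit vector with `b` in the low 128 bits and `a` in the high 128 bits using only `_mm256_castsi128_si256` and `_mm256_insertf128_si256`, avoiding `_mm256_set_m128i`, which some older compilers do not provide. The remaining hunks in this file are a mechanical substitution of the macro for that intrinsic. A small self-contained check of the equivalence, assuming an AVX-capable build:

```c
#include <immintrin.h>
#include <stdio.h>

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const __m128i lo = _mm_set1_epi32(1);
    const __m128i hi = _mm_set1_epi32(2);
    const __m256i v  = MM256_SET_M128I(hi, lo);   // low lane = lo, high lane = hi

    int out[8];
    _mm256_storeu_si256((__m256i *)out, v);
    printf("%d %d\n", out[0], out[7]);            // expect: 1 2
    return 0;
}
```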
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
- const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

  __m256i sumi = _mm256_setzero_si256();

@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));

  // sumf += -dmin * summs in 32bits*8
- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);

  const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
  const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  }

  // sumf += dall * isum - dmin * summs in 32bits
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
  }

@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  summs += dmin * smin;

  const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
- const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
- const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+ const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+ const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);

  const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
  const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));

- const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
- const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
- const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
- const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+ const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+ const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+ const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+ const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));

  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
- const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

  // high bit
  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  }

  // multiply with block scale and accumulate
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);

  }
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  aux16[0] = a & 0x0f0f;
  aux16[1] = (a >> 4) & 0x0f0f;

- const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
- const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+ const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+ const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));

  memcpy(&aux64, x[i].hmask, 8);

  const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
- __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+ __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
  __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
  q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
  q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);

  // prepare low and high bits
- const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+ const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
  const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
  const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);

@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

  p16_0 = _mm_add_epi32(p16_0, p16_2);
  p16_1 = _mm_add_epi32(p16_1, p16_3);
- __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+ __m256i p16 = MM256_SET_M128I(p16_1, p16_0);

  // multiply with block scale and accumulate
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);

  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
- const __m256i scales = _mm256_set_m128i(sc128, sc128);
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);

  __m256i sumi = _mm256_setzero_si256();

@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  }

  __m256 vd = _mm256_set1_ps(d);
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

  }
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

  const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
  const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
- acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);

  const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
  const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
- acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);

  }

@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  summs += dmin * _mm_extract_epi32(hsum, 0);

  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
- const __m256i scales = _mm256_set_m128i(sc128, sc128);
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);

  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
  __m256i hmask = mone;
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  }

  __m256 vd = _mm256_set1_ps(d);
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

  }
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

  const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);

- const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
- const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+ const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+ const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));

  int64_t aux64;
  memcpy(&aux64, x[i].qh, 8);
  const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
- const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+ const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);

  const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
  const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
  const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));

- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);

  }

@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

  }

- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
  }

@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
  const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);

- const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
- const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+ const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+ const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);

  const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
  const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
  sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));

- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);

  }

  *s = hsum_float_8(acc);