llama_cpp 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: a6bf717ec1012d78b7d83f3f7a7546f589fbf368c1b2babc69a99fd28a5d9ff3
- data.tar.gz: 6ab2e2ae4b6410f32890a86b7ac2dbb93ab9e2f43888158b7cbfd9b16f435447
+ metadata.gz: e4b9a70ca3137fb187c1455291828001086e373db7d9189f7f8d45f0d252b0dc
+ data.tar.gz: 22d67fa3d1c71d73569735876aebe953038bb0465a67b07ea991dc8568d11bac
  SHA512:
- metadata.gz: cd1ae63e518a422dbe3a281a598b18b9397fdf880867f92bad20e56b5a60756a1a929a62879f7aed0c7c24012b87b85353e175c773aeed4f8d87294ba0422cb1
- data.tar.gz: 2828321d0589ac16713745b2770844d5c6fed848ff0efed90304370152650a8e0619657a91184f74c402eb9351800ac3517c20f775faf52db91331d95ac1c87d
+ metadata.gz: 3767e7950004aba7980a27dbffaec2c360a18295e845b58ab647eff4b9f90515e47c646e48e5d75cac261908415602df50908e429fca0637664e93b2efd7dc1a
+ data.tar.gz: b08e00960ab036fe7ac7778dd33a5a72795153cd7c8beea642b5422da41575a19ea41e1b865e25d16f36afe2879ff4b5b3f303d49598c30888a95ecf459501da
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
  ## [Unreleased]

+ ## [0.0.2] - 2023-04-02
+
+ - Bump bundled llama.cpp from master-2a98bc1 to master-5b70e7d
+ - Add n_threads argument to the generate method.
+
  ## [0.0.1] - 2023-04-02

  - Initial release
data/README.md CHANGED
@@ -20,6 +20,8 @@ If bundler is not being used to manage dependencies, install the gem by executin
  ## Usage

+ Prepare a quantized model file by referring to [the usage section of the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage).
+
  ```ruby
  require 'llama_cpp'

@@ -79,6 +79,19 @@ static int sched_yield (void) {
  typedef void* thread_ret_t;
  #endif

+ // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+ #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+ #ifndef __FMA__
+ #define __FMA__
+ #endif
+ #ifndef __F16C__
+ #define __F16C__
+ #endif
+ #ifndef __SSE3__
+ #define __SSE3__
+ #endif
+ #endif
+
  #ifdef __HAIKU__
  #define static_assert(cond, msg) _Static_assert(cond, msg)
  #endif
@@ -172,8 +185,13 @@ typedef double ggml_float;

  #ifdef __F16C__

+ #ifdef _MSC_VER
+ #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+ #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+ #else
  #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
  #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+ #endif

  #elif defined(__POWER9_VECTOR__)

@@ -443,6 +461,39 @@ static inline __m128i packNibbles( __m256i bytes )
  __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
  return _mm_packus_epi16( r0, r1 );
  }
+ #elif __AVX__
+ static inline __m128i bytesFromNibbles( const uint8_t* rsi )
+ {
+ // Load 8 bytes from memory
+ __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+
+ // Expand bytes into uint16_t values
+ __m128i bytes = _mm_cvtepu8_epi16( tmp );
+
+ // Unpack values into individual bytes
+ const __m128i lowMask = _mm_set1_epi8( 0xF );
+ __m128i high = _mm_andnot_si128( lowMask, bytes );
+ __m128i low = _mm_and_si128( lowMask, bytes );
+ high = _mm_slli_epi16( high, 4 );
+ bytes = _mm_or_si128( low, high );
+ return bytes;
+ }
+
+ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+ {
+ // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+ const __m128i lowByte = _mm_set1_epi16( 0xFF );
+ __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+ __m128i low = _mm_and_si128( lowByte, bytes1 );
+ high = _mm_srli_epi16( high, 4 );
+ bytes1 = _mm_or_si128( low, high );
+ high = _mm_andnot_si128( lowByte, bytes2 );
+ low = _mm_and_si128( lowByte, bytes2 );
+ high = _mm_srli_epi16( high, 4 );
+ bytes2 = _mm_or_si128( low, high );
+
+ return _mm_packus_epi16( bytes1, bytes2);
+ }
  #endif

  // method 5
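
The SSE helpers added above mirror the existing AVX2 versions: each quantized block stores two 4-bit values per byte, low nibble first, and bytesFromNibbles/packNibbles convert between that packed form and one-value-per-byte vectors. A plain-C illustration of the layout they operate on (an editorial sketch, not code from this diff):

```c
#include <stdint.h>

// Pack two 4-bit values (0..15) into one byte, low nibble first,
// matching the layout bytesFromNibbles()/packNibbles() work with.
static inline uint8_t pack_nibbles(uint8_t lo, uint8_t hi) {
    return (uint8_t)((lo & 0x0F) | ((hi & 0x0F) << 4));
}

// Unpack one byte back into its two 4-bit values.
static inline void unpack_nibbles(uint8_t b, uint8_t *lo, uint8_t *hi) {
    *lo = b & 0x0F;
    *hi = b >> 4;
}
```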
@@ -491,8 +542,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
  const uint8_t vi0 = (int8_t)roundf(v0) + 8;
  const uint8_t vi1 = (int8_t)roundf(v1) + 8;

- assert(vi0 >= 0 && vi0 < 16);
- assert(vi1 >= 0 && vi1 < 16);
+ assert(vi0 < 16);
+ assert(vi1 < 16);

  pp[l/2] = vi0 | (vi1 << 4);
  }
@@ -546,10 +597,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  }
  }
  #elif __ARM_NEON
- uint8_t pp[QK/2];
  for (int i = 0; i < nb; i++) {
- float amax = 0.0f; // absolute max
-
  float32x4_t srcv [8];
  float32x4_t asrcv[8];
  float32x4_t amaxv[8];
@@ -561,7 +609,8 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
  for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);

- amax = MAX(
+ // absolute max
+ const float amax = MAX(
  MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
  MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));

@@ -575,11 +624,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
  const int32x4_t vi = vcvtq_s32_f32(vf);

- pp[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
- pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+ y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+ y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
  }
-
- memcpy(y[i].qs, pp, sizeof(pp));
  }
  #elif defined(__AVX2__)
  for (int i = 0; i < nb; i++) {
@@ -646,8 +693,81 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  __m128i res = packNibbles( i0 );
  _mm_storeu_si128( ( __m128i* )y[i].qs, res );
  }
+ #elif defined(__AVX__)
+ for (int i = 0; i < nb; i++) {
+ // Load elements into 4 AVX vectors
+ __m256 v0 = _mm256_loadu_ps( x );
+ __m256 v1 = _mm256_loadu_ps( x + 8 );
+ __m256 v2 = _mm256_loadu_ps( x + 16 );
+ __m256 v3 = _mm256_loadu_ps( x + 24 );
+ x += 32;
+
+ // Compute max(abs(e)) for the block
+ const __m256 signBit = _mm256_set1_ps( -0.0f );
+ __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+ __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+ max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+ max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+ const float maxScalar = _mm_cvtss_f32( max4 );
+
+ // Quantize these floats
+ const float d = maxScalar / 7.0f;
+ y[i].d = d;
+ const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
+ const __m256 mul = _mm256_set1_ps( id );
+
+ // Apply the multiplier
+ v0 = _mm256_mul_ps( v0, mul );
+ v1 = _mm256_mul_ps( v1, mul );
+ v2 = _mm256_mul_ps( v2, mul );
+ v3 = _mm256_mul_ps( v3, mul );
+
+ // Round to nearest integer
+ v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+ v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+ v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+ v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+ // Convert floats to integers
+ __m256i i0 = _mm256_cvtps_epi32( v0 );
+ __m256i i1 = _mm256_cvtps_epi32( v1 );
+ __m256i i2 = _mm256_cvtps_epi32( v2 );
+ __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+ // Since we don't have in AVX some necessary functions,
+ // we split the registers in half and call AVX2 analogs from SSE
+ __m128i ni0 = _mm256_castsi256_si128( i0 );
+ __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+ __m128i ni2 = _mm256_castsi256_si128( i1 );
+ __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+ __m128i ni4 = _mm256_castsi256_si128( i2 );
+ __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+ __m128i ni6 = _mm256_castsi256_si128( i3 );
+ __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+ // Convert int32 to int16
+ ni0 = _mm_packs_epi32( ni0, ni1 );
+ ni2 = _mm_packs_epi32( ni2, ni3 );
+ ni4 = _mm_packs_epi32( ni4, ni5 );
+ ni6 = _mm_packs_epi32( ni6, ni7 );
+ // Convert int16 to int8
+ ni0 = _mm_packs_epi16( ni0, ni2 );
+ ni4 = _mm_packs_epi16( ni4, ni6 );
+
+ // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
+ const __m128i off = _mm_set1_epi8( 8);
+ ni0 = _mm_add_epi8( ni0, off );
+ ni4 = _mm_add_epi8( ni4, off );
+
+ // Compress the vector into 4 bit/value, and store
+ __m128i res = packNibbles( ni0, ni4 );
+ _mm_storeu_si128( ( __m128i* )y[i].qs, res );
+ }
  #elif defined(__wasm_simd128__)
- uint8_t pp[QK/2];
  for (int i = 0; i < nb; i++) {
  float amax = 0.0f; // absolute max

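
The new __AVX__ branch above reproduces what the q4_0 reference quantizer does for each 32-element block: find the absolute maximum, derive the scale d = amax / 7, and store round(x / d) + 8 as 4-bit values, two per byte. A scalar sketch of that per-block logic, offered only as an illustration (the authoritative version is quantize_row_q4_0_reference in ggml.c):

```c
#include <math.h>
#include <stdint.h>

#define QK 32  // block size used by the q4_0 format in this version

// Illustrative scalar quantization of one q4_0 block.
static void quantize_block_q4_0(const float *x, float *d_out, uint8_t qs[QK / 2]) {
    float amax = 0.0f;                          // absolute max of the block
    for (int l = 0; l < QK; l++) {
        const float v = fabsf(x[l]);
        if (v > amax) amax = v;
    }

    const float d  = amax / 7.0f;               // scale: map [-amax, amax] onto [-7, 7]
    const float id = d ? 1.0f / d : 0.0f;
    *d_out = d;

    for (int l = 0; l < QK; l += 2) {
        const uint8_t vi0 = (uint8_t)((int8_t)roundf(x[l + 0] * id) + 8);
        const uint8_t vi1 = (uint8_t)((int8_t)roundf(x[l + 1] * id) + 8);
        qs[l / 2] = vi0 | (vi1 << 4);           // two 4-bit quants per byte
    }
}
```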
@@ -676,11 +796,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
  const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);

- pp[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
- pp[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
+ y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
+ y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
  }
-
- memcpy(y[i].qs, pp, sizeof(pp));
  }
  #else
  // scalar
@@ -719,8 +837,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
  const uint8_t vi0 = roundf(v0);
  const uint8_t vi1 = roundf(v1);

- assert(vi0 >= 0 && vi0 < 16);
- assert(vi1 >= 0 && vi1 < 16);
+ assert(vi0 < 16);
+ assert(vi1 < 16);

  pp[l/2] = vi0 | (vi1 << 4);
  }
@@ -732,11 +850,11 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
  static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) {
  assert(k % QK == 0);

- #if defined(__AVX2__)
  const int nb = k / QK;

  block_q4_1 * restrict y = vy;

+ #if defined(__AVX2__)
  for (int i = 0; i < nb; i++) {
  // Load elements into 4 AVX vectors
  __m256 v0 = _mm256_loadu_ps( x );
@@ -810,6 +928,41 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
  __m128i res = packNibbles( i0 );
  _mm_storeu_si128( ( __m128i* )y[i].qs, res );
  }
+ #elif __ARM_NEON
+ for (int i = 0; i < nb; i++) {
+ float32x4_t srcv[8];
+ float32x4_t minv[8];
+ float32x4_t maxv[8];
+
+ for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+
+ for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
+ for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
+ for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
+
+ for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
+ for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
+ for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
+
+ const float min = vminvq_f32(minv[0]);
+ const float max = vmaxvq_f32(maxv[0]);
+
+ const float d = (max - min) / ((1 << 4) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ y[i].d = d;
+ y[i].m = min;
+
+ const float32x4_t minv0 = vdupq_n_f32(min);
+
+ for (int l = 0; l < 8; l++) {
+ const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
+ const int32x4_t vi = vcvtq_s32_f32(v);
+
+ y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+ y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+ }
+ }
  #else
  // scalar
  quantize_row_q4_1_reference(x, vy, k);
@@ -970,6 +1123,50 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
  }
  }
  }
+ #elif defined(__ARM_NEON)
+ for (int i = 0; i < nb; i++) {
+ const float32x4_t vd = vdupq_n_f32(x[i].d);
+ const float32x4_t vm = vdupq_n_f32(x[i].m);
+
+ const uint8_t * restrict pp = x[i].qs;
+
+ for (int l = 0; l < QK; l += 16) {
+ // Load 16x4-bit integers into 8x8-bit integers
+ const uint8x8_t v8 = vld1_u8(pp + l/2);
+
+ // Expand 4-bit qs to 8-bit bytes
+ const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+ const uint8x8_t v1 = vshr_n_u8(v8, 4);
+
+ // Interleave and combine
+ const uint8x8_t vx_0 = vzip1_u8(v0, v1);
+ const uint8x8_t vx_1 = vzip2_u8(v0, v1);
+
+ const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
+
+ // convert to 2x uint16x8_t
+ const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq));
+ const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq));
+
+ // convert to 4x float32x4_t
+ const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
+ const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
+ const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
+ const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
+
+ // multiply by d and add m
+ const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
+ const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
+ const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
+ const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
+
+ // Store
+ vst1q_f32(y + i*QK + l + 0, r0);
+ vst1q_f32(y + i*QK + l + 4, r1);
+ vst1q_f32(y + i*QK + l + 8, r2);
+ vst1q_f32(y + i*QK + l + 12, r3);
+ }
+ }
  #else
  for (int i = 0; i < nb; i++) {
  const float d = x[i].d;
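
The NEON dequantization path added above computes, per element, y = d * q + m, where q is the unpacked 4-bit value. For readers following the vector code, here is a scalar sketch of one q4_1 block (illustrative only; the #else branch that continues below is the actual scalar fallback):

```c
#include <stdint.h>

#define QK 32  // block size used by the q4_1 format in this version

// Illustrative scalar dequantization of one q4_1 block: y = d*q + m.
static void dequantize_block_q4_1(float d, float m, const uint8_t qs[QK / 2], float *y) {
    for (int l = 0; l < QK; l += 2) {
        const uint8_t vi = qs[l / 2];
        y[l + 0] = d * (vi & 0x0F) + m;   // low nibble
        y[l + 1] = d * (vi >> 4)   + m;   // high nibble
    }
}
```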
@@ -1207,7 +1404,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
  _mm256_storeu_ps(arr, y);

  for (int i = 0; i < 8; i++)
- x[i] = GGML_FP16_TO_FP32(arr[i]);
+ x[i] = GGML_FP32_TO_FP16(arr[i]);
  }
  #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
  #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -1636,7 +1833,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  const block_q4_0 * restrict x = vx;
  const block_q4_0 * restrict y = vy;

- ggml_float sumf = 0.0;
+ float sumf = 0.0;

  #if defined(__ARM_NEON)
  float sum0 = 0.0f;
@@ -1731,7 +1928,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  #endif
  }

- sumf = (ggml_float)(sum0 + sum1);
+ sumf = sum0 + sum1;

  #elif defined(__AVX512F__)
  // Initialize accumulator with zeros
  __m512 acc0 = _mm512_setzero_ps();
@@ -1739,7 +1936,6 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest

  const int superblock_size = 8;
  const int superblock_count = nb / superblock_size;
- const int remainder = nb % superblock_size;

  for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
  int i = superblock_ix * superblock_size;
@@ -1766,6 +1962,10 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  __m256 acc = _mm256_setzero_ps();

  // Main loop
+ // TODO: figure a way to do this in a portable way
+ #ifdef __GNUC__
+ #pragma GCC unroll 16
+ #endif
  for (int i = 0; i < nb; ++i) {
  // Compute combined scale for the block
  const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
@@ -1779,20 +1979,21 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  bx = _mm256_sub_epi8( bx, off );
  by = _mm256_sub_epi8( by, off );

- // Sign-extend first 16 signed bytes into int16_t
- __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
- __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
- // Compute products of int16_t integers, add pairwise
- __m256i i32 = _mm256_madd_epi16( x16, y16 );
+ // Get absolute values of x vectors
+ const __m256i ax = _mm256_sign_epi8(bx, bx);

- // Sign-extend last 16 signed bytes into int16_t vectors
- x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
- y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
- // Accumulate products of int16_t integers
- i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) );
+ // Sign the values of the y vectors
+ const __m256i sy = _mm256_sign_epi8(by, bx);
+
+ // Perform multiplication and create 16-bit values
+ const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+
+ const __m256i ones = _mm256_set1_epi16(1);
+ const __m256i i32 = _mm256_madd_epi16(ones, dot);

  // Convert int32_t to float
- __m256 p = _mm256_cvtepi32_ps( i32 );
+ const __m256 p = _mm256_cvtepi32_ps( i32 );
+
  // Apply the scale, and accumulate
  acc = _mm256_fmadd_ps( d, p, acc );
  }
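
The rewritten AVX2 inner loop above leans on the identity abs(x) * sign(y, x) == x * y, which lets a single _mm256_maddubs_epi16 (unsigned × signed) replace the previous pair of sign-extensions and _mm256_madd_epi16 calls. A scalar model of that trick, written here only to make the reasoning explicit (not code from the diff):

```c
#include <stdint.h>
#include <stdlib.h>

// Scalar model of _mm_sign_epi8(a, b): negate a where b < 0, zero it where b == 0.
static inline int8_t sign_epi8(int8_t a, int8_t b) {
    return (int8_t)(b < 0 ? -a : (b == 0 ? 0 : a));
}

// For 4-bit quants offset into [-8, 7]: abs(x) * sign_epi8(y, x) == x * y,
// and abs(x) fits the unsigned first operand that maddubs requires.
static inline int16_t dot_term(int8_t x, int8_t y) {
    const uint8_t ax = (uint8_t)abs(x);   // unsigned operand, as maddubs expects
    const int8_t  sy = sign_epi8(y, x);   // signed operand, now carrying x's sign
    return (int16_t)(ax * sy);            // equals (int16_t)(x * y)
}
```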
@@ -1803,6 +2004,52 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
  res = _mm_add_ss( res, _mm_movehdup_ps( res ) );

+ sumf = _mm_cvtss_f32( res );
+ #elif defined(__AVX__)
+ // Initialize accumulator with zeros
+ __m256 acc = _mm256_setzero_ps();
+
+ // Main loop
+ for (int i = 0; i < nb; ++i) {
+ // Compute combined scale for the block
+ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+
+ __m128i i32[2];
+ for (int j = 0; j < 2; ++j) {
+ // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
+ __m128i bx = bytesFromNibbles( x[i].qs + 8*j );
+ __m128i by = bytesFromNibbles( y[i].qs + 8*j );
+
+ // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+ const __m128i off = _mm_set1_epi8( 8 );
+ bx = _mm_sub_epi8( bx, off );
+ by = _mm_sub_epi8( by, off );
+
+ // Get absolute values of x vectors
+ const __m128i ax = _mm_sign_epi8(bx, bx);
+
+ // Sign the values of the y vectors
+ const __m128i sy = _mm_sign_epi8(by, bx);
+
+ // Perform multiplication and create 16-bit values
+ const __m128i dot = _mm_maddubs_epi16(ax, sy);
+
+ const __m128i ones = _mm_set1_epi16(1);
+ i32[j] = _mm_madd_epi16(ones, dot);
+ }
+
+ // Convert int32_t to float
+ __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
+ // Apply the scale, and accumulate
+ acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
+ }
+
+ // Return horizontal sum of the acc vector
+ __m128 res = _mm256_extractf128_ps( acc, 1 );
+ res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
+ res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
+ res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
+
  sumf = _mm_cvtss_f32( res );
  #elif defined(__wasm_simd128__)
  // wasm simd
@@ -1944,7 +2191,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
  // Compute cross scales for the block
  const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
  const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
- const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+ const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0xAA /* 0b10101010 */ );

  // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
  __m256i bx = bytesFromNibbles( x[i].qs );
@@ -1990,6 +2237,45 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
  res = _mm_add_ss( res, _mm_movehdup_ps( res ) );

  sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+ #elif defined(__ARM_NEON)
+ float sum00 = 0.0f;
+ float sum01 = 0.0f;
+ float sum10 = 0.0f;
+ float sum11 = 0.0f;
+
+ for (int i = 0; i < nb; ++i) {
+ const block_q4_1 * restrict x0 = &x[i + 0];
+ const block_q4_1 * restrict y0 = &y[i + 0];
+
+ const uint8x16_t m4b = vdupq_n_u8(0xf);
+
+ const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+ const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+
+ // and with 0xf
+ const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
+ const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
+
+ const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
+ const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
+
+ // dot product into uint16x8_t
+ const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+ const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+
+ const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+ const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+ const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
+ const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+
+ sum00 += x0->m*y0->m;
+ sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
+ sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
+ sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+ }
+
+ sumf = QK*sum00 + sum01 + sum10 + sum11;
  #else
  // scalar
  for (int i = 0; i < nb; i++) {
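
The NEON q4_1 path added above expands each product (dx*qx + mx)(dy*qy + my) into four terms and accumulates them separately: dx*dy*sum(qx*qy), my*dx*sum(qx), mx*dy*sum(qy), and QK*mx*my, which is exactly what sum11, sum01, sum10, and sum00 track. A scalar sketch of that per-block algebra, provided as an illustration rather than code from the diff (qx/qy here hold the already unpacked 4-bit quants, 0..15):

```c
#include <stdint.h>

#define QK 32  // block size used by the q4_1 format in this version

// Illustrative scalar version of the decomposition used by the NEON path:
//   sum_l (dx*qx + mx)*(dy*qy + my)
//     = dx*dy*sum(qx*qy) + my*dx*sum(qx) + mx*dy*sum(qy) + QK*mx*my
static float dot_block_q4_1(float dx, float mx, const uint8_t qx[QK],
                            float dy, float my, const uint8_t qy[QK]) {
    uint32_t sum_x = 0, sum_y = 0, sum_xy = 0;
    for (int l = 0; l < QK; l++) {
        sum_x  += qx[l];
        sum_y  += qy[l];
        sum_xy += (uint32_t)qx[l] * qy[l];
    }
    return dx*dy*(float)sum_xy + my*dx*(float)sum_x + mx*dy*(float)sum_y + (float)QK*mx*my;
}
```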
@@ -2401,8 +2687,9 @@ struct ggml_context {
  void * mem_buffer;
  bool mem_buffer_owned;
  bool mem_buffer_mlocked;
+ bool no_alloc;

- int n_objects;
+ int n_objects;

  struct ggml_object * objects_begin;
  struct ggml_object * objects_end;
@@ -2619,6 +2906,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  static bool is_first_call = true;

  if (is_first_call) {
+ // initialize time system (required on Windows)
+ ggml_time_init();
+
  // initialize GELU, SILU and EXP F32 tables
  {
  const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
@@ -2684,6 +2974,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
  /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
  /*.mem_buffer_mlocked =*/ false,
+ /*.no_alloc =*/ params.no_alloc,
  /*.n_objects =*/ 0,
  /*.objects_begin =*/ NULL,
  /*.objects_end =*/ NULL,
@@ -2751,36 +3042,47 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
  return result;
  }

+ #ifdef __APPLE__
+ #define MLOCK_SUGGESTION \
+ "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+ "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+ #else
+ #define MLOCK_SUGGESTION \
+ "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+ #endif
+
  bool ggml_mlock_supported(void) {
  return GGML_MLOCK_SUPPORT;
  }

+ bool ggml_mlock(
+ struct ggml_context * ctx,
+ const void *opt_extra_addr,
+ size_t opt_extra_len,
+ char **err_p) {
+ // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
  #if GGML_MLOCK_SUPPORT
- #ifdef __APPLE__
- #define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
- "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
- #else
- #define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
- #endif
- bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
  if (ctx->mem_buffer_mlocked) {
  return true;
  }
- if (mlock(ctx->mem_buffer, ctx->mem_size)) {
- int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
- ctx->mem_size, strerror(errno));
- GGML_ASSERT(ret >= 0);
+ if (mlock(ctx->mem_buffer, ctx->mem_size) ||
+ (opt_extra_len &&
+ mlock(opt_extra_addr, opt_extra_len))) {
+ if ((*err_p = malloc(1024))) {
+ snprintf(*err_p, 1024,
+ "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+ ctx->mem_size + opt_extra_len,
+ strerror(errno));
+ }
  return false;
  }
  ctx->mem_buffer_mlocked = true;
  return true;
- }
  #else // GGML_MLOCK_SUPPORT
- bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
  *err_p = strdup("can't mlock because it's not supported on this system");
  return false;
- }
  #endif // GGML_MLOCK_SUPPORT
+ }

  ////////////////////////////////////////////////////////////////////////////////

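
The ggml_mlock signature above gains an optional extra address/length pair and now reports failures through a malloc'd message in *err_p instead of asprintf. A minimal caller sketch based on the signature shown in this hunk (illustrative only; passing NULL/0 for the extra region locks just the context buffer):

```c
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"

// Sketch of calling the updated API; ctx is an already-initialized ggml context.
static void try_mlock(struct ggml_context * ctx) {
    char * err = NULL;
    if (!ggml_mlock(ctx, /*opt_extra_addr=*/NULL, /*opt_extra_len=*/0, &err)) {
        fprintf(stderr, "ggml_mlock failed: %s", err ? err : "(no detail)\n");
        free(err);  // the error string is heap-allocated by ggml_mlock
    }
}
```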
@@ -2799,7 +3101,7 @@ struct ggml_tensor * ggml_new_tensor_impl(

  size_t size_needed = 0;

- if (data == NULL) {
+ if (data == NULL && !ctx->no_alloc) {
  size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
  for (int i = 1; i < n_dims; i++) {
  size_needed *= ne[i];
@@ -2883,7 +3185,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
  /*.perf_runs =*/ 0,
  /*.perf_cycles =*/ 0,
  /*.perf_time_us =*/ 0,
- /*.data =*/ data == NULL ? (void *)(result + 1) : data,
+ /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
  /*.pad =*/ { 0 },
  };

@@ -10146,6 +10448,7 @@ enum ggml_opt_result ggml_opt(
  struct ggml_init_params params_ctx = {
  .mem_size = 16*1024*1024,
  .mem_buffer = NULL,
+ .no_alloc = false,
  };

  ctx = ggml_init(params_ctx);
@@ -316,6 +316,7 @@ struct ggml_init_params {
  // memory pool
  size_t mem_size; // bytes
  void * mem_buffer; // if NULL, memory will be allocated internally
+ bool no_alloc; // don't allocate memory for the tensor data

  };

  void ggml_time_init(void); // call this once at the beginning of the program
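
The new no_alloc flag, combined with the ggml_new_tensor_impl changes earlier in this diff, lets a context describe tensors without carving their data out of mem_buffer (the data pointer stays NULL until it is assigned separately). A minimal sketch of setting the flag at context creation, illustrative only and using just the fields shown above:

```c
#include "ggml.h"

// Illustrative only: create a context whose tensors are metadata-only.
// With no_alloc = true, tensors created in this context get data == NULL
// instead of space reserved inside mem_buffer.
struct ggml_context * make_meta_context(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    return ggml_init(params);
}
```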
@@ -344,7 +345,11 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
  size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

  bool ggml_mlock_supported(void);
- bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
+ bool ggml_mlock(
+ struct ggml_context * ctx,
+ const void *opt_extra_addr,
+ size_t opt_extra_len,
+ char **err_p);

  struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,