llama_cpp 0.0.1 → 0.0.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: a6bf717ec1012d78b7d83f3f7a7546f589fbf368c1b2babc69a99fd28a5d9ff3
- data.tar.gz: 6ab2e2ae4b6410f32890a86b7ac2dbb93ab9e2f43888158b7cbfd9b16f435447
+ metadata.gz: e4b9a70ca3137fb187c1455291828001086e373db7d9189f7f8d45f0d252b0dc
+ data.tar.gz: 22d67fa3d1c71d73569735876aebe953038bb0465a67b07ea991dc8568d11bac
  SHA512:
- metadata.gz: cd1ae63e518a422dbe3a281a598b18b9397fdf880867f92bad20e56b5a60756a1a929a62879f7aed0c7c24012b87b85353e175c773aeed4f8d87294ba0422cb1
- data.tar.gz: 2828321d0589ac16713745b2770844d5c6fed848ff0efed90304370152650a8e0619657a91184f74c402eb9351800ac3517c20f775faf52db91331d95ac1c87d
+ metadata.gz: 3767e7950004aba7980a27dbffaec2c360a18295e845b58ab647eff4b9f90515e47c646e48e5d75cac261908415602df50908e429fca0637664e93b2efd7dc1a
+ data.tar.gz: b08e00960ab036fe7ac7778dd33a5a72795153cd7c8beea642b5422da41575a19ea41e1b865e25d16f36afe2879ff4b5b3f303d49598c30888a95ecf459501da
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
  ## [Unreleased]

+ ## [0.0.2] - 2023-04-02
+
+ - Bump bundled llama.cpp from master-2a98bc1 to master-5b70e7d
+ - Add n_threads argument to the generate method.
+
  ## [0.0.1] - 2023-04-02

  - Initial release
data/README.md CHANGED
@@ -20,6 +20,8 @@ If bundler is not being used to manage dependencies, install the gem by executin

  ## Usage

+ Prepare a quantized model file by referring to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage).
+
  ```ruby
  require 'llama_cpp'

@@ -79,6 +79,19 @@ static int sched_yield (void) {
  typedef void* thread_ret_t;
  #endif

+ // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+ #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+ #ifndef __FMA__
+ #define __FMA__
+ #endif
+ #ifndef __F16C__
+ #define __F16C__
+ #endif
+ #ifndef __SSE3__
+ #define __SSE3__
+ #endif
+ #endif
+
  #ifdef __HAIKU__
  #define static_assert(cond, msg) _Static_assert(cond, msg)
  #endif
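The added guard matters because downstream SIMD code keys off these macros. A minimal sketch of the pattern (my own illustration, not taken from ggml.c): once `__FMA__` is visible under MSVC with AVX2 enabled, code like this takes the fused path instead of a separate multiply and add.

```c
#include <immintrin.h>

// Hypothetical helper: use a fused multiply-add when the compiler advertises
// FMA support, otherwise fall back to mul + add.
static inline __m256 madd256_ps(__m256 a, __m256 b, __m256 c) {
#if defined(__FMA__)
    return _mm256_fmadd_ps(a, b, c);              // a*b + c in one instruction
#else
    return _mm256_add_ps(_mm256_mul_ps(a, b), c); // two instructions, extra rounding step
#endif
}
```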
@@ -172,8 +185,13 @@ typedef double ggml_float;

  #ifdef __F16C__

+ #ifdef _MSC_VER
+ #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+ #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+ #else
  #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
  #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+ #endif

  #elif defined(__POWER9_VECTOR__)

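Both branches compute the same conversion; only the intrinsic spelling differs. A small standalone sketch of the round trip the MSVC branch performs (my own example; assumes F16C is enabled, e.g. `-mf16c` with GCC/Clang or `/arch:AVX2` with MSVC):

```c
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

// Round-trip a float through IEEE half precision using the same F16C
// intrinsics that the MSVC branch of the macros relies on.
int main(void) {
    float value = 3.14159f;

    // FP32 -> FP16: the result is a 16-bit pattern in the low lane
    uint16_t half = (uint16_t)_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(value), 0), 0);

    // FP16 -> FP32
    float back = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(half)));

    printf("%f -> 0x%04x -> %f\n", value, half, back);  // small rounding error expected
    return 0;
}
```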
@@ -443,6 +461,39 @@ static inline __m128i packNibbles( __m256i bytes )
  __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
  return _mm_packus_epi16( r0, r1 );
  }
+ #elif __AVX__
+ static inline __m128i bytesFromNibbles( const uint8_t* rsi )
+ {
+ // Load 8 bytes from memory
+ __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+
+ // Expand bytes into uint16_t values
+ __m128i bytes = _mm_cvtepu8_epi16( tmp );
+
+ // Unpack values into individual bytes
+ const __m128i lowMask = _mm_set1_epi8( 0xF );
+ __m128i high = _mm_andnot_si128( lowMask, bytes );
+ __m128i low = _mm_and_si128( lowMask, bytes );
+ high = _mm_slli_epi16( high, 4 );
+ bytes = _mm_or_si128( low, high );
+ return bytes;
+ }
+
+ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+ {
+ // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+ const __m128i lowByte = _mm_set1_epi16( 0xFF );
+ __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+ __m128i low = _mm_and_si128( lowByte, bytes1 );
+ high = _mm_srli_epi16( high, 4 );
+ bytes1 = _mm_or_si128( low, high );
+ high = _mm_andnot_si128( lowByte, bytes2 );
+ low = _mm_and_si128( lowByte, bytes2 );
+ high = _mm_srli_epi16( high, 4 );
+ bytes2 = _mm_or_si128( low, high );
+
+ return _mm_packus_epi16( bytes1, bytes2);
+ }
  #endif

  // method 5
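For readers who do not think in SSE, here is a scalar sketch (my own illustration, not part of the patch) of what `bytesFromNibbles` and `packNibbles` do: each byte of a quantized block stores two 4-bit values, low nibble first.

```c
#include <stdint.h>

// Unpack 8 packed bytes into 16 separate nibble values (0..15).
static void bytes_from_nibbles_scalar(const uint8_t *src, uint8_t *dst) {
    for (int j = 0; j < 8; ++j) {
        dst[2*j + 0] = src[j] & 0x0F;        // low nibble: first value of the pair
        dst[2*j + 1] = (src[j] >> 4) & 0x0F; // high nibble: second value
    }
}

// Inverse operation: pack 16 nibble values back into 8 bytes.
static void pack_nibbles_scalar(const uint8_t *src, uint8_t *dst) {
    for (int j = 0; j < 8; ++j) {
        dst[j] = (uint8_t)((src[2*j + 0] & 0x0F) | ((src[2*j + 1] & 0x0F) << 4));
    }
}
```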
@@ -491,8 +542,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
  const uint8_t vi0 = (int8_t)roundf(v0) + 8;
  const uint8_t vi1 = (int8_t)roundf(v1) + 8;

- assert(vi0 >= 0 && vi0 < 16);
- assert(vi1 >= 0 && vi1 < 16);
+ assert(vi0 < 16);
+ assert(vi1 < 16);

  pp[l/2] = vi0 | (vi1 << 4);
  }
@@ -546,10 +597,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  }
  }
  #elif __ARM_NEON
- uint8_t pp[QK/2];
  for (int i = 0; i < nb; i++) {
- float amax = 0.0f; // absolute max
-
  float32x4_t srcv [8];
  float32x4_t asrcv[8];
  float32x4_t amaxv[8];
@@ -561,7 +609,8 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
  for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);

- amax = MAX(
+ // absolute max
+ const float amax = MAX(
  MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
  MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));

@@ -575,11 +624,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
  const int32x4_t vi = vcvtq_s32_f32(vf);

- pp[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
- pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+ y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+ y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
  }
-
- memcpy(y[i].qs, pp, sizeof(pp));
  }
  #elif defined(__AVX2__)
  for (int i = 0; i < nb; i++) {
@@ -646,8 +693,81 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  __m128i res = packNibbles( i0 );
  _mm_storeu_si128( ( __m128i* )y[i].qs, res );
  }
+ #elif defined(__AVX__)
+ for (int i = 0; i < nb; i++) {
+ // Load elements into 4 AVX vectors
+ __m256 v0 = _mm256_loadu_ps( x );
+ __m256 v1 = _mm256_loadu_ps( x + 8 );
+ __m256 v2 = _mm256_loadu_ps( x + 16 );
+ __m256 v3 = _mm256_loadu_ps( x + 24 );
+ x += 32;
+
+ // Compute max(abs(e)) for the block
+ const __m256 signBit = _mm256_set1_ps( -0.0f );
+ __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+ __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+ max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+ max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+ const float maxScalar = _mm_cvtss_f32( max4 );
+
+ // Quantize these floats
+ const float d = maxScalar / 7.0f;
+ y[i].d = d;
+ const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
+ const __m256 mul = _mm256_set1_ps( id );
+
+ // Apply the multiplier
+ v0 = _mm256_mul_ps( v0, mul );
+ v1 = _mm256_mul_ps( v1, mul );
+ v2 = _mm256_mul_ps( v2, mul );
+ v3 = _mm256_mul_ps( v3, mul );
+
+ // Round to nearest integer
+ v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+ v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+ v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+ v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+ // Convert floats to integers
+ __m256i i0 = _mm256_cvtps_epi32( v0 );
+ __m256i i1 = _mm256_cvtps_epi32( v1 );
+ __m256i i2 = _mm256_cvtps_epi32( v2 );
+ __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+ // Since we don't have in AVX some necessary functions,
+ // we split the registers in half and call AVX2 analogs from SSE
+ __m128i ni0 = _mm256_castsi256_si128( i0 );
+ __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+ __m128i ni2 = _mm256_castsi256_si128( i1 );
+ __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+ __m128i ni4 = _mm256_castsi256_si128( i2 );
+ __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+ __m128i ni6 = _mm256_castsi256_si128( i3 );
+ __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+ // Convert int32 to int16
+ ni0 = _mm_packs_epi32( ni0, ni1 );
+ ni2 = _mm_packs_epi32( ni2, ni3 );
+ ni4 = _mm_packs_epi32( ni4, ni5 );
+ ni6 = _mm_packs_epi32( ni6, ni7 );
+ // Convert int16 to int8
+ ni0 = _mm_packs_epi16( ni0, ni2 );
+ ni4 = _mm_packs_epi16( ni4, ni6 );
+
+ // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
+ const __m128i off = _mm_set1_epi8( 8);
+ ni0 = _mm_add_epi8( ni0, off );
+ ni4 = _mm_add_epi8( ni4, off );
+
+ // Compress the vector into 4 bit/value, and store
+ __m128i res = packNibbles( ni0, ni4 );
+ _mm_storeu_si128( ( __m128i* )y[i].qs, res );
+ }
  #elif defined(__wasm_simd128__)
- uint8_t pp[QK/2];
  for (int i = 0; i < nb; i++) {
  float amax = 0.0f; // absolute max

@@ -676,11 +796,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
  const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
  const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);

- pp[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
- pp[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
+ y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
+ y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
  }
-
- memcpy(y[i].qs, pp, sizeof(pp));
  }
  #else
  // scalar
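All of the SIMD branches above implement the same Q4_0 scheme that the reference path shows: scale each block by its absolute maximum, map to 16 levels, and pack two values per byte. A scalar sketch for orientation (my own paraphrase of the reference code; QK is assumed to be 32 as in the surrounding source):

```c
#include <math.h>
#include <stdint.h>

#define QK 32  // block size assumed from the surrounding code

// Quantize one block of QK floats to 4-bit codes with a single scale d.
static void quantize_block_q4_0_sketch(const float *x, float *d_out, uint8_t qs[QK/2]) {
    float amax = 0.0f;                    // absolute max of the block
    for (int l = 0; l < QK; ++l) {
        amax = fmaxf(amax, fabsf(x[l]));
    }

    const float d  = amax / 7.0f;         // 7 quantization levels on each side of zero
    const float id = d ? 1.0f / d : 0.0f; // avoid division by zero for all-zero blocks
    *d_out = d;

    for (int l = 0; l < QK; l += 2) {
        // round to the nearest level, then shift into the unsigned range [0, 15]
        const uint8_t vi0 = (uint8_t)((int8_t)roundf(x[l + 0] * id) + 8);
        const uint8_t vi1 = (uint8_t)((int8_t)roundf(x[l + 1] * id) + 8);
        qs[l/2] = vi0 | (vi1 << 4);       // two quantized values per byte, low nibble first
    }
}
```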
@@ -719,8 +837,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
  const uint8_t vi0 = roundf(v0);
  const uint8_t vi1 = roundf(v1);

- assert(vi0 >= 0 && vi0 < 16);
- assert(vi1 >= 0 && vi1 < 16);
+ assert(vi0 < 16);
+ assert(vi1 < 16);

  pp[l/2] = vi0 | (vi1 << 4);
  }
@@ -732,11 +850,11 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
  static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) {
  assert(k % QK == 0);

- #if defined(__AVX2__)
  const int nb = k / QK;

  block_q4_1 * restrict y = vy;

+ #if defined(__AVX2__)
  for (int i = 0; i < nb; i++) {
  // Load elements into 4 AVX vectors
  __m256 v0 = _mm256_loadu_ps( x );
@@ -810,6 +928,41 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
  __m128i res = packNibbles( i0 );
  _mm_storeu_si128( ( __m128i* )y[i].qs, res );
  }
+ #elif __ARM_NEON
+ for (int i = 0; i < nb; i++) {
+ float32x4_t srcv[8];
+ float32x4_t minv[8];
+ float32x4_t maxv[8];
+
+ for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+
+ for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
+ for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
+ for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
+
+ for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
+ for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
+ for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
+
+ const float min = vminvq_f32(minv[0]);
+ const float max = vmaxvq_f32(maxv[0]);
+
+ const float d = (max - min) / ((1 << 4) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ y[i].d = d;
+ y[i].m = min;
+
+ const float32x4_t minv0 = vdupq_n_f32(min);
+
+ for (int l = 0; l < 8; l++) {
+ const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
+ const int32x4_t vi = vcvtq_s32_f32(v);
+
+ y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+ y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+ }
+ }
  #else
  // scalar
  quantize_row_q4_1_reference(x, vy, k);
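The new NEON path mirrors the Q4_1 scheme: instead of a single scale, each block stores a scale `d` and an offset `m` (the block minimum), so the 4-bit codes are unsigned. A scalar sketch of the same mapping (my own illustration; QK assumed to be 32):

```c
#include <math.h>
#include <stdint.h>

#define QK 32  // block size assumed from the surrounding code

// Quantize one block of QK floats with an affine (scale + offset) 4-bit mapping.
static void quantize_block_q4_1_sketch(const float *x, float *d_out, float *m_out,
                                       uint8_t qs[QK/2]) {
    float min = x[0], max = x[0];
    for (int l = 1; l < QK; ++l) {
        min = fminf(min, x[l]);
        max = fmaxf(max, x[l]);
    }

    const float d  = (max - min) / 15.0f;  // 16 levels spread between min and max
    const float id = d ? 1.0f / d : 0.0f;
    *d_out = d;
    *m_out = min;

    for (int l = 0; l < QK; l += 2) {
        const uint8_t vi0 = (uint8_t)roundf((x[l + 0] - min) * id);
        const uint8_t vi1 = (uint8_t)roundf((x[l + 1] - min) * id);
        qs[l/2] = vi0 | (vi1 << 4);        // two codes per byte, low nibble first
    }
}
```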
@@ -970,6 +1123,50 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
  }
  }
  }
+ #elif defined(__ARM_NEON)
+ for (int i = 0; i < nb; i++) {
+ const float32x4_t vd = vdupq_n_f32(x[i].d);
+ const float32x4_t vm = vdupq_n_f32(x[i].m);
+
+ const uint8_t * restrict pp = x[i].qs;
+
+ for (int l = 0; l < QK; l += 16) {
+ // Load 16x4-bit integers into 8x8-bit integers
+ const uint8x8_t v8 = vld1_u8(pp + l/2);
+
+ // Expand 4-bit qs to 8-bit bytes
+ const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+ const uint8x8_t v1 = vshr_n_u8(v8, 4);
+
+ // Interleave and combine
+ const uint8x8_t vx_0 = vzip1_u8(v0, v1);
+ const uint8x8_t vx_1 = vzip2_u8(v0, v1);
+
+ const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
+
+ // convert to 2x uint16x8_t
+ const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq));
+ const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq));
+
+ // convert to 4x float32x4_t
+ const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
+ const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
+ const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
+ const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
+
+ // multiply by d and add m
+ const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
+ const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
+ const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
+ const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
+
+ // Store
+ vst1q_f32(y + i*QK + l + 0, r0);
+ vst1q_f32(y + i*QK + l + 4, r1);
+ vst1q_f32(y + i*QK + l + 8, r2);
+ vst1q_f32(y + i*QK + l + 12, r3);
+ }
+ }
  #else
  for (int i = 0; i < nb; i++) {
  const float d = x[i].d;
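Dequantization is the inverse affine map, `value = d * code + m`. A scalar sketch of what the NEON loop above computes per block (my own illustration; QK assumed to be 32):

```c
#include <stdint.h>

#define QK 32  // block size assumed from the surrounding code

// Reconstruct QK floats from one Q4_1 block: 4-bit codes plus scale d and offset m.
static void dequantize_block_q4_1_sketch(const uint8_t qs[QK/2], float d, float m, float *y) {
    for (int l = 0; l < QK; l += 2) {
        const uint8_t vi0 = qs[l/2] & 0x0F;  // low nibble: first value of the pair
        const uint8_t vi1 = qs[l/2] >> 4;    // high nibble: second value
        y[l + 0] = d * vi0 + m;
        y[l + 1] = d * vi1 + m;
    }
}
```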
@@ -1207,7 +1404,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
  _mm256_storeu_ps(arr, y);

  for (int i = 0; i < 8; i++)
- x[i] = GGML_FP16_TO_FP32(arr[i]);
+ x[i] = GGML_FP32_TO_FP16(arr[i]);
  }
  #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
  #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -1636,7 +1833,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  const block_q4_0 * restrict x = vx;
  const block_q4_0 * restrict y = vy;

- ggml_float sumf = 0.0;
+ float sumf = 0.0;

  #if defined(__ARM_NEON)
  float sum0 = 0.0f;
@@ -1731,7 +1928,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  #endif
  }

- sumf = (ggml_float)(sum0 + sum1);
+ sumf = sum0 + sum1;
  #elif defined(__AVX512F__)
  // Initialize accumulator with zeros
  __m512 acc0 = _mm512_setzero_ps();
@@ -1739,7 +1936,6 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest

  const int superblock_size = 8;
  const int superblock_count = nb / superblock_size;
- const int remainder = nb % superblock_size;

  for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
  int i = superblock_ix * superblock_size;
@@ -1766,6 +1962,10 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  __m256 acc = _mm256_setzero_ps();

  // Main loop
+ // TODO: figure a way to do this in a portable way
+ #ifdef __GNUC__
+ #pragma GCC unroll 16
+ #endif
  for (int i = 0; i < nb; ++i) {
  // Compute combined scale for the block
  const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
@@ -1779,20 +1979,21 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  bx = _mm256_sub_epi8( bx, off );
  by = _mm256_sub_epi8( by, off );

- // Sign-extend first 16 signed bytes into int16_t
- __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
- __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
- // Compute products of int16_t integers, add pairwise
- __m256i i32 = _mm256_madd_epi16( x16, y16 );
+ // Get absolute values of x vectors
+ const __m256i ax = _mm256_sign_epi8(bx, bx);

- // Sign-extend last 16 signed bytes into int16_t vectors
- x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
- y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
- // Accumulate products of int16_t integers
- i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) );
+ // Sign the values of the y vectors
+ const __m256i sy = _mm256_sign_epi8(by, bx);
+
+ // Perform multiplication and create 16-bit values
+ const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+
+ const __m256i ones = _mm256_set1_epi16(1);
+ const __m256i i32 = _mm256_madd_epi16(ones, dot);

  // Convert int32_t to float
- __m256 p = _mm256_cvtepi32_ps( i32 );
+ const __m256 p = _mm256_cvtepi32_ps( i32 );
+
  // Apply the scale, and accumulate
  acc = _mm256_fmadd_ps( d, p, acc );
  }
@@ -1803,6 +2004,52 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
  res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
  res = _mm_add_ss( res, _mm_movehdup_ps( res ) );

+ sumf = _mm_cvtss_f32( res );
+ #elif defined(__AVX__)
+ // Initialize accumulator with zeros
+ __m256 acc = _mm256_setzero_ps();
+
+ // Main loop
+ for (int i = 0; i < nb; ++i) {
+ // Compute combined scale for the block
+ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+
+ __m128i i32[2];
+ for (int j = 0; j < 2; ++j) {
+ // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
+ __m128i bx = bytesFromNibbles( x[i].qs + 8*j );
+ __m128i by = bytesFromNibbles( y[i].qs + 8*j );
+
+ // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+ const __m128i off = _mm_set1_epi8( 8 );
+ bx = _mm_sub_epi8( bx, off );
+ by = _mm_sub_epi8( by, off );
+
+ // Get absolute values of x vectors
+ const __m128i ax = _mm_sign_epi8(bx, bx);
+
+ // Sign the values of the y vectors
+ const __m128i sy = _mm_sign_epi8(by, bx);
+
+ // Perform multiplication and create 16-bit values
+ const __m128i dot = _mm_maddubs_epi16(ax, sy);
+
+ const __m128i ones = _mm_set1_epi16(1);
+ i32[j] = _mm_madd_epi16(ones, dot);
+ }
+
+ // Convert int32_t to float
+ __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
+ // Apply the scale, and accumulate
+ acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
+ }
+
+ // Return horizontal sum of the acc vector
+ __m128 res = _mm256_extractf128_ps( acc, 1 );
+ res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
+ res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
+ res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
+
  sumf = _mm_cvtss_f32( res );
  #elif defined(__wasm_simd128__)
  // wasm simd
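Both the AVX2 and AVX paths rely on the same trick: `maddubs` multiplies an unsigned byte vector by a signed one, so the code feeds it `|x|` together with `y` carrying `x`'s sign, which preserves each product. A scalar sketch of the identity being exploited (my own illustration, mirroring the semantics of `_mm_sign_epi8`):

```c
#include <assert.h>
#include <stdlib.h>

// maddubs-style operands: an unsigned |x| and a y that carries x's sign.
// Checks that |x| * (sign(x)*y) == x * y over the 4-bit range used here.
int main(void) {
    for (int x = -8; x <= 7; ++x) {
        for (int y = -8; y <= 7; ++y) {
            int ax = abs(x);                          // unsigned operand
            int sy = (x > 0) ? y : (x < 0) ? -y : 0;  // y signed by x; zero when x == 0
            assert(ax * sy == x * y);
        }
    }
    return 0;
}
```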
@@ -1944,7 +2191,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
  // Compute cross scales for the block
  const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
  const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
- const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+ const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0xAA /* 0b10101010 */ );

  // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
  __m256i bx = bytesFromNibbles( x[i].qs );
@@ -1990,6 +2237,45 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
  res = _mm_add_ss( res, _mm_movehdup_ps( res ) );

  sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+ #elif defined(__ARM_NEON)
+ float sum00 = 0.0f;
+ float sum01 = 0.0f;
+ float sum10 = 0.0f;
+ float sum11 = 0.0f;
+
+ for (int i = 0; i < nb; ++i) {
+ const block_q4_1 * restrict x0 = &x[i + 0];
+ const block_q4_1 * restrict y0 = &y[i + 0];
+
+ const uint8x16_t m4b = vdupq_n_u8(0xf);
+
+ const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+ const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+
+ // and with 0xf
+ const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
+ const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
+
+ const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
+ const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
+
+ // dot product into uint16x8_t
+ const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+ const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+
+ const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+ const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+ const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
+ const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+
+ sum00 += x0->m*y0->m;
+ sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
+ sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
+ sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+ }
+
+ sumf = QK*sum00 + sum01 + sum10 + sum11;
  #else
  // scalar
  for (int i = 0; i < nb; i++) {
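For reference, the four accumulators come from expanding the affine Q4_1 blocks. Writing each element as x_j = d_x*qx_j + m_x and y_j = d_y*qy_j + m_y, the per-block dot product is

sum_j x_j*y_j = QK*m_x*m_y + m_y*d_x*sum_j qx_j + m_x*d_y*sum_j qy_j + d_x*d_y*sum_j qx_j*qy_j

which is exactly what sum00, sum01, sum10 and sum11 accumulate above (my reading of the code, not text from the patch).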
@@ -2401,8 +2687,9 @@ struct ggml_context {
  void * mem_buffer;
  bool mem_buffer_owned;
  bool mem_buffer_mlocked;
+ bool no_alloc;

- int n_objects;
+ int n_objects;

  struct ggml_object * objects_begin;
  struct ggml_object * objects_end;
@@ -2619,6 +2906,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  static bool is_first_call = true;

  if (is_first_call) {
+ // initialize time system (required on Windows)
+ ggml_time_init();
+
  // initialize GELU, SILU and EXP F32 tables
  {
  const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
@@ -2684,6 +2974,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
  /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
  /*.mem_buffer_mlocked =*/ false,
+ /*.no_alloc =*/ params.no_alloc,
  /*.n_objects =*/ 0,
  /*.objects_begin =*/ NULL,
  /*.objects_end =*/ NULL,
@@ -2751,36 +3042,47 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
  return result;
  }

+ #ifdef __APPLE__
+ #define MLOCK_SUGGESTION \
+ "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+ "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+ #else
+ #define MLOCK_SUGGESTION \
+ "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+ #endif
+
  bool ggml_mlock_supported(void) {
  return GGML_MLOCK_SUPPORT;
  }

+ bool ggml_mlock(
+ struct ggml_context * ctx,
+ const void *opt_extra_addr,
+ size_t opt_extra_len,
+ char **err_p) {
+ // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
  #if GGML_MLOCK_SUPPORT
- #ifdef __APPLE__
- #define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
- "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
- #else
- #define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
- #endif
- bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
  if (ctx->mem_buffer_mlocked) {
  return true;
  }
- if (mlock(ctx->mem_buffer, ctx->mem_size)) {
- int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
- ctx->mem_size, strerror(errno));
- GGML_ASSERT(ret >= 0);
+ if (mlock(ctx->mem_buffer, ctx->mem_size) ||
+ (opt_extra_len &&
+ mlock(opt_extra_addr, opt_extra_len))) {
+ if ((*err_p = malloc(1024))) {
+ snprintf(*err_p, 1024,
+ "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+ ctx->mem_size + opt_extra_len,
+ strerror(errno));
+ }
  return false;
  }
  ctx->mem_buffer_mlocked = true;
  return true;
- }
  #else // GGML_MLOCK_SUPPORT
- bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
  *err_p = strdup("can't mlock because it's not supported on this system");
  return false;
- }
  #endif // GGML_MLOCK_SUPPORT
+ }

  ////////////////////////////////////////////////////////////////////////////////

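The new signature lets callers pin an extra region (for example, a separately mapped model file) together with the context buffer. A hedged usage sketch based only on the prototype shown here; the wrapper name and the extra buffer are mine:

```c
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include "ggml.h"

// Try to pin the context buffer plus one optional extra region in RAM.
// Returns true on success; logs and frees the error string otherwise.
static bool try_mlock(struct ggml_context *ctx, const void *extra_addr, size_t extra_len) {
    if (!ggml_mlock_supported()) {
        return false;                       // e.g. platforms without mlock()
    }
    char *err = NULL;
    if (!ggml_mlock(ctx, extra_addr, extra_len, &err)) {
        fprintf(stderr, "ggml_mlock failed: %s", err ? err : "(no details)\n");
        free(err);                          // error string is heap-allocated by ggml
        return false;
    }
    return true;
}
```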
@@ -2799,7 +3101,7 @@ struct ggml_tensor * ggml_new_tensor_impl(

  size_t size_needed = 0;

- if (data == NULL) {
+ if (data == NULL && !ctx->no_alloc) {
  size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
  for (int i = 1; i < n_dims; i++) {
  size_needed *= ne[i];
@@ -2883,7 +3185,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
  /*.perf_runs =*/ 0,
  /*.perf_cycles =*/ 0,
  /*.perf_time_us =*/ 0,
- /*.data =*/ data == NULL ? (void *)(result + 1) : data,
+ /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
  /*.pad =*/ { 0 },
  };

@@ -10146,6 +10448,7 @@ enum ggml_opt_result ggml_opt(
  struct ggml_init_params params_ctx = {
  .mem_size = 16*1024*1024,
  .mem_buffer = NULL,
+ .no_alloc = false,
  };

  ctx = ggml_init(params_ctx);
@@ -316,6 +316,7 @@ struct ggml_init_params {
  // memory pool
  size_t mem_size; // bytes
  void * mem_buffer; // if NULL, memory will be allocated internally
+ bool no_alloc; // don't allocate memory for the tensor data
  };

  void ggml_time_init(void); // call this once at the beginning of the program
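With `no_alloc` set, the tensor-creation path above leaves `data` as NULL unless the caller supplies a pointer, which is useful when the weights live in an external buffer such as an mmap'd file. A hedged sketch of that pattern (the tensor shape and the external buffer are illustrative, not from the gem):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // Small pool for tensor headers only; tensor data is NOT allocated here.
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = true,
    };
    struct ggml_context *ctx = ggml_init(params);

    // Metadata-only tensor: with no_alloc the context does not reserve space
    // for the 4096 floats, so t->data starts out NULL.
    struct ggml_tensor *t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
    printf("data pointer before binding: %p\n", (void *)t->data);

    // The caller is expected to point data at externally managed memory,
    // e.g. a region of an mmap'd weights file (illustrative):
    // t->data = mapped_weights + some_offset;

    ggml_free(ctx);
    return 0;
}
```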
@@ -344,7 +345,11 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
  size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

  bool ggml_mlock_supported(void);
- bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
+ bool ggml_mlock(
+ struct ggml_context * ctx,
+ const void *opt_extra_addr,
+ size_t opt_extra_len,
+ char **err_p);

  struct ggml_tensor * ggml_new_tensor(
  struct ggml_context * ctx,