llama_cpp 0.0.1 → 0.0.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +2 -0
- data/ext/llama_cpp/src/ggml.c +354 -51
- data/ext/llama_cpp/src/ggml.h +6 -1
- data/ext/llama_cpp/src/llama.cpp +210 -259
- data/ext/llama_cpp/src/llama.h +2 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -2
- metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e4b9a70ca3137fb187c1455291828001086e373db7d9189f7f8d45f0d252b0dc
+  data.tar.gz: 22d67fa3d1c71d73569735876aebe953038bb0465a67b07ea991dc8568d11bac
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3767e7950004aba7980a27dbffaec2c360a18295e845b58ab647eff4b9f90515e47c646e48e5d75cac261908415602df50908e429fca0637664e93b2efd7dc1a
+  data.tar.gz: b08e00960ab036fe7ac7778dd33a5a72795153cd7c8beea642b5422da41575a19ea41e1b865e25d16f36afe2879ff4b5b3f303d49598c30888a95ecf459501da
data/CHANGELOG.md CHANGED
data/README.md CHANGED
@@ -20,6 +20,8 @@ If bundler is not being used to manage dependencies, install the gem by executin
 
 ## Usage
 
+Prepare a quantized model file by refering to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage).
+
 ```ruby
 require 'llama_cpp'
 
data/ext/llama_cpp/src/ggml.c CHANGED
@@ -79,6 +79,19 @@ static int sched_yield (void) {
 typedef void* thread_ret_t;
 #endif
 
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#endif
+
 #ifdef __HAIKU__
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
@@ -172,8 +185,13 @@ typedef double ggml_float;
 
 #ifdef __F16C__
 
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
 #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
 
 #elif defined(__POWER9_VECTOR__)
 
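The MSVC branch above builds the half-float conversions out of `_mm_cvtph_ps`/`_mm_cvtps_ph` instead of the `_cvtsh_ss`/`_cvtss_sh` shorthands. A minimal standalone sketch (mine, not part of the diff) that round-trips a value through the same intrinsics, assuming an F16C-capable x86 target:

```c
// Round-trip through the F16C intrinsics used in the MSVC branch above.
// Build sketch (assumption): gcc -mf16c fp16_roundtrip.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t fp32_to_fp16(float x) {
    // _mm_cvtps_ph with rounding immediate 0 = round to nearest even
    return (uint16_t)_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0);
}

static float fp16_to_fp32(uint16_t h) {
    return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));
}

int main(void) {
    const float x = 3.14159f;
    const uint16_t h = fp32_to_fp16(x);
    printf("%f -> 0x%04x -> %f\n", x, h, fp16_to_fp32(h));
    return 0;
}
```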
@@ -443,6 +461,39 @@ static inline __m128i packNibbles( __m256i bytes )
     __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
     return _mm_packus_epi16( r0, r1 );
 }
+#elif __AVX__
+static inline __m128i bytesFromNibbles( const uint8_t* rsi )
+{
+    // Load 8 bytes from memory
+    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+
+    // Expand bytes into uint16_t values
+    __m128i bytes = _mm_cvtepu8_epi16( tmp );
+
+    // Unpack values into individual bytes
+    const __m128i lowMask = _mm_set1_epi8( 0xF );
+    __m128i high = _mm_andnot_si128( lowMask, bytes );
+    __m128i low = _mm_and_si128( lowMask, bytes );
+    high = _mm_slli_epi16( high, 4 );
+    bytes = _mm_or_si128( low, high );
+    return bytes;
+}
+
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
 #endif
 
 // method 5
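For reference, the two AVX helpers added above work on 8 packed source bytes at a time and produce or consume 16 unpacked values; a scalar sketch of the same layout (illustrative only, helper names are mine):

```c
// Scalar equivalent of the nibble unpack/pack handled by bytesFromNibbles /
// packNibbles above: byte 2*j holds the low nibble of src[j], byte 2*j+1 its high nibble.
#include <stdint.h>

void nibbles_to_bytes(const uint8_t src[8], uint8_t dst[16]) {
    for (int j = 0; j < 8; ++j) {
        dst[2*j + 0] = src[j] & 0x0F; // low nibble
        dst[2*j + 1] = src[j] >> 4;   // high nibble
    }
}

void bytes_to_nibbles(const uint8_t src[16], uint8_t dst[8]) {
    for (int j = 0; j < 8; ++j) {
        dst[j] = (uint8_t)(src[2*j + 0] | (src[2*j + 1] << 4));
    }
}
```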
@@ -491,8 +542,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
         const uint8_t vi0 = (int8_t)roundf(v0) + 8;
         const uint8_t vi1 = (int8_t)roundf(v1) + 8;
 
-        assert(vi0
-        assert(vi1
+        assert(vi0 < 16);
+        assert(vi1 < 16);
 
         pp[l/2] = vi0 | (vi1 << 4);
     }
@@ -546,10 +597,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         }
     }
 #elif __ARM_NEON
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
         float32x4_t srcv [8];
         float32x4_t asrcv[8];
         float32x4_t amaxv[8];
@@ -561,7 +609,8 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
         for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
 
-
+        // absolute max
+        const float amax = MAX(
                 MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
                 MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));
 
@@ -575,11 +624,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
             const int32x4_t vi = vcvtq_s32_f32(vf);
 
-
-
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #elif defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
@@ -646,8 +693,81 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         __m128i res = packNibbles( i0 );
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
+#elif defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 7.0f;
+        y[i].d = d;
+        const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
+        const __m128i off = _mm_set1_epi8( 8);
+        ni0 = _mm_add_epi8( ni0, off );
+        ni4 = _mm_add_epi8( ni4, off );
+
+        // Compress the vector into 4 bit/value, and store
+        __m128i res = packNibbles( ni0, ni4 );
+        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
+    }
 #elif defined(__wasm_simd128__)
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
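The AVX path added above follows the same Q4_0 recipe as the other branches: take the block's absolute maximum, scale so it maps to ±7, round, shift into the 4-bit range by adding 8, and pack two values per byte. A scalar sketch of that arithmetic (assuming QK == 32 and a block of one float scale plus 16 packed bytes):

```c
// Scalar sketch of the Q4_0 quantization the AVX branch above implements.
#include <math.h>
#include <stdint.h>

void q4_0_quantize_block(const float x[32], float *d_out, uint8_t qs[16]) {
    float amax = 0.0f;                        // max(|x|) over the block
    for (int l = 0; l < 32; ++l) {
        const float v = fabsf(x[l]);
        if (v > amax) amax = v;
    }

    const float d  = amax / 7.0f;             // the largest magnitude maps to +/-7
    const float id = d ? 1.0f/d : 0.0f;
    *d_out = d;

    for (int l = 0; l < 32; l += 2) {
        // round, offset [-7..+7] to [1..15], pack two 4-bit values per byte
        const uint8_t v0 = (uint8_t)((int8_t)roundf(x[l + 0]*id) + 8);
        const uint8_t v1 = (uint8_t)((int8_t)roundf(x[l + 1]*id) + 8);
        qs[l/2] = v0 | (v1 << 4);
    }
}
```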
@@ -676,11 +796,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
 
-
-
+            y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
+            y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #else
     // scalar
@@ -719,8 +837,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
         const uint8_t vi0 = roundf(v0);
         const uint8_t vi1 = roundf(v1);
 
-        assert(vi0
-        assert(vi1
+        assert(vi0 < 16);
+        assert(vi1 < 16);
 
         pp[l/2] = vi0 | (vi1 << 4);
     }
@@ -732,11 +850,11 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
 static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK == 0);
 
-#if defined(__AVX2__)
     const int nb = k / QK;
 
     block_q4_1 * restrict y = vy;
 
+#if defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -810,6 +928,41 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
         __m128i res = packNibbles( i0 );
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
+#elif __ARM_NEON
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv[8];
+        float32x4_t minv[8];
+        float32x4_t maxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+
+        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
+        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
+
+        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
+        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
+
+        const float min = vminvq_f32(minv[0]);
+        const float max = vmaxvq_f32(maxv[0]);
+
+        const float d = (max - min) / ((1 << 4) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+        y[i].m = min;
+
+        const float32x4_t minv0 = vdupq_n_f32(min);
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
+            const int32x4_t vi = vcvtq_s32_f32(v);
+
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+        }
+    }
 #else
     // scalar
     quantize_row_q4_1_reference(x, vy, k);
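The NEON Q4_1 path added above quantizes against the block's min/max rather than its absolute maximum: d = (max - min)/15, m = min, and each value is stored as the truncated (x - min)/d. A scalar sketch of the same arithmetic (assuming QK == 32 and a block of {d, m, 16 packed bytes}):

```c
// Scalar sketch of the Q4_1 quantization done with NEON above.
#include <stdint.h>

void q4_1_quantize_block(const float x[32], float *d_out, float *m_out, uint8_t qs[16]) {
    float min = x[0], max = x[0];
    for (int l = 1; l < 32; ++l) {
        if (x[l] < min) min = x[l];
        if (x[l] > max) max = x[l];
    }

    const float d  = (max - min) / ((1 << 4) - 1); // 15 steps between min and max
    const float id = d ? 1.0f/d : 0.0f;
    *d_out = d;
    *m_out = min;

    for (int l = 0; l < 32; l += 2) {
        // truncate toward zero, as vcvtq_s32_f32 does in the NEON path
        const uint8_t v0 = (uint8_t)((x[l + 0] - min)*id);
        const uint8_t v1 = (uint8_t)((x[l + 1] - min)*id);
        qs[l/2] = v0 | (v1 << 4);
    }
}
```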
@@ -970,6 +1123,50 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
             }
         }
     }
+#elif defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        const float32x4_t vd = vdupq_n_f32(x[i].d);
+        const float32x4_t vm = vdupq_n_f32(x[i].m);
+
+        const uint8_t * restrict pp = x[i].qs;
+
+        for (int l = 0; l < QK; l += 16) {
+            // Load 16x4-bit integers into 8x8-bit integers
+            const uint8x8_t v8 = vld1_u8(pp + l/2);
+
+            // Expand 4-bit qs to 8-bit bytes
+            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+            const uint8x8_t v1 = vshr_n_u8(v8, 4);
+
+            // Interleave and combine
+            const uint8x8_t vx_0 = vzip1_u8(v0, v1);
+            const uint8x8_t vx_1 = vzip2_u8(v0, v1);
+
+            const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
+
+            // convert to 2x uint16x8_t
+            const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq));
+            const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq));
+
+            // convert to 4x float32x4_t
+            const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
+            const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
+            const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
+            const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
+
+            // multiply by d and add m
+            const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
+            const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
+            const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
+            const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
+
+            // Store
+            vst1q_f32(y + i*QK + l + 0, r0);
+            vst1q_f32(y + i*QK + l + 4, r1);
+            vst1q_f32(y + i*QK + l + 8, r2);
+            vst1q_f32(y + i*QK + l + 12, r3);
+        }
+    }
 #else
     for (int i = 0; i < nb; i++) {
         const float d = x[i].d;
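Dequantization is the inverse map: each 4-bit value q becomes d*q + m, which is what the vmlaq_f32 calls above compute four lanes at a time. Scalar view (illustrative, QK == 32 assumed):

```c
// Scalar view of the Q4_1 dequantization above: q -> d*q + m.
#include <stdint.h>

void q4_1_dequantize_block(float d, float m, const uint8_t qs[16], float y[32]) {
    for (int l = 0; l < 32; l += 2) {
        const uint8_t b = qs[l/2];
        y[l + 0] = d*(b & 0x0F) + m; // low nibble
        y[l + 1] = d*(b >> 4)   + m; // high nibble
    }
}
```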
@@ -1207,7 +1404,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
     _mm256_storeu_ps(arr, y);
 
     for (int i = 0; i < 8; i++)
-        x[i] =
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
 }
 #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -1636,7 +1833,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     const block_q4_0 * restrict x = vx;
     const block_q4_0 * restrict y = vy;
 
-
+    float sumf = 0.0;
 
 #if defined(__ARM_NEON)
     float sum0 = 0.0f;
@@ -1731,7 +1928,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
 #endif
     }
 
-    sumf =
+    sumf = sum0 + sum1;
 #elif defined(__AVX512F__)
     // Initialize accumulator with zeros
     __m512 acc0 = _mm512_setzero_ps();
@@ -1739,7 +1936,6 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
 
     const int superblock_size = 8;
     const int superblock_count = nb / superblock_size;
-    const int remainder = nb % superblock_size;
 
     for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
         int i = superblock_ix * superblock_size;
@@ -1766,6 +1962,10 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     __m256 acc = _mm256_setzero_ps();
 
     // Main loop
+    // TODO: figure a way to do this in a portable way
+    #ifdef __GNUC__
+    #pragma GCC unroll 16
+    #endif
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
         const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
@@ -1779,20 +1979,21 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         bx = _mm256_sub_epi8( bx, off );
         by = _mm256_sub_epi8( by, off );
 
-        //
-        __m256i
-        __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
-        // Compute products of int16_t integers, add pairwise
-        __m256i i32 = _mm256_madd_epi16( x16, y16 );
+        // Get absolute values of x vectors
+        const __m256i ax = _mm256_sign_epi8(bx, bx);
 
-        // Sign
-
-
-        //
-
+        // Sign the values of the y vectors
+        const __m256i sy = _mm256_sign_epi8(by, bx);
+
+        // Perform multiplication and create 16-bit values
+        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+
+        const __m256i ones = _mm256_set1_epi16(1);
+        const __m256i i32 = _mm256_madd_epi16(ones, dot);
 
         // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps( i32 );
+        const __m256 p = _mm256_cvtepi32_ps( i32 );
+
         // Apply the scale, and accumulate
         acc = _mm256_fmadd_ps( d, p, acc );
     }
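The rewritten AVX2 inner loop uses the usual trick for feeding two signed vectors to maddubs (which multiplies unsigned by signed bytes): make x non-negative with `_mm256_sign_epi8(bx, bx)` and move its sign onto y with `_mm256_sign_epi8(by, bx)`. A scalar sketch of why the per-element product is unchanged (the 4-bit quantized values stay well inside the int8 range, so the -128 edge case never arises here):

```c
// Scalar sketch of the sign trick: |x| * sign(y, x) == x * y element-wise.
#include <stdint.h>

int16_t signed_byte_product(int8_t x, int8_t y) {
    const uint8_t ax = (uint8_t)(x < 0 ? -x : x);               // _mm256_sign_epi8(bx, bx)
    const int8_t  sy = (int8_t)(x < 0 ? -y : (x == 0 ? 0 : y)); // _mm256_sign_epi8(by, bx)
    return (int16_t)((int16_t)ax * sy); // equals x*y for the small values used here
}
```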
@@ -1803,6 +2004,52 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
 
+    sumf = _mm_cvtss_f32( res );
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        // Compute combined scale for the block
+        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+
+        __m128i i32[2];
+        for (int j = 0; j < 2; ++j) {
+            // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
+            __m128i bx = bytesFromNibbles( x[i].qs + 8*j );
+            __m128i by = bytesFromNibbles( y[i].qs + 8*j );
+
+            // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+            const __m128i off = _mm_set1_epi8( 8 );
+            bx = _mm_sub_epi8( bx, off );
+            by = _mm_sub_epi8( by, off );
+
+            // Get absolute values of x vectors
+            const __m128i ax = _mm_sign_epi8(bx, bx);
+
+            // Sign the values of the y vectors
+            const __m128i sy = _mm_sign_epi8(by, bx);
+
+            // Perform multiplication and create 16-bit values
+            const __m128i dot = _mm_maddubs_epi16(ax, sy);
+
+            const __m128i ones = _mm_set1_epi16(1);
+            i32[j] = _mm_madd_epi16(ones, dot);
+        }
+
+        // Convert int32_t to float
+        __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
+        // Apply the scale, and accumulate
+        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
+    }
+
+    // Return horizontal sum of the acc vector
+    __m128 res = _mm256_extractf128_ps( acc, 1 );
+    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
+    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
+    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
+
     sumf = _mm_cvtss_f32( res );
 #elif defined(__wasm_simd128__)
     // wasm simd
@@ -1944,7 +2191,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
         // Compute cross scales for the block
         const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
         const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
-        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0xAA /* 0b10101010 */ );
 
         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
         __m256i bx = bytesFromNibbles( x[i].qs );
@@ -1990,6 +2237,45 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
 
     sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+#elif defined(__ARM_NEON)
+    float sum00 = 0.0f;
+    float sum01 = 0.0f;
+    float sum10 = 0.0f;
+    float sum11 = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q4_1 * restrict x0 = &x[i + 0];
+        const block_q4_1 * restrict y0 = &y[i + 0];
+
+        const uint8x16_t m4b = vdupq_n_u8(0xf);
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+
+        // and with 0xf
+        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
+        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
+
+        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
+        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
+
+        // dot product into uint16x8_t
+        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+
+        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+        const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
+        const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+
+        sum00 += x0->m*y0->m;
+        sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
+        sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
+        sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+    }
+
+    sumf = QK*sum00 + sum01 + sum10 + sum11;
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
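The four NEON accumulators above come from expanding the product of two Q4_1 blocks, x_l = d_x*qx_l + m_x and y_l = d_y*qy_l + m_y, so that sum(x_l*y_l) = QK*m_x*m_y + m_y*d_x*sum(qx) + m_x*d_y*sum(qy) + d_x*d_y*sum(qx*qy). An illustrative scalar check of that decomposition for one block (my own helper, with the nibbles assumed already unpacked):

```c
// Scalar check of the decomposition behind sum00/sum01/sum10/sum11 above.
#include <assert.h>
#include <math.h>
#include <stdint.h>

float q4_1_block_dot(float dx, float mx, const uint8_t qx[32],
                     float dy, float my, const uint8_t qy[32]) {
    float s01 = 0.0f, s10 = 0.0f, s11 = 0.0f, direct = 0.0f;
    for (int l = 0; l < 32; ++l) {
        s01    += my*dx*qx[l];
        s10    += mx*dy*qy[l];
        s11    += dx*dy*qx[l]*qy[l];
        direct += (dx*qx[l] + mx)*(dy*qy[l] + my);
    }
    const float s00 = 32.0f*mx*my;            // the QK*sum00 term
    const float decomposed = s00 + s01 + s10 + s11;
    assert(fabsf(decomposed - direct) <= 1e-3f*(fabsf(direct) + 1.0f));
    return decomposed;
}
```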
@@ -2401,8 +2687,9 @@ struct ggml_context {
     void * mem_buffer;
     bool mem_buffer_owned;
     bool mem_buffer_mlocked;
+    bool no_alloc;
 
-    int
+    int n_objects;
 
     struct ggml_object * objects_begin;
     struct ggml_object * objects_end;
@@ -2619,6 +2906,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
         // initialize GELU, SILU and EXP F32 tables
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
@@ -2684,6 +2974,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.mem_buffer_mlocked =*/ false,
+        /*.no_alloc =*/ params.no_alloc,
         /*.n_objects =*/ 0,
         /*.objects_begin =*/ NULL,
         /*.objects_end =*/ NULL,
@@ -2751,36 +3042,47 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION \
+    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+#else
+#define MLOCK_SUGGESTION \
+    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+#endif
+
 bool ggml_mlock_supported(void) {
     return GGML_MLOCK_SUPPORT;
 }
 
+bool ggml_mlock(
+        struct ggml_context * ctx,
+        const void *opt_extra_addr,
+        size_t opt_extra_len,
+        char **err_p) {
+    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
 #if GGML_MLOCK_SUPPORT
-#ifdef __APPLE__
-    #define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
-                             "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
-#else
-    #define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
-#endif
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
     if (ctx->mem_buffer_mlocked) {
         return true;
     }
-    if (mlock(ctx->mem_buffer, ctx->mem_size)
-
-
-
+    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
+        (opt_extra_len &&
+         mlock(opt_extra_addr, opt_extra_len))) {
+        if ((*err_p = malloc(1024))) {
+            snprintf(*err_p, 1024,
+                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+                     ctx->mem_size + opt_extra_len,
+                     strerror(errno));
+        }
         return false;
     }
     ctx->mem_buffer_mlocked = true;
     return true;
-}
 #else // GGML_MLOCK_SUPPORT
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
     *err_p = strdup("can't mlock because it's not supported on this system");
     return false;
-}
 #endif // GGML_MLOCK_SUPPORT
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 
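ggml_mlock now takes an optional extra address/length pair in addition to the context, and reports failures through a heap-allocated message. A hedged sketch of how a caller might use the new signature (the wrapper below is mine, not from the gem):

```c
// Hypothetical caller of the new four-argument ggml_mlock() above; the extra
// address/length pair lets one additional region be pinned along with the
// context buffer.
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"

void lock_ctx_and_extra(struct ggml_context *ctx, const void *extra, size_t extra_len) {
    char *err = NULL;
    if (!ggml_mlock(ctx, extra, extra_len, &err)) {
        fprintf(stderr, "ggml_mlock failed: %s\n", err); // err is heap-allocated
        free(err);
    }
}
```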
@@ -2799,7 +3101,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t size_needed = 0;
 
-    if (data == NULL) {
+    if (data == NULL && !ctx->no_alloc) {
         size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
         for (int i = 1; i < n_dims; i++) {
             size_needed *= ne[i];
@@ -2883,7 +3185,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_runs =*/ 0,
         /*.perf_cycles =*/ 0,
         /*.perf_time_us =*/ 0,
-        /*.data =*/ data == NULL ? (void *)(result + 1) : data,
+        /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.pad =*/ { 0 },
     };
 
@@ -10146,6 +10448,7 @@ enum ggml_opt_result ggml_opt(
         struct ggml_init_params params_ctx = {
             .mem_size = 16*1024*1024,
             .mem_buffer = NULL,
+            .no_alloc = false,
         };
 
         ctx = ggml_init(params_ctx);
data/ext/llama_cpp/src/ggml.h CHANGED
@@ -316,6 +316,7 @@ struct ggml_init_params {
     // memory pool
     size_t mem_size; // bytes
     void * mem_buffer; // if NULL, memory will be allocated internally
+    bool no_alloc; // don't allocate memory for the tensor data
 };
 
 void ggml_time_init(void); // call this once at the beginning of the program
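The new no_alloc flag makes ggml_init set up a context whose tensors get metadata but no data allocation (see the ggml_new_tensor_impl changes above), presumably so the caller can point tensor data at externally managed memory. A minimal sketch of turning it on (assumption: the caller wires up tensor data itself afterwards):

```c
// Sketch: create a context that tracks tensor metadata only (no_alloc = true).
#include "ggml.h"

struct ggml_context * init_metadata_only_ctx(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = true, // don't allocate memory for the tensor data
    };
    return ggml_init(params);
}
```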
@@ -344,7 +345,11 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
 bool ggml_mlock_supported(void);
-bool ggml_mlock(
+bool ggml_mlock(
+    struct ggml_context * ctx,
+    const void *opt_extra_addr,
+    size_t opt_extra_len,
+    char **err_p);
 
 struct ggml_tensor * ggml_new_tensor(
     struct ggml_context * ctx,