llama_cpp 0.0.1 → 0.0.2
This diff shows the changes between these two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +2 -0
- data/ext/llama_cpp/src/ggml.c +354 -51
- data/ext/llama_cpp/src/ggml.h +6 -1
- data/ext/llama_cpp/src/llama.cpp +210 -259
- data/ext/llama_cpp/src/llama.h +2 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e4b9a70ca3137fb187c1455291828001086e373db7d9189f7f8d45f0d252b0dc
+  data.tar.gz: 22d67fa3d1c71d73569735876aebe953038bb0465a67b07ea991dc8568d11bac
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3767e7950004aba7980a27dbffaec2c360a18295e845b58ab647eff4b9f90515e47c646e48e5d75cac261908415602df50908e429fca0637664e93b2efd7dc1a
+  data.tar.gz: b08e00960ab036fe7ac7778dd33a5a72795153cd7c8beea642b5422da41575a19ea41e1b865e25d16f36afe2879ff4b5b3f303d49598c30888a95ecf459501da
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -20,6 +20,8 @@ If bundler is not being used to manage dependencies, install the gem by executin
 
 ## Usage
 
+Prepare a quantized model file by refering to [the usage section on the llama.cpp README](https://github.com/ggerganov/llama.cpp#usage).
+
 ```ruby
 require 'llama_cpp'
 
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -79,6 +79,19 @@ static int sched_yield (void) {
 typedef void* thread_ret_t;
 #endif
 
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#endif
+
 #ifdef __HAIKU__
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
@@ -172,8 +185,13 @@ typedef double ggml_float;
 
 #ifdef __F16C__
 
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
 #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
 
 #elif defined(__POWER9_VECTOR__)
 
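
The MSVC branch reaches the same FP16 conversions through SSE/F16C intrinsics instead of the `_cvtsh_ss`/`_cvtss_sh` shorthands. Below is a minimal round-trip sketch using those intrinsics directly; it is a standalone illustration, not code from the gem, and assumes an x86-64 compiler with F16C enabled (e.g. `-mf16c`):

```c
// fp16_roundtrip.c — illustrative sketch; build with: cc -mf16c fp16_roundtrip.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const float x = 3.14159f;

    // FP32 -> FP16, the same intrinsics the MSVC variant of GGML_COMPUTE_FP32_TO_FP16 uses
    const uint16_t h = (uint16_t)_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0);

    // FP16 -> FP32, the same intrinsics the MSVC variant of GGML_COMPUTE_FP16_TO_FP32 uses
    const float back = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));

    printf("%f -> 0x%04x -> %f\n", x, h, back);
    return 0;
}
```
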
@@ -443,6 +461,39 @@ static inline __m128i packNibbles( __m256i bytes )
     __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
     return _mm_packus_epi16( r0, r1 );
 }
+#elif __AVX__
+static inline __m128i bytesFromNibbles( const uint8_t* rsi )
+{
+    // Load 8 bytes from memory
+    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+
+    // Expand bytes into uint16_t values
+    __m128i bytes = _mm_cvtepu8_epi16( tmp );
+
+    // Unpack values into individual bytes
+    const __m128i lowMask = _mm_set1_epi8( 0xF );
+    __m128i high = _mm_andnot_si128( lowMask, bytes );
+    __m128i low = _mm_and_si128( lowMask, bytes );
+    high = _mm_slli_epi16( high, 4 );
+    bytes = _mm_or_si128( low, high );
+    return bytes;
+}
+
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
 #endif
 
 // method 5
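
Both helpers deal with the 4-bit packing used by the quantized blocks: each byte of `qs` holds two values, the first in the low nibble and the second in the high nibble. A plain-C sketch of the same packing and unpacking (the helper names here are illustrative, not part of ggml):

```c
#include <assert.h>
#include <stdint.h>

// Pack two 4-bit values (0..15) into one byte, low nibble first —
// the scalar counterpart of packNibbles.
static uint8_t pack_nibbles(uint8_t lo, uint8_t hi) {
    assert(lo < 16 && hi < 16);
    return (uint8_t)(lo | (hi << 4));
}

// Split a packed byte back into its two 4-bit values —
// the scalar counterpart of bytesFromNibbles.
static void unpack_nibbles(uint8_t b, uint8_t *lo, uint8_t *hi) {
    *lo = (uint8_t)(b & 0x0F);
    *hi = (uint8_t)(b >> 4);
}

int main(void) {
    uint8_t lo, hi;
    unpack_nibbles(pack_nibbles(3, 12), &lo, &hi);
    assert(lo == 3 && hi == 12);
    return 0;
}
```
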
@@ -491,8 +542,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
             const uint8_t vi0 = (int8_t)roundf(v0) + 8;
             const uint8_t vi1 = (int8_t)roundf(v1) + 8;
 
-            assert(vi0
-            assert(vi1
+            assert(vi0 < 16);
+            assert(vi1 < 16);
 
             pp[l/2] = vi0 | (vi1 << 4);
         }
@@ -546,10 +597,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         }
     }
 #elif __ARM_NEON
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
         float32x4_t srcv [8];
         float32x4_t asrcv[8];
         float32x4_t amaxv[8];
@@ -561,7 +609,8 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
         for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
 
-
+        // absolute max
+        const float amax = MAX(
             MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
             MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));
 
@@ -575,11 +624,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
             const int32x4_t vi = vcvtq_s32_f32(vf);
 
-
-
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #elif defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
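
All of these SIMD paths (NEON, AVX2, AVX, WASM) implement the same Q4_0 scheme as the scalar reference: find the block's absolute maximum, scale so values land in [-7, 7], round, add 8, and pack pairs of nibbles. A compact scalar sketch of that scheme follows; the struct is an illustrative stand-in for ggml's `block_q4_0`, and `QK` (the block size, 32 upstream) is assumed:

```c
#include <math.h>
#include <stdint.h>

#define QK 32  // block size, assumed from upstream ggml

typedef struct {
    float   d;          // scaling factor
    uint8_t qs[QK / 2]; // two 4-bit quants per byte, low nibble first
} block_q4_0_sketch;    // illustrative stand-in for ggml's block_q4_0

static void quantize_block_q4_0(const float * restrict x, block_q4_0_sketch * restrict y) {
    float amax = 0.0f; // absolute max over the block
    for (int l = 0; l < QK; l++) {
        const float v = fabsf(x[l]);
        if (v > amax) amax = v;
    }

    const float d  = amax / 7.0f;          // map [-amax, amax] onto [-7, 7]
    const float id = d ? 1.0f / d : 0.0f;
    y->d = d;

    for (int l = 0; l < QK; l += 2) {
        const uint8_t vi0 = (uint8_t)((int8_t)roundf(x[l + 0] * id) + 8); // now in [1, 15]
        const uint8_t vi1 = (uint8_t)((int8_t)roundf(x[l + 1] * id) + 8);
        y->qs[l / 2] = (uint8_t)(vi0 | (vi1 << 4));
    }
}
```
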
@@ -646,8 +693,81 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         __m128i res = packNibbles( i0 );
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
+#elif defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 7.0f;
+        y[i].d = d;
+        const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
+        const __m128i off = _mm_set1_epi8( 8);
+        ni0 = _mm_add_epi8( ni0, off );
+        ni4 = _mm_add_epi8( ni4, off );
+
+        // Compress the vector into 4 bit/value, and store
+        __m128i res = packNibbles( ni0, ni4 );
+        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
+    }
 #elif defined(__wasm_simd128__)
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
@@ -676,11 +796,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
 
-
-
+            y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
+            y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #else
     // scalar
@@ -719,8 +837,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
             const uint8_t vi0 = roundf(v0);
             const uint8_t vi1 = roundf(v1);
 
-            assert(vi0
-            assert(vi1
+            assert(vi0 < 16);
+            assert(vi1 < 16);
 
             pp[l/2] = vi0 | (vi1 << 4);
         }
@@ -732,11 +850,11 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
 static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK == 0);
 
-#if defined(__AVX2__)
     const int nb = k / QK;
 
     block_q4_1 * restrict y = vy;
 
+#if defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -810,6 +928,41 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
         __m128i res = packNibbles( i0 );
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
+#elif __ARM_NEON
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv[8];
+        float32x4_t minv[8];
+        float32x4_t maxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+
+        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
+        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
+
+        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
+        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
+
+        const float min = vminvq_f32(minv[0]);
+        const float max = vmaxvq_f32(maxv[0]);
+
+        const float d = (max - min) / ((1 << 4) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+        y[i].m = min;
+
+        const float32x4_t minv0 = vdupq_n_f32(min);
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
+            const int32x4_t vi = vcvtq_s32_f32(v);
+
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+        }
+    }
 #else
     // scalar
     quantize_row_q4_1_reference(x, vy, k);
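
The NEON path added here follows the same Q4_1 recipe as the scalar reference and the AVX2 path above: track the block minimum and maximum, store `d = (max - min)/15` and `m = min`, and quantize each value as `(x - min)/d`. A scalar sketch of that recipe (the struct is an illustrative stand-in, and exact rounding differs slightly between the reference and SIMD paths):

```c
#include <math.h>
#include <stdint.h>

#define QK 32  // block size, assumed from upstream ggml

typedef struct {
    float   d;          // scale
    float   m;          // block minimum
    uint8_t qs[QK / 2]; // two 4-bit quants per byte, low nibble first
} block_q4_1_sketch;    // illustrative stand-in for ggml's block_q4_1

static void quantize_block_q4_1(const float * restrict x, block_q4_1_sketch * restrict y) {
    float min = x[0];
    float max = x[0];
    for (int l = 1; l < QK; l++) {
        if (x[l] < min) min = x[l];
        if (x[l] > max) max = x[l];
    }

    const float d  = (max - min) / ((1 << 4) - 1); // 15 steps between min and max
    const float id = d ? 1.0f / d : 0.0f;

    y->d = d;
    y->m = min;

    for (int l = 0; l < QK; l += 2) {
        const uint8_t vi0 = (uint8_t)roundf((x[l + 0] - min) * id); // 0..15
        const uint8_t vi1 = (uint8_t)roundf((x[l + 1] - min) * id);
        y->qs[l / 2] = (uint8_t)(vi0 | (vi1 << 4));
    }
}
```
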
@@ -970,6 +1123,50 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
             }
         }
     }
+#elif defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        const float32x4_t vd = vdupq_n_f32(x[i].d);
+        const float32x4_t vm = vdupq_n_f32(x[i].m);
+
+        const uint8_t * restrict pp = x[i].qs;
+
+        for (int l = 0; l < QK; l += 16) {
+            // Load 16x4-bit integers into 8x8-bit integers
+            const uint8x8_t v8 = vld1_u8(pp + l/2);
+
+            // Expand 4-bit qs to 8-bit bytes
+            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+            const uint8x8_t v1 = vshr_n_u8(v8, 4);
+
+            // Interleave and combine
+            const uint8x8_t vx_0 = vzip1_u8(v0, v1);
+            const uint8x8_t vx_1 = vzip2_u8(v0, v1);
+
+            const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
+
+            // convert to 2x uint16x8_t
+            const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq));
+            const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq));
+
+            // convert to 4x float32x4_t
+            const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
+            const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
+            const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
+            const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
+
+            // multiply by d and add m
+            const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
+            const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
+            const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
+            const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
+
+            // Store
+            vst1q_f32(y + i*QK + l + 0, r0);
+            vst1q_f32(y + i*QK + l + 4, r1);
+            vst1q_f32(y + i*QK + l + 8, r2);
+            vst1q_f32(y + i*QK + l + 12, r3);
+        }
+    }
 #else
     for (int i = 0; i < nb; i++) {
         const float d = x[i].d;
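
Dequantization is the inverse mapping, `value = d * q + m`, which is what the `vmlaq_f32(vm, vf_k, vd)` lines compute four floats at a time. A scalar sketch, reusing the illustrative `block_q4_1_sketch` layout from the earlier example:

```c
static void dequantize_block_q4_1(const block_q4_1_sketch * restrict x, float * restrict y) {
    for (int l = 0; l < QK; l += 2) {
        const uint8_t b = x->qs[l / 2];

        // low nibble first, matching the quantizer's packing order
        y[l + 0] = x->d * (float)(b & 0x0F) + x->m;
        y[l + 1] = x->d * (float)(b >> 4)   + x->m;
    }
}
```
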
@@ -1207,7 +1404,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
     _mm256_storeu_ps(arr, y);
 
     for (int i = 0; i < 8; i++)
-        x[i] =
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
 }
 #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -1636,7 +1833,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     const block_q4_0 * restrict x = vx;
     const block_q4_0 * restrict y = vy;
 
-
+    float sumf = 0.0;
 
 #if defined(__ARM_NEON)
     float sum0 = 0.0f;
@@ -1731,7 +1928,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
 #endif
     }
 
-    sumf =
+    sumf = sum0 + sum1;
 #elif defined(__AVX512F__)
     // Initialize accumulator with zeros
     __m512 acc0 = _mm512_setzero_ps();
@@ -1739,7 +1936,6 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
 
     const int superblock_size = 8;
     const int superblock_count = nb / superblock_size;
-    const int remainder = nb % superblock_size;
 
     for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
         int i = superblock_ix * superblock_size;
@@ -1766,6 +1962,10 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     __m256 acc = _mm256_setzero_ps();
 
     // Main loop
+    // TODO: figure a way to do this in a portable way
+    #ifdef __GNUC__
+    #pragma GCC unroll 16
+    #endif
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
         const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
@@ -1779,20 +1979,21 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         bx = _mm256_sub_epi8( bx, off );
         by = _mm256_sub_epi8( by, off );
 
-        //
-        __m256i
-        __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
-        // Compute products of int16_t integers, add pairwise
-        __m256i i32 = _mm256_madd_epi16( x16, y16 );
+        // Get absolute values of x vectors
+        const __m256i ax = _mm256_sign_epi8(bx, bx);
 
-        // Sign
-
-
-        //
-
+        // Sign the values of the y vectors
+        const __m256i sy = _mm256_sign_epi8(by, bx);
+
+        // Perform multiplication and create 16-bit values
+        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+
+        const __m256i ones = _mm256_set1_epi16(1);
+        const __m256i i32 = _mm256_madd_epi16(ones, dot);
 
         // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps( i32 );
+        const __m256 p = _mm256_cvtepi32_ps( i32 );
+
         // Apply the scale, and accumulate
         acc = _mm256_fmadd_ps( d, p, acc );
     }
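
The rewritten loop avoids widening to 16-bit lanes by leaning on `_mm256_maddubs_epi16`, which multiplies unsigned bytes by signed bytes. It relies on the per-byte identity x·y = |x| · (sign(x) applied to y): `_mm256_sign_epi8(bx, bx)` yields |x| as the unsigned operand, and `_mm256_sign_epi8(by, bx)` negates y where x is negative and zeroes it where x is zero (harmless, since |x| is zero there too). The `_mm256_madd_epi16(ones, dot)` that follows simply adds adjacent 16-bit products into 32-bit lanes. A scalar sketch of the identity (illustrative, not gem code):

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

// Scalar model of pairing _mm256_sign_epi8 with _mm256_maddubs_epi16:
// multiply |x| (unsigned) by y carrying x's sign.
static int sign_trick_product(int8_t x, int8_t y) {
    const uint8_t ax = (uint8_t)abs(x);                        // _mm256_sign_epi8(bx, bx)
    const int8_t  sy = (int8_t)(x > 0 ? y : (x < 0 ? -y : 0)); // _mm256_sign_epi8(by, bx)
    return (int)ax * (int)sy;                                  // one lane of the maddubs product
}

int main(void) {
    // quantized values after the -8 offset lie in [-8, 7]
    for (int x = -8; x <= 7; x++) {
        for (int y = -8; y <= 7; y++) {
            assert(sign_trick_product((int8_t)x, (int8_t)y) == x * y);
        }
    }
    return 0;
}
```
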
@@ -1803,6 +2004,52 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
 
+    sumf = _mm_cvtss_f32( res );
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        // Compute combined scale for the block
+        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+
+        __m128i i32[2];
+        for (int j = 0; j < 2; ++j) {
+            // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
+            __m128i bx = bytesFromNibbles( x[i].qs + 8*j );
+            __m128i by = bytesFromNibbles( y[i].qs + 8*j );
+
+            // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+            const __m128i off = _mm_set1_epi8( 8 );
+            bx = _mm_sub_epi8( bx, off );
+            by = _mm_sub_epi8( by, off );
+
+            // Get absolute values of x vectors
+            const __m128i ax = _mm_sign_epi8(bx, bx);
+
+            // Sign the values of the y vectors
+            const __m128i sy = _mm_sign_epi8(by, bx);
+
+            // Perform multiplication and create 16-bit values
+            const __m128i dot = _mm_maddubs_epi16(ax, sy);
+
+            const __m128i ones = _mm_set1_epi16(1);
+            i32[j] = _mm_madd_epi16(ones, dot);
+        }
+
+        // Convert int32_t to float
+        __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
+        // Apply the scale, and accumulate
+        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
+    }
+
+    // Return horizontal sum of the acc vector
+    __m128 res = _mm256_extractf128_ps( acc, 1 );
+    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
+    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
+    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
+
     sumf = _mm_cvtss_f32( res );
 #elif defined(__wasm_simd128__)
     // wasm simd
@@ -1944,7 +2191,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
         // Compute cross scales for the block
         const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
         const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
-        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0xAA /* 0b10101010 */ );
 
         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
         __m256i bx = bytesFromNibbles( x[i].qs );
@@ -1990,6 +2237,45 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
 
     sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+#elif defined(__ARM_NEON)
+    float sum00 = 0.0f;
+    float sum01 = 0.0f;
+    float sum10 = 0.0f;
+    float sum11 = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q4_1 * restrict x0 = &x[i + 0];
+        const block_q4_1 * restrict y0 = &y[i + 0];
+
+        const uint8x16_t m4b = vdupq_n_u8(0xf);
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+
+        // and with 0xf
+        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
+        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
+
+        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
+        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
+
+        // dot product into uint16x8_t
+        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+
+        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+        const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
+        const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+
+        sum00 += x0->m*y0->m;
+        sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
+        sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
+        sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+    }
+
+    sumf = QK*sum00 + sum01 + sum10 + sum11;
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
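
The four accumulators correspond to expanding each dequantized element as d·q + m: for one block, Σ_l (d_x·q_x,l + m_x)(d_y·q_y,l + m_y) = QK·m_x·m_y + m_y·d_x·Σ_l q_x,l + m_x·d_y·Σ_l q_y,l + d_x·d_y·Σ_l q_x,l·q_y,l. Those four terms are what `sum00`, `sum01`, `sum10` and `sum11` accumulate (the `vaddvq_u8` reductions supply the nibble sums and the widening multiplies supply the cross term), and the final `sumf = QK*sum00 + sum01 + sum10 + sum11;` reassembles them.
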
@@ -2401,8 +2687,9 @@ struct ggml_context {
     void * mem_buffer;
     bool mem_buffer_owned;
     bool mem_buffer_mlocked;
+    bool no_alloc;
 
-    int
+    int n_objects;
 
     struct ggml_object * objects_begin;
     struct ggml_object * objects_end;
@@ -2619,6 +2906,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
         // initialize GELU, SILU and EXP F32 tables
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
@@ -2684,6 +2974,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.mem_buffer_mlocked =*/ false,
+        /*.no_alloc =*/ params.no_alloc,
        /*.n_objects =*/ 0,
         /*.objects_begin =*/ NULL,
         /*.objects_end =*/ NULL,
@@ -2751,36 +3042,47 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION \
+    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+#else
+#define MLOCK_SUGGESTION \
+    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+#endif
+
 bool ggml_mlock_supported(void) {
     return GGML_MLOCK_SUPPORT;
 }
 
+bool ggml_mlock(
+        struct ggml_context * ctx,
+        const void *opt_extra_addr,
+        size_t opt_extra_len,
+        char **err_p) {
+    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
 #if GGML_MLOCK_SUPPORT
-#ifdef __APPLE__
-#define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
-#else
-#define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
-#endif
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
     if (ctx->mem_buffer_mlocked) {
         return true;
     }
-    if (mlock(ctx->mem_buffer, ctx->mem_size)
-
-
-
+    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
+            (opt_extra_len &&
+             mlock(opt_extra_addr, opt_extra_len))) {
+        if ((*err_p = malloc(1024))) {
+            snprintf(*err_p, 1024,
+                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+                     ctx->mem_size + opt_extra_len,
+                     strerror(errno));
+        }
         return false;
     }
     ctx->mem_buffer_mlocked = true;
     return true;
-}
 #else // GGML_MLOCK_SUPPORT
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
     *err_p = strdup("can't mlock because it's not supported on this system");
     return false;
-}
 #endif // GGML_MLOCK_SUPPORT
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 
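
A minimal sketch of calling the reworked `ggml_mlock` from client code. The wrapper and its error handling are illustrative; only the four-argument signature and the malloc'd/strdup'd error string come from the diff:

```c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include "ggml.h"

// Illustrative: lock the context's buffer, passing no extra region (NULL/0).
static bool lock_context(struct ggml_context * ctx) {
    char * err = NULL;
    if (!ggml_mlock(ctx, NULL, 0, &err)) {
        fprintf(stderr, "ggml_mlock: %s", err ? err : "unknown error\n");
        free(err);
        return false;
    }
    return true;
}
```
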
@@ -2799,7 +3101,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t size_needed = 0;
 
-    if (data == NULL) {
+    if (data == NULL && !ctx->no_alloc) {
         size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
         for (int i = 1; i < n_dims; i++) {
             size_needed *= ne[i];
@@ -2883,7 +3185,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_runs =*/ 0,
         /*.perf_cycles =*/ 0,
         /*.perf_time_us =*/ 0,
-        /*.data =*/ data == NULL ? (void *)(result + 1) : data,
+        /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.pad =*/ { 0 },
     };
 
@@ -10146,6 +10448,7 @@ enum ggml_opt_result ggml_opt(
         struct ggml_init_params params_ctx = {
             .mem_size = 16*1024*1024,
             .mem_buffer = NULL,
+            .no_alloc = false,
         };
 
         ctx = ggml_init(params_ctx);
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -316,6 +316,7 @@ struct ggml_init_params {
     // memory pool
     size_t mem_size; // bytes
     void * mem_buffer; // if NULL, memory will be allocated internally
+    bool no_alloc; // don't allocate memory for the tensor data
 };
 
 void ggml_time_init(void); // call this once at the beginning of the program
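
A sketch of how the new flag might be used: with `no_alloc` set, tensors created in the context carry only metadata (their `data` stays NULL, per the `ggml_new_tensor_impl` change above), while the default `false` keeps the old behaviour. The sizes below are illustrative:

```c
#include <stdbool.h>

#include "ggml.h"

// Illustrative: a context whose tensors describe shapes/types only,
// without reserving space for their data in the memory pool.
static struct ggml_context * new_metadata_only_context(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    return ggml_init(params);
}
```
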
@@ -344,7 +345,11 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
 bool ggml_mlock_supported(void);
-bool ggml_mlock(
+bool ggml_mlock(
+    struct ggml_context * ctx,
+    const void *opt_extra_addr,
+    size_t opt_extra_len,
+    char **err_p);
 
 struct ggml_tensor * ggml_new_tensor(
     struct ggml_context * ctx,