llama_cpp 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +39 -1
- data/ext/llama_cpp/src/ggml.c +914 -509
- data/ext/llama_cpp/src/ggml.h +42 -27
- data/ext/llama_cpp/src/llama.cpp +293 -303
- data/ext/llama_cpp/src/llama.h +19 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -2
- data/sig/llama_cpp.rbs +52 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -16,6 +16,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <float.h>
 
@@ -79,6 +80,19 @@ static int sched_yield (void) {
 typedef void* thread_ret_t;
 #endif
 
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#endif
+
 #ifdef __HAIKU__
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
@@ -172,8 +186,13 @@ typedef double ggml_float;
 
 #ifdef __F16C__
 
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
 #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
 
 #elif defined(__POWER9_VECTOR__)
 
@@ -443,6 +462,39 @@ static inline __m128i packNibbles( __m256i bytes )
     __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
     return _mm_packus_epi16( r0, r1 );
 }
+#elif __AVX__
+static inline __m128i bytesFromNibbles( const uint8_t* rsi )
+{
+    // Load 8 bytes from memory
+    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+
+    // Expand bytes into uint16_t values
+    __m128i bytes = _mm_cvtepu8_epi16( tmp );
+
+    // Unpack values into individual bytes
+    const __m128i lowMask = _mm_set1_epi8( 0xF );
+    __m128i high = _mm_andnot_si128( lowMask, bytes );
+    __m128i low = _mm_and_si128( lowMask, bytes );
+    high = _mm_slli_epi16( high, 4 );
+    bytes = _mm_or_si128( low, high );
+    return bytes;
+}
+
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
 #endif
 
 // method 5
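Note: the new SSE-level `bytesFromNibbles`/`packNibbles` pair simply converts between 8 packed bytes (16 nibbles) and 16 unpacked byte values. A minimal scalar sketch of that behaviour, for orientation only (names and the helper functions below are illustrative, not part of ggml):

```c
#include <stdint.h>

// Scalar equivalent of bytesFromNibbles: expand 8 packed bytes (16 nibbles)
// into 16 separate bytes, low nibble first.
static void bytes_from_nibbles_scalar(const uint8_t *src, uint8_t dst[16]) {
    for (int i = 0; i < 8; i++) {
        dst[2*i + 0] = src[i] & 0x0F; // low nibble
        dst[2*i + 1] = src[i] >> 4;   // high nibble
    }
}

// Scalar equivalent of packNibbles: pack 16 values that fit in 4 bits
// back into 8 bytes, two values per byte.
static void pack_nibbles_scalar(const uint8_t src[16], uint8_t dst[8]) {
    for (int i = 0; i < 8; i++) {
        dst[i] = (uint8_t)((src[2*i + 0] & 0x0F) | ((src[2*i + 1] & 0x0F) << 4));
    }
}
```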
@@ -491,8 +543,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
         const uint8_t vi0 = (int8_t)roundf(v0) + 8;
         const uint8_t vi1 = (int8_t)roundf(v1) + 8;
 
-        assert(vi0
-        assert(vi1
+        assert(vi0 < 16);
+        assert(vi1 < 16);
 
         pp[l/2] = vi0 | (vi1 << 4);
     }
@@ -546,10 +598,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         }
     }
 #elif __ARM_NEON
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
         float32x4_t srcv [8];
         float32x4_t asrcv[8];
         float32x4_t amaxv[8];
@@ -561,7 +610,8 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
         for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
 
-
+        // absolute max
+        const float amax = MAX(
                 MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
                 MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));
 
@@ -575,11 +625,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
             const int32x4_t vi = vcvtq_s32_f32(vf);
 
-
-
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #elif defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
@@ -646,8 +694,81 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         __m128i res = packNibbles( i0 );
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
+#elif defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 7.0f;
+        y[i].d = d;
+        const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
+        const __m128i off = _mm_set1_epi8( 8);
+        ni0 = _mm_add_epi8( ni0, off );
+        ni4 = _mm_add_epi8( ni4, off );
+
+        // Compress the vector into 4 bit/value, and store
+        __m128i res = packNibbles( ni0, ni4 );
+        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
+    }
 #elif defined(__wasm_simd128__)
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
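Note: the added AVX path mirrors the existing scalar reference for Q4_0. Each block of QK floats is scaled so the largest magnitude maps to 7, rounded, offset by +8 into an unsigned nibble, and packed two values per byte. A self-contained scalar sketch of that scheme, assuming QK == 32 as in this file (struct and function names below are illustrative only):

```c
#include <math.h>
#include <stdint.h>

#define QK 32 // block size assumed here, matching this version of ggml.c

typedef struct {
    float   d;          // per-block scale
    uint8_t qs[QK / 2]; // two 4-bit values per byte
} q4_0_block;

// Scalar sketch of Q4_0 quantization: max |x| maps to 7, values are shifted
// by +8 so they fit in an unsigned nibble, then packed pairwise.
static void quantize_q4_0_sketch(const float *x, q4_0_block *out) {
    float amax = 0.0f;
    for (int i = 0; i < QK; i++) {
        const float ax = fabsf(x[i]);
        if (ax > amax) amax = ax;
    }

    const float d  = amax / 7.0f;
    const float id = d != 0.0f ? 1.0f / d : 0.0f;

    out->d = d;
    for (int i = 0; i < QK; i += 2) {
        const uint8_t v0 = (uint8_t)((int8_t)roundf(x[i + 0] * id) + 8);
        const uint8_t v1 = (uint8_t)((int8_t)roundf(x[i + 1] * id) + 8);
        out->qs[i / 2] = (uint8_t)(v0 | (v1 << 4));
    }
}
```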
@@ -676,11 +797,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
 
-
-
+            y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
+            y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #else
     // scalar
@@ -719,8 +838,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
         const uint8_t vi0 = roundf(v0);
         const uint8_t vi1 = roundf(v1);
 
-        assert(vi0
-        assert(vi1
+        assert(vi0 < 16);
+        assert(vi1 < 16);
 
         pp[l/2] = vi0 | (vi1 << 4);
     }
@@ -732,11 +851,11 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
 static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK == 0);
 
-#if defined(__AVX2__)
     const int nb = k / QK;
 
     block_q4_1 * restrict y = vy;
 
+#if defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -810,6 +929,41 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
         __m128i res = packNibbles( i0 );
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
+#elif __ARM_NEON
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv[8];
+        float32x4_t minv[8];
+        float32x4_t maxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+
+        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
+        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
+
+        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
+        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
+
+        const float min = vminvq_f32(minv[0]);
+        const float max = vmaxvq_f32(maxv[0]);
+
+        const float d = (max - min) / ((1 << 4) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+        y[i].m = min;
+
+        const float32x4_t minv0 = vdupq_n_f32(min);
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
+            const int32x4_t vi = vcvtq_s32_f32(v);
+
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+        }
+    }
 #else
     // scalar
     quantize_row_q4_1_reference(x, vy, k);
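Note: the new NEON branch uses the same affine scheme as the existing Q4_1 paths: a per-block minimum m and step d = (max - min)/15, with each value encoded as (x - m)/d. A scalar sketch modeled on the reference path (which rounds; the NEON code above truncates), with illustrative names only:

```c
#include <math.h>
#include <stdint.h>

#define QK 32 // block size assumed here

typedef struct {
    float   d;          // step between representable values
    float   m;          // block minimum
    uint8_t qs[QK / 2]; // two 4-bit codes per byte
} q4_1_block;

static void quantize_q4_1_sketch(const float *x, q4_1_block *out) {
    float min = x[0], max = x[0];
    for (int i = 1; i < QK; i++) {
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
    }

    const float d  = (max - min) / ((1 << 4) - 1); // 15 steps for a 4-bit code
    const float id = d != 0.0f ? 1.0f / d : 0.0f;

    out->d = d;
    out->m = min;
    for (int i = 0; i < QK; i += 2) {
        const uint8_t v0 = (uint8_t)roundf((x[i + 0] - min) * id);
        const uint8_t v1 = (uint8_t)roundf((x[i + 1] - min) * id);
        out->qs[i / 2] = (uint8_t)(v0 | (v1 << 4));
    }
}
```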
@@ -970,6 +1124,50 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
             }
         }
     }
+#elif defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        const float32x4_t vd = vdupq_n_f32(x[i].d);
+        const float32x4_t vm = vdupq_n_f32(x[i].m);
+
+        const uint8_t * restrict pp = x[i].qs;
+
+        for (int l = 0; l < QK; l += 16) {
+            // Load 16x4-bit integers into 8x8-bit integers
+            const uint8x8_t v8 = vld1_u8(pp + l/2);
+
+            // Expand 4-bit qs to 8-bit bytes
+            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+            const uint8x8_t v1 = vshr_n_u8(v8, 4);
+
+            // Interleave and combine
+            const uint8x8_t vx_0 = vzip1_u8(v0, v1);
+            const uint8x8_t vx_1 = vzip2_u8(v0, v1);
+
+            const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
+
+            // convert to 2x uint16x8_t
+            const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq));
+            const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq));
+
+            // convert to 4x float32x4_t
+            const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
+            const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
+            const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
+            const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
+
+            // multiply by d and add m
+            const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
+            const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
+            const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
+            const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
+
+            // Store
+            vst1q_f32(y + i*QK + l +  0, r0);
+            vst1q_f32(y + i*QK + l +  4, r1);
+            vst1q_f32(y + i*QK + l +  8, r2);
+            vst1q_f32(y + i*QK + l + 12, r3);
+        }
+    }
 #else
     for (int i = 0; i < nb; i++) {
         const float d = x[i].d;
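Note: the NEON dequantization above computes y = d*q + m per nibble, matching the scalar path. A minimal scalar sketch of one block (illustrative names, assuming QK == 32):

```c
#include <stdint.h>

#define QK 32

// Unpack each 4-bit code q from a q4_1 block and emit y = d*q + m,
// low nibble first, as the vectorized code above does.
static void dequantize_q4_1_sketch(float d, float m,
                                   const uint8_t qs[QK / 2], float y[QK]) {
    for (int i = 0; i < QK / 2; i++) {
        const uint8_t lo = qs[i] & 0x0F;
        const uint8_t hi = qs[i] >> 4;
        y[2*i + 0] = d * (float)lo + m;
        y[2*i + 1] = d * (float)hi + m;
    }
}
```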
@@ -1207,7 +1405,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
     _mm256_storeu_ps(arr, y);
 
     for (int i = 0; i < 8; i++)
-        x[i] =
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
 }
 #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -1636,7 +1834,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     const block_q4_0 * restrict x = vx;
     const block_q4_0 * restrict y = vy;
 
-
+    float sumf = 0.0;
 
 #if defined(__ARM_NEON)
     float sum0 = 0.0f;
@@ -1731,7 +1929,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
 #endif
     }
 
-    sumf =
+    sumf = sum0 + sum1;
 #elif defined(__AVX512F__)
     // Initialize accumulator with zeros
     __m512 acc0 = _mm512_setzero_ps();
@@ -1739,7 +1937,6 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
 
     const int superblock_size = 8;
     const int superblock_count = nb / superblock_size;
-    const int remainder = nb % superblock_size;
 
     for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
         int i = superblock_ix * superblock_size;
@@ -1765,36 +1962,116 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
+
+    /* Prepare the constants we will need during execution */
+    const __m256i lowMask = _mm256_set1_epi8( 0xF );
+    const __m256i offset_8 = _mm256_set1_epi16( 8 );
+
+#define UNROLL_COUNT 8
+    // make sure we only unroll multiples of the block count
+    assert(nb % UNROLL_COUNT == 0);
+
+    // Main loop
+    for (int i = 0; i < nb; i+=UNROLL_COUNT) {
+
+        // This loop will be unrolled by the compiler
+        for (int u=0;u<UNROLL_COUNT;u++) {
+            /* Compute combined scale for the block */
+            const __m256 scale = _mm256_mul_ps(
+                    _mm256_broadcast_ss( &x[i+u].d ),
+                    _mm256_broadcast_ss( &y[i+u].d ) );
+
+            /* get input from x
+               Input: 32 Nibbles (16 bytes) at *x[i+u]
+               Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
+
+            /* Load 16 bytes from memory */
+            const __m128i tmp_x = _mm_loadu_si128( ( const __m128i* ) x[i+u].qs);
+            /* Expand bytes into uint16_t values */
+            const __m256i bytes_x = _mm256_cvtepu8_epi16(tmp_x);
+            /* Unpack values into individual bytes */
+            __m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
+            const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
+            __m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
+            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+            x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
+            x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );
+
+            /* get input from y
+               Input: 32 Nibbles (16 bytes) at *y[i+u]
+               Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
+
+            /* Load 16 bytes from memory */
+            const __m128i tmp_y = _mm_loadu_si128( (const __m128i* ) y[i+u].qs);
+            /* Expand bytes into uint16_t values */
+            const __m256i bytes_y = _mm256_cvtepu8_epi16(tmp_y);
+            /* Unpack values into individual bytes */
+            const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
+            __m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
+            __m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
+            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+            y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
+            y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );
+
+            /* Compute products of int16_t integers, add pairwise, store as int32_t */
+            __m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
+            __m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );
+
+            /* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */
+            __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );
+
+            /* Convert to vectore of 8 int32_t to 8 floats */
+            __m256 q = _mm256_cvtepi32_ps( xy_q );
+
+            /* Multiply q with scale and accumulate */
+            acc = _mm256_fmadd_ps( scale, q, acc );
+        }
+
+    }
+
+    // Return horizontal sum of the acc vector
+    __m128 res = _mm256_extractf128_ps( acc, 1 );
+    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
+    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
+    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
+
+    sumf = _mm_cvtss_f32( res );
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
     // Main loop
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
         const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
 
-
-
-
+        __m128i i32[2];
+        for (int j = 0; j < 2; ++j) {
+            // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
+            __m128i bx = bytesFromNibbles( x[i].qs + 8*j );
+            __m128i by = bytesFromNibbles( y[i].qs + 8*j );
 
-
-
-
-
+            // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+            const __m128i off = _mm_set1_epi8( 8 );
+            bx = _mm_sub_epi8( bx, off );
+            by = _mm_sub_epi8( by, off );
 
-
-
-            __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
-            // Compute products of int16_t integers, add pairwise
-            __m256i i32 = _mm256_madd_epi16( x16, y16 );
+            // Get absolute values of x vectors
+            const __m128i ax = _mm_sign_epi8(bx, bx);
 
-
-
-
-
-
+            // Sign the values of the y vectors
+            const __m128i sy = _mm_sign_epi8(by, bx);
+
+            // Perform multiplication and create 16-bit values
+            const __m128i dot = _mm_maddubs_epi16(ax, sy);
+
+            const __m128i ones = _mm_set1_epi16(1);
+            i32[j] = _mm_madd_epi16(ones, dot);
+        }
 
         // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps( i32 );
+        __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
         // Apply the scale, and accumulate
-        acc =
+        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
     }
 
     // Return horizontal sum of the acc vector
@@ -1944,7 +2221,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
         // Compute cross scales for the block
         const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
         const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
-        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0xAA /* 0b10101010 */ );
 
         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
         __m256i bx = bytesFromNibbles( x[i].qs );
@@ -1990,6 +2267,45 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
 
     sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+#elif defined(__ARM_NEON)
+    float sum00 = 0.0f;
+    float sum01 = 0.0f;
+    float sum10 = 0.0f;
+    float sum11 = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q4_1 * restrict x0 = &x[i + 0];
+        const block_q4_1 * restrict y0 = &y[i + 0];
+
+        const uint8x16_t m4b = vdupq_n_u8(0xf);
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+
+        // and with 0xf
+        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
+        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
+
+        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
+        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
+
+        // dot product into uint16x8_t
+        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+
+        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+        const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
+        const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+
+        sum00 += x0->m*y0->m;
+        sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
+        sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
+        sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+    }
+
+    sumf = QK*sum00 + sum01 + sum10 + sum11;
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
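Note: the final line of the new NEON branch, `sumf = QK*sum00 + sum01 + sum10 + sum11`, relies on expanding the affine representation of both operands. A scalar sketch of the identity it implements (illustrative names only):

```c
#include <stdint.h>

// With x_i = dx*qx_i + mx and y_i = dy*qy_i + my, the block dot product is
//   sum_i x_i*y_i = QK*mx*my
//                 + my*dx*sum_i qx_i
//                 + mx*dy*sum_i qy_i
//                 + dx*dy*sum_i qx_i*qy_i
// so only integer sums of the nibbles are needed.
static float q4_1_block_dot_sketch(float dx, float mx, const uint8_t *qx,
                                   float dy, float my, const uint8_t *qy,
                                   int qk /* block size, e.g. 32 */) {
    int sum_x = 0, sum_y = 0, sum_xy = 0;
    for (int i = 0; i < qk/2; i++) {
        const int x0 = qx[i] & 0x0F, x1 = qx[i] >> 4;
        const int y0 = qy[i] & 0x0F, y1 = qy[i] >> 4;
        sum_x  += x0 + x1;
        sum_y  += y0 + y1;
        sum_xy += x0*y0 + x1*y1;
    }
    return qk*mx*my + my*dx*sum_x + mx*dy*sum_y + dx*dy*sum_xy;
}
```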
@@ -2401,8 +2717,9 @@ struct ggml_context {
     void * mem_buffer;
     bool mem_buffer_owned;
     bool mem_buffer_mlocked;
+    bool no_alloc;
 
-    int
+    int n_objects;
 
     struct ggml_object * objects_begin;
     struct ggml_object * objects_end;
@@ -2487,7 +2804,7 @@ void ggml_print_objects(const struct ggml_context * ctx) {
     GGML_PRINT("%s: --- end ---\n", __func__);
 }
 
-
+int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -2619,6 +2936,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
         // initialize GELU, SILU and EXP F32 tables
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
@@ -2684,6 +3004,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.mem_buffer_mlocked =*/ false,
+        /*.no_alloc =*/ params.no_alloc,
         /*.n_objects =*/ 0,
         /*.objects_begin =*/ NULL,
         /*.objects_end =*/ NULL,
@@ -2751,36 +3072,47 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION \
+    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+#else
+#define MLOCK_SUGGESTION \
+    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+#endif
+
 bool ggml_mlock_supported(void) {
     return GGML_MLOCK_SUPPORT;
 }
 
+bool ggml_mlock(
+        struct ggml_context * ctx,
+        const void *opt_extra_addr,
+        size_t opt_extra_len,
+        char **err_p) {
+    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
 #if GGML_MLOCK_SUPPORT
-#ifdef __APPLE__
-#define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
-#else
-#define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
-#endif
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
     if (ctx->mem_buffer_mlocked) {
         return true;
     }
-    if (mlock(ctx->mem_buffer, ctx->mem_size)
-
-
-
+    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
+        (opt_extra_len &&
+         mlock(opt_extra_addr, opt_extra_len))) {
+        if ((*err_p = malloc(1024))) {
+            snprintf(*err_p, 1024,
+                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+                     ctx->mem_size + opt_extra_len,
+                     strerror(errno));
+        }
         return false;
     }
     ctx->mem_buffer_mlocked = true;
     return true;
-}
 #else // GGML_MLOCK_SUPPORT
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
     *err_p = strdup("can't mlock because it's not supported on this system");
     return false;
-}
 #endif // GGML_MLOCK_SUPPORT
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 
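Note: after this change `ggml_mlock()` takes an optional extra address/length pair in addition to the context buffer, and reports errors through a heap-allocated string. A hypothetical call site, assuming a valid context (the wrapper name and parameters below are made up for illustration):

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"

// Lock the context buffer (and optionally an extra mapped region, e.g. a
// memory-mapped model file) into RAM; print the suggestion text on failure.
static bool lock_model_memory(struct ggml_context * ctx,
                              const void * extra_addr, size_t extra_len) {
    char * err = NULL;
    if (!ggml_mlock(ctx, extra_addr, extra_len, &err)) {
        fprintf(stderr, "mlock failed: %s", err ? err : "(unknown error)\n");
        free(err);
        return false;
    }
    return true;
}
```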
@@ -2788,7 +3120,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
-        const
+        const int64_t* ne,
         void* data) {
     // always insert objects at the end of the context's memory pool
     struct ggml_object * obj_cur = ctx->objects_end;
@@ -2799,7 +3131,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t size_needed = 0;
 
-    if (data == NULL) {
+    if (data == NULL && !ctx->no_alloc) {
         size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
         for (int i = 1; i < n_dims; i++) {
             size_needed *= ne[i];
@@ -2883,11 +3215,12 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_runs =*/ 0,
         /*.perf_cycles =*/ 0,
         /*.perf_time_us =*/ 0,
-        /*.data =*/ data == NULL ? (void *)(result + 1) : data,
+        /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.pad =*/ { 0 },
     };
 
-
+    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
+    //ggml_assert_aligned(result->data);
 
     for (int i = 0; i < n_dims; i++) {
         result->ne[i] = ne[i];
@@ -2908,44 +3241,44 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
-        const
+        const int64_t * ne) {
     return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
 }
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum ggml_type type,
-
+        int64_t ne0) {
     return ggml_new_tensor(ctx, type, 1, &ne0);
 }
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum ggml_type type,
-
-
-        const
+        int64_t ne0,
+        int64_t ne1) {
+    const int64_t ne[2] = { ne0, ne1 };
     return ggml_new_tensor(ctx, type, 2, ne);
 }
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum ggml_type type,
-
-
-
-        const
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2) {
+    const int64_t ne[3] = { ne0, ne1, ne2 };
    return ggml_new_tensor(ctx, type, 3, ne);
 }
 
 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum ggml_type type,
-
-
-
-
-        const
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3) {
+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     return ggml_new_tensor(ctx, type, 4, ne);
 }
 
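Note: this block of hunks changes all tensor dimension parameters from int to int64_t. Callers look the same as before; the only difference is that dimensions are now 64-bit values. A small illustrative usage (the sizes are invented for the example):

```c
#include <stdint.h>
#include "ggml.h"

// Create a 2-D F32 tensor with the new int64_t dimension API.
static struct ggml_tensor * make_embedding_table(struct ggml_context * ctx) {
    const int64_t n_embd  = 4096;  // illustrative sizes, not from the diff
    const int64_t n_vocab = 32000;
    return ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
}
```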
@@ -3288,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
-
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+
+    result->nb[0] = src->nb[0];
+    result->nb[1] = src->nb[1];
+    result->nb[2] = src->nb[2];
+    result->nb[3] = src->nb[3];
+
+    return result;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -3592,7 +3932,7 @@ struct ggml_tensor * ggml_mean(
         is_node = true;
     }
 
-
+    int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
 
     result->op = GGML_OP_MEAN;
@@ -3953,7 +4293,7 @@ struct ggml_tensor * ggml_mul_mat(
         is_node = true;
     }
 
-    const
+    const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
 
     result->op = GGML_OP_MUL_MAT;
@@ -4078,8 +4418,8 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
-
+        int64_t ne0,
+        int64_t ne1) {
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
 
@@ -4090,7 +4430,7 @@ struct ggml_tensor * ggml_reshape_2d(
         is_node = true;
     }
 
-    const
+    const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
 
     result->op = GGML_OP_RESHAPE;
@@ -4104,9 +4444,9 @@ struct ggml_tensor * ggml_reshape_2d(
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
-
-
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2) {
     GGML_ASSERT(ggml_is_contiguous(a));
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
 
@@ -4117,7 +4457,7 @@ struct ggml_tensor * ggml_reshape_3d(
         is_node = true;
     }
 
-    const
+    const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
 
     result->op = GGML_OP_RESHAPE;
@@ -4133,7 +4473,7 @@ struct ggml_tensor * ggml_reshape_3d(
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
+        int64_t ne0,
         size_t offset) {
     if (a->grad) {
         GGML_ASSERT(false); // gradient propagation is not supported
@@ -4154,15 +4494,15 @@ struct ggml_tensor * ggml_view_1d(
 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
-
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1,
         size_t offset) {
     if (a->grad) {
         GGML_ASSERT(false); // gradient propagation is not supported
     }
 
-    const
+    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
 
@@ -4178,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d(
     return result;
 }
 
+// ggml_view_3d
+
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t nb1,
+        size_t nb2,
+        size_t offset) {
+    if (a->grad) {
+        GGML_ASSERT(false); // gradient propagation is not supported
+    }
+
+    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+
+    result->nb[1] = nb1;
+    result->nb[2] = nb2;
+    result->nb[3] = result->nb[2]*ne2;
+
+    result->op = GGML_OP_VIEW;
+    result->grad = NULL;
+    result->src0 = a;
+    result->src1 = NULL; // TODO: maybe store the offset here?
+
+    return result;
+}
+
 // ggml_permute
 
 struct ggml_tensor * ggml_permute(
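Note: `ggml_view_3d()` is new in this version and takes explicit byte strides for the second and third dimensions. A hypothetical call, assuming an F32 tensor whose data is laid out contiguously (the function name and shapes below are invented for illustration):

```c
#include <stdint.h>
#include "ggml.h"

// Reinterpret a contiguous F32 activation as a [head_dim, n_head, n_tokens]
// view without copying any data.
static struct ggml_tensor * split_heads(struct ggml_context * ctx,
                                        struct ggml_tensor * cur, // assumed F32
                                        int64_t head_dim, int64_t n_head, int64_t n_tokens) {
    return ggml_view_3d(ctx, cur,
            head_dim, n_head, n_tokens,
            head_dim*sizeof(float),         // nb1: byte stride between rows of the view
            head_dim*n_head*sizeof(float),  // nb2: byte stride between 2-D slices
            0);                             // byte offset into cur->data
}
```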
@@ -4393,7 +4764,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
         is_node = true;
     }
 
-    const
+    const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     result->op = GGML_OP_CONV_1D_1S;
@@ -4420,7 +4791,7 @@ struct ggml_tensor * ggml_conv_1d_2s(
         is_node = true;
     }
 
-    const
+    const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     result->op = GGML_OP_CONV_1D_2S;
@@ -4513,102 +4884,112 @@ static void ggml_compute_forward_dup_f16(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
     GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const
-    const
-    const
-    const
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb00 = src0->nb[0];
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
         memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
         return;
     }
 
-    if (src0->
-
-
-
-
-
-
-
-
-
-
-
-                        id++;
-                    }
+    if (src0->type == dst->type &&
+        src0->ne[0] == dst->ne[0] &&
+        src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
                 }
             }
-    } else if (dst->type == GGML_TYPE_F32) {
-        size_t id = 0;
-        float * dst_ptr = (float *) dst->data;
-
-        for (int i03 = 0; i03 < ne03; i03++) {
-            for (int i02 = 0; i02 < ne02; i02++) {
-                for (int i01 = 0; i01 < ne01; i01++) {
-                    for (int i00 = 0; i00 < ne00; i00++) {
-                        const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                        dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
-                        id++;
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
         }
-
-
-
-    if (dst->type == GGML_TYPE_F32) {
-        size_t id = 0;
-        float * dst_ptr = (float *) dst->data;
-
-        for (int i03 = 0; i03 < ne03; i03++) {
-            for (int i02 = 0; i02 < ne02; i02++) {
-                for (int i01 = 0; i01 < ne01; i01++) {
-                    for (int i00 = 0; i00 < ne00; i00++) {
-                        const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+        return;
+    }
 
-
-
+    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
+
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
                         }
                     }
                 }
            }
-        }
-
-
-
-
-
-
-
-
-
-
-
+        }
+    } else if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
                        }
                    }
                }
            }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
         }
+    } else {
+        GGML_ASSERT(false); // TODO: implement
     }
 }
 
@@ -4617,102 +4998,92 @@ static void ggml_compute_forward_dup_f32(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
     GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const
-    const
-    const
-    const
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb00 = src0->nb[0];
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
         memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
         return;
     }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-        id++;
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(float));
+
+                        if (++i10 == dst->ne[0]) {
+                            i10 = 0;
+                            if (++i11 == dst->ne[1]) {
+                                i11 = 0;
+                                if (++i12 == dst->ne[2]) {
+                                    i12 = 0;
+                                    if (++i13 == dst->ne[3]) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
                         }
                     }
                 }
            }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
         }
-    } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-        for (int i03 = 0; i03 < ne03; i03++) {
-            for (int i02 = 0; i02 < ne02; i02++) {
-                for (int i01 = 0; i01 < ne01; i01++) {
-                    for (int i00 = 0; i00 < ne00; i00++) {
-                        const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                        dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                        id++;
+    } else if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
+
+                        if (++i10 == dst->ne[0]) {
+                            i10 = 0;
+                            if (++i11 == dst->ne[1]) {
+                                i11 = 0;
+                                if (++i12 == dst->ne[2]) {
+                                    i12 = 0;
+                                    if (++i13 == dst->ne[3]) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
                        }
                    }
                }
            }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
         }
+    } else {
+        GGML_ASSERT(false); // TODO: implement
     }
 }
 
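Note: both rewritten dup kernels now walk the source in its own (possibly strided) layout while advancing separate destination indices i10..i13, so the copy no longer requires a contiguous destination. A minimal 2-D sketch of that counter pattern (generic C, not the library's code):

```c
#include <stdint.h>
#include <string.h>

// Copy an ne0 x ne1 matrix of floats between buffers with arbitrary byte
// strides, advancing destination counters the way the dup kernels above do.
static void copy_strided_2d(const char *src, size_t src_nb0, size_t src_nb1,
                            char *dst, size_t dst_nb0, size_t dst_nb1,
                            int64_t ne0, int64_t ne1) {
    int64_t i10 = 0, i11 = 0; // destination counters
    for (int64_t i01 = 0; i01 < ne1; i01++) {
        for (int64_t i00 = 0; i00 < ne0; i00++) {
            const char *s = src + i00*src_nb0 + i01*src_nb1;
            char       *d = dst + i10*dst_nb0 + i11*dst_nb1;
            memcpy(d, s, sizeof(float));
            if (++i10 == ne0) { // wrap the fastest-moving dst index
                i10 = 0;
                ++i11;
            }
        }
    }
}
```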
@@ -5087,18 +5458,18 @@ static void ggml_compute_forward_sum_f32(
     assert(ggml_is_scalar(dst));
     assert(src0->nb[0] == sizeof(float));
 
-    const
-    const
-    const
-    const
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-    for (
-    for (
-    for (
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
                 ggml_vec_sum_f32(ne00,
                         (float *) (dst->data),
                         (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
@@ -5143,19 +5514,19 @@ static void ggml_compute_forward_mean_f32(
 
     assert(src0->nb[0] == sizeof(float));
 
-    const
-    const
-    const
-    const
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-    const
-    const
-    const
-    const
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
 
     assert(ne0 == 1);
     assert(ne1 == ne01);
@@ -5171,9 +5542,9 @@ static void ggml_compute_forward_mean_f32(
     const size_t nb2 = dst->nb[2];
     const size_t nb3 = dst->nb[3];
 
-    for (
-    for (
-    for (
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
                 ggml_vec_sum_f32(ne00,
                         (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
                         (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
@@ -5660,10 +6031,10 @@ static void ggml_compute_forward_norm_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const
-    const
-    const
-    const
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
@@ -5676,13 +6047,13 @@ static void ggml_compute_forward_norm_f32(
     const float eps = 1e-5f; // TODO: make this a parameter
 
     // TODO: optimize
-    for (
-    for (
-    for (
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                 const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
 
                 ggml_float sum = 0.0;
-                for (
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     sum += (ggml_float)x[i00];
                 }
 
@@ -5691,7 +6062,7 @@ static void ggml_compute_forward_norm_f32(
                 float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
 
                 ggml_float sum2 = 0.0;
-                for (
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     float v = x[i00] - mean;
                     y[i00] = v;
                     sum2 += (ggml_float)(v*v);
@@ -5743,10 +6114,10 @@ static void ggml_compute_forward_rms_norm_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const
-    const
-    const
-    const
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
     const size_t nb01 = src0->nb[1];
     const size_t nb02 = src0->nb[2];
@@ -5759,13 +6130,13 @@ static void ggml_compute_forward_rms_norm_f32(
     const float eps = 1e-6f; // TODO: make this a parameter
 
     // TODO: optimize
-    for (
-    for (
-    for (
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                 const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
 
                 ggml_float sum = 0.0;
-                for (
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
                     sum += (ggml_float)(x[i00] * x[i00]);
                 }
 
@@ -5818,13 +6189,13 @@ static bool ggml_compute_forward_mul_mat_use_blas(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    //const
-    //const
+    //const int64_t ne00 = src0->ne[0];
+    //const int64_t ne01 = src0->ne[1];
 
-    const
+    const int64_t ne10 = src1->ne[0];
 
-    const
-    const
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
@@ -5846,23 +6217,23 @@ static void ggml_compute_forward_mul_mat_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const
-    const
-    const
-    const
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    const
+    const int64_t ne10 = src1->ne[0];
 #endif
-    const
+    const int64_t ne11 = src1->ne[1];
 #ifndef NDEBUG
-    const
-    const
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
-    const
-    const
-    const
-    const
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
 
     const int nb00 = src0->nb[0];
 #endif
@@ -5922,8 +6293,8 @@ static void ggml_compute_forward_mul_mat_f32(
         return;
     }
 
-    for (
-    for (
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
             const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
             const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
@@ -5970,7 +6341,7 @@ static void ggml_compute_forward_mul_mat_f32(
         const int i02 = (ir - i03*ne02*ne01)/ne01;
         const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
 
-        for (
+        for (int64_t ic = 0; ic < ne11; ++ic) {
             // src1 indices
             const int i13 = i03;
             const int i12 = i02;
@@ -6011,21 +6382,21 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const
-    const
-    const
-    const
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
-    const
-    const
-    const
-    const
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
-    const
-    const
-    const
-    const
-    //const
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+    //const int64_t ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -6085,12 +6456,12 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
         float * const wdata = params->wdata;
 
-        for (
-        for (
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
                 {
                     size_t id = 0;
-                    for (
-                    for (
+                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
+                        for (int64_t i00 = 0; i00 < ne00; ++i00) {
                             wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
                         }
                     }
@@ -6120,10 +6491,10 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         ggml_fp16_t * const wdata = params->wdata;
 
         size_t id = 0;
-        for (
-        for (
-        for (
-        for (
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                    for (int64_t i10 = 0; i10 < ne10; ++i10) {
                         wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
                     }
                 }
@@ -6175,7 +6546,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
         float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
 
-        for (
+        for (int64_t ic = 0; ic < ne11; ++ic) {
             ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
         }
     }
@@ -6224,20 +6595,20 @@ static void ggml_compute_forward_mul_mat_q_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
-    const
-    const
-    const
-    const
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
 
-    const
-    const
-    const
-    const
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
-    const
-    const
-    const
-    const
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
|
6241
6612
|
|
6242
6613
|
const int nb00 = src0->nb[0];
|
6243
6614
|
const int nb01 = src0->nb[1];
|
@@ -6301,11 +6672,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
6301
6672
|
float * const wdata = params->wdata;
|
6302
6673
|
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
|
6303
6674
|
|
6304
|
-
for (
|
6305
|
-
for (
|
6675
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
6676
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
6306
6677
|
{
|
6307
6678
|
size_t id = 0;
|
6308
|
-
for (
|
6679
|
+
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
6309
6680
|
dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
|
6310
6681
|
id += ne00;
|
6311
6682
|
}
|
@@ -6335,9 +6706,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
6335
6706
|
char * wdata = params->wdata;
|
6336
6707
|
const size_t row_size = ne10*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type];
|
6337
6708
|
|
6338
|
-
for (
|
6339
|
-
for (
|
6340
|
-
for (
|
6709
|
+
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
6710
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
6711
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
6341
6712
|
quantize_row_q((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
|
6342
6713
|
wdata += row_size;
|
6343
6714
|
}
|
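The hunk above quantizes src1 row by row into wdata, stepping the write pointer by row_size = ne10*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type]. A worked example of that arithmetic, assuming a hypothetical 4-bit block type that stores 32 values as one float scale plus 16 packed bytes; the block parameters are assumptions for illustration, not values read from this diff:

    #include <stdio.h>
    #include <stddef.h>

    int main(void) {
        const size_t type_size = 20;    // bytes per block (assumed)
        const size_t blck_size = 32;    // values per block (assumed)
        const size_t ne10      = 4096;  // row length in values (example)
        const size_t row_size  = ne10*type_size/blck_size;
        printf("%zu bytes per quantized row\n", row_size);   // 2560
        return 0;
    }
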
@@ -6386,7 +6757,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
6386
6757
|
|
6387
6758
|
assert(ne00 % 32 == 0);
|
6388
6759
|
|
6389
|
-
for (
|
6760
|
+
for (int64_t ic = 0; ic < ne11; ++ic) {
|
6390
6761
|
vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
|
6391
6762
|
}
|
6392
6763
|
}
|
@@ -6867,7 +7238,6 @@ static void ggml_compute_forward_rope_f32(
|
|
6867
7238
|
const struct ggml_tensor * src0,
|
6868
7239
|
const struct ggml_tensor * src1,
|
6869
7240
|
struct ggml_tensor * dst) {
|
6870
|
-
assert(params->ith == 0);
|
6871
7241
|
assert(src1->type == GGML_TYPE_I32);
|
6872
7242
|
assert(ggml_nelements(src1) == 3);
|
6873
7243
|
|
@@ -6879,10 +7249,10 @@ static void ggml_compute_forward_rope_f32(
|
|
6879
7249
|
const int n_dims = ((int32_t *) src1->data)[1];
|
6880
7250
|
const int mode = ((int32_t *) src1->data)[2];
|
6881
7251
|
|
6882
|
-
//const
|
6883
|
-
const
|
6884
|
-
const
|
6885
|
-
const
|
7252
|
+
//const int64_t ne0 = src0->ne[0];
|
7253
|
+
const int64_t ne1 = src0->ne[1];
|
7254
|
+
const int64_t ne2 = src0->ne[2];
|
7255
|
+
const int64_t ne3 = src0->ne[3];
|
6886
7256
|
|
6887
7257
|
const int nb0 = src0->nb[0];
|
6888
7258
|
const int nb1 = src0->nb[1];
|
@@ -6894,11 +7264,28 @@ static void ggml_compute_forward_rope_f32(
|
|
6894
7264
|
|
6895
7265
|
assert(nb0 == sizeof(float));
|
6896
7266
|
|
6897
|
-
|
6898
|
-
|
6899
|
-
|
7267
|
+
const int ith = params->ith;
|
7268
|
+
const int nth = params->nth;
|
7269
|
+
|
7270
|
+
const int nr = ggml_nrows(src0);
|
7271
|
+
|
7272
|
+
// rows per thread
|
7273
|
+
const int dr = (nr + nth - 1)/nth;
|
7274
|
+
|
7275
|
+
// row range for this thread
|
7276
|
+
const int ir0 = dr*ith;
|
7277
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
7278
|
+
|
7279
|
+
// row index used to determine which thread to use
|
7280
|
+
int ir = 0;
|
7281
|
+
|
7282
|
+
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
7283
|
+
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
6900
7284
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
6901
|
-
for (
|
7285
|
+
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
7286
|
+
if (ir++ < ir0) continue;
|
7287
|
+
if (ir > ir1) break;
|
7288
|
+
|
6902
7289
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
6903
7290
|
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
6904
7291
|
|
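The rope hunks (f32 here, f16 in the next block) drop the single-thread assert and instead split the rows of src0 across threads: each worker gets dr = ceil(nr/nth) rows, and the running counter ir skips rows outside its [ir0, ir1) slice. Later in this diff, ggml_graph_compute plans GGML_OP_ROPE with n_tasks = n_threads to match. A minimal standalone sketch of the same partitioning arithmetic, with made-up sizes and MIN written out because no ggml headers are assumed:

    #include <stdio.h>

    int main(void) {
        const int nr  = 10;                     // total rows (example)
        const int nth = 4;                      // worker threads (example)
        const int dr  = (nr + nth - 1)/nth;     // rows per thread, rounded up: 3
        for (int ith = 0; ith < nth; ith++) {
            const int ir0 = dr*ith;                             // first row
            const int ir1 = ir0 + dr < nr ? ir0 + dr : nr;      // MIN(ir0 + dr, nr)
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        return 0;
    }
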
@@ -6924,7 +7311,6 @@ static void ggml_compute_forward_rope_f16(
|
|
6924
7311
|
const struct ggml_tensor * src0,
|
6925
7312
|
const struct ggml_tensor * src1,
|
6926
7313
|
struct ggml_tensor * dst) {
|
6927
|
-
assert(params->ith == 0);
|
6928
7314
|
assert(src1->type == GGML_TYPE_I32);
|
6929
7315
|
assert(ggml_nelements(src1) == 3);
|
6930
7316
|
|
@@ -6936,10 +7322,10 @@ static void ggml_compute_forward_rope_f16(
|
|
6936
7322
|
const int n_dims = ((int32_t *) src1->data)[1];
|
6937
7323
|
const int mode = ((int32_t *) src1->data)[2];
|
6938
7324
|
|
6939
|
-
//const
|
6940
|
-
const
|
6941
|
-
const
|
6942
|
-
const
|
7325
|
+
//const int64_t ne0 = src0->ne[0];
|
7326
|
+
const int64_t ne1 = src0->ne[1];
|
7327
|
+
const int64_t ne2 = src0->ne[2];
|
7328
|
+
const int64_t ne3 = src0->ne[3];
|
6943
7329
|
|
6944
7330
|
const int nb0 = src0->nb[0];
|
6945
7331
|
const int nb1 = src0->nb[1];
|
@@ -6951,10 +7337,28 @@ static void ggml_compute_forward_rope_f16(
|
|
6951
7337
|
|
6952
7338
|
assert(nb0 == sizeof(ggml_fp16_t));
|
6953
7339
|
|
6954
|
-
|
6955
|
-
|
7340
|
+
const int ith = params->ith;
|
7341
|
+
const int nth = params->nth;
|
7342
|
+
|
7343
|
+
const int nr = ggml_nrows(src0);
|
7344
|
+
|
7345
|
+
// rows per thread
|
7346
|
+
const int dr = (nr + nth - 1)/nth;
|
7347
|
+
|
7348
|
+
// row range for this thread
|
7349
|
+
const int ir0 = dr*ith;
|
7350
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
7351
|
+
|
7352
|
+
// row index used to determine which thread to use
|
7353
|
+
int ir = 0;
|
7354
|
+
|
7355
|
+
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
7356
|
+
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
6956
7357
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
6957
|
-
for (
|
7358
|
+
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
7359
|
+
if (ir++ < ir0) continue;
|
7360
|
+
if (ir > ir1) break;
|
7361
|
+
|
6958
7362
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
6959
7363
|
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
6960
7364
|
|
@@ -7015,21 +7419,21 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
7015
7419
|
int64_t t0 = ggml_perf_time_us();
|
7016
7420
|
UNUSED(t0);
|
7017
7421
|
|
7018
|
-
const
|
7019
|
-
const
|
7020
|
-
const
|
7021
|
-
//const
|
7422
|
+
const int64_t ne00 = src0->ne[0];
|
7423
|
+
const int64_t ne01 = src0->ne[1];
|
7424
|
+
const int64_t ne02 = src0->ne[2];
|
7425
|
+
//const int64_t ne03 = src0->ne[3];
|
7022
7426
|
|
7023
|
-
const
|
7024
|
-
const
|
7025
|
-
//const
|
7026
|
-
//const
|
7427
|
+
const int64_t ne10 = src1->ne[0];
|
7428
|
+
const int64_t ne11 = src1->ne[1];
|
7429
|
+
//const int64_t ne12 = src1->ne[2];
|
7430
|
+
//const int64_t ne13 = src1->ne[3];
|
7027
7431
|
|
7028
|
-
//const
|
7029
|
-
//const
|
7030
|
-
//const
|
7031
|
-
//const
|
7032
|
-
//const
|
7432
|
+
//const int64_t ne0 = dst->ne[0];
|
7433
|
+
//const int64_t ne1 = dst->ne[1];
|
7434
|
+
//const int64_t ne2 = dst->ne[2];
|
7435
|
+
//const int64_t ne3 = dst->ne[3];
|
7436
|
+
//const int64_t ne = ne0*ne1*ne2*ne3;
|
7033
7437
|
|
7034
7438
|
const int nb00 = src0->nb[0];
|
7035
7439
|
const int nb01 = src0->nb[1];
|
@@ -7066,11 +7470,11 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
7066
7470
|
{
|
7067
7471
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
7068
7472
|
|
7069
|
-
for (
|
7070
|
-
for (
|
7473
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
7474
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
7071
7475
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
|
7072
7476
|
ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
|
7073
|
-
for (
|
7477
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
7074
7478
|
dst_data[i00*ew0 + i01] = src[i00];
|
7075
7479
|
}
|
7076
7480
|
}
|
@@ -7081,10 +7485,10 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
7081
7485
|
{
|
7082
7486
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
|
7083
7487
|
|
7084
|
-
for (
|
7488
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
7085
7489
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
7086
7490
|
ggml_fp16_t * dst_data = wdata;
|
7087
|
-
for (
|
7491
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
7088
7492
|
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
|
7089
7493
|
}
|
7090
7494
|
}
|
@@ -7109,7 +7513,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
7109
7513
|
|
7110
7514
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
7111
7515
|
float * dst_data = (float *)((char *) dst->data + i1*nb1);
|
7112
|
-
for (
|
7516
|
+
for (int64_t i0 = 0; i0 < ne10; ++i0) {
|
7113
7517
|
dst_data[i0] = 0;
|
7114
7518
|
for (int k = -nh; k <= nh; k++) {
|
7115
7519
|
float v = 0.0f;
|
@@ -7135,21 +7539,21 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
7135
7539
|
int64_t t0 = ggml_perf_time_us();
|
7136
7540
|
UNUSED(t0);
|
7137
7541
|
|
7138
|
-
const
|
7139
|
-
const
|
7140
|
-
const
|
7141
|
-
//const
|
7542
|
+
const int64_t ne00 = src0->ne[0];
|
7543
|
+
const int64_t ne01 = src0->ne[1];
|
7544
|
+
const int64_t ne02 = src0->ne[2];
|
7545
|
+
//const int64_t ne03 = src0->ne[3];
|
7142
7546
|
|
7143
|
-
const
|
7144
|
-
const
|
7145
|
-
//const
|
7146
|
-
//const
|
7547
|
+
const int64_t ne10 = src1->ne[0];
|
7548
|
+
const int64_t ne11 = src1->ne[1];
|
7549
|
+
//const int64_t ne12 = src1->ne[2];
|
7550
|
+
//const int64_t ne13 = src1->ne[3];
|
7147
7551
|
|
7148
|
-
//const
|
7149
|
-
//const
|
7150
|
-
//const
|
7151
|
-
//const
|
7152
|
-
//const
|
7552
|
+
//const int64_t ne0 = dst->ne[0];
|
7553
|
+
//const int64_t ne1 = dst->ne[1];
|
7554
|
+
//const int64_t ne2 = dst->ne[2];
|
7555
|
+
//const int64_t ne3 = dst->ne[3];
|
7556
|
+
//const int64_t ne = ne0*ne1*ne2*ne3;
|
7153
7557
|
|
7154
7558
|
const int nb00 = src0->nb[0];
|
7155
7559
|
const int nb01 = src0->nb[1];
|
@@ -7186,11 +7590,11 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
7186
7590
|
{
|
7187
7591
|
float * const wdata = (float *) params->wdata + 0;
|
7188
7592
|
|
7189
|
-
for (
|
7190
|
-
for (
|
7593
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
7594
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
7191
7595
|
const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
|
7192
7596
|
float * dst_data = wdata + i02*ew0*ne00;
|
7193
|
-
for (
|
7597
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
7194
7598
|
dst_data[i00*ew0 + i01] = src[i00];
|
7195
7599
|
}
|
7196
7600
|
}
|
@@ -7201,10 +7605,10 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
7201
7605
|
{
|
7202
7606
|
float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
|
7203
7607
|
|
7204
|
-
for (
|
7608
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
7205
7609
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
7206
7610
|
float * dst_data = wdata;
|
7207
|
-
for (
|
7611
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
7208
7612
|
dst_data[(i10 + nh)*ew0 + i11] = src[i10];
|
7209
7613
|
}
|
7210
7614
|
}
|
@@ -7229,7 +7633,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
7229
7633
|
|
7230
7634
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
7231
7635
|
float * dst_data = (float *)((char *) dst->data + i1*nb1);
|
7232
|
-
for (
|
7636
|
+
for (int64_t i0 = 0; i0 < ne10; ++i0) {
|
7233
7637
|
dst_data[i0] = 0;
|
7234
7638
|
for (int k = -nh; k <= nh; k++) {
|
7235
7639
|
float v = 0.0f;
|
@@ -7283,21 +7687,21 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
7283
7687
|
int64_t t0 = ggml_perf_time_us();
|
7284
7688
|
UNUSED(t0);
|
7285
7689
|
|
7286
|
-
const
|
7287
|
-
const
|
7288
|
-
const
|
7289
|
-
//const
|
7690
|
+
const int64_t ne00 = src0->ne[0];
|
7691
|
+
const int64_t ne01 = src0->ne[1];
|
7692
|
+
const int64_t ne02 = src0->ne[2];
|
7693
|
+
//const int64_t ne03 = src0->ne[3];
|
7290
7694
|
|
7291
|
-
const
|
7292
|
-
const
|
7293
|
-
//const
|
7294
|
-
//const
|
7695
|
+
const int64_t ne10 = src1->ne[0];
|
7696
|
+
const int64_t ne11 = src1->ne[1];
|
7697
|
+
//const int64_t ne12 = src1->ne[2];
|
7698
|
+
//const int64_t ne13 = src1->ne[3];
|
7295
7699
|
|
7296
|
-
//const
|
7297
|
-
//const
|
7298
|
-
//const
|
7299
|
-
//const
|
7300
|
-
//const
|
7700
|
+
//const int64_t ne0 = dst->ne[0];
|
7701
|
+
//const int64_t ne1 = dst->ne[1];
|
7702
|
+
//const int64_t ne2 = dst->ne[2];
|
7703
|
+
//const int64_t ne3 = dst->ne[3];
|
7704
|
+
//const int64_t ne = ne0*ne1*ne2*ne3;
|
7301
7705
|
|
7302
7706
|
const int nb00 = src0->nb[0];
|
7303
7707
|
const int nb01 = src0->nb[1];
|
@@ -7334,11 +7738,11 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
7334
7738
|
{
|
7335
7739
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
7336
7740
|
|
7337
|
-
for (
|
7338
|
-
for (
|
7741
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
7742
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
7339
7743
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
|
7340
7744
|
ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
|
7341
|
-
for (
|
7745
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
7342
7746
|
dst_data[i00*ew0 + i01] = src[i00];
|
7343
7747
|
}
|
7344
7748
|
}
|
@@ -7349,10 +7753,10 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
7349
7753
|
{
|
7350
7754
|
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
|
7351
7755
|
|
7352
|
-
for (
|
7756
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
7353
7757
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
7354
7758
|
ggml_fp16_t * dst_data = wdata;
|
7355
|
-
for (
|
7759
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
7356
7760
|
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
|
7357
7761
|
}
|
7358
7762
|
}
|
@@ -7377,7 +7781,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
7377
7781
|
|
7378
7782
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
7379
7783
|
float * dst_data = (float *)((char *) dst->data + i1*nb1);
|
7380
|
-
for (
|
7784
|
+
for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
|
7381
7785
|
dst_data[i0/2] = 0;
|
7382
7786
|
for (int k = -nh; k <= nh; k++) {
|
7383
7787
|
float v = 0.0f;
|
@@ -7403,21 +7807,21 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
7403
7807
|
int64_t t0 = ggml_perf_time_us();
|
7404
7808
|
UNUSED(t0);
|
7405
7809
|
|
7406
|
-
const
|
7407
|
-
const
|
7408
|
-
const
|
7409
|
-
//const
|
7810
|
+
const int64_t ne00 = src0->ne[0];
|
7811
|
+
const int64_t ne01 = src0->ne[1];
|
7812
|
+
const int64_t ne02 = src0->ne[2];
|
7813
|
+
//const int64_t ne03 = src0->ne[3];
|
7410
7814
|
|
7411
|
-
const
|
7412
|
-
const
|
7413
|
-
//const
|
7414
|
-
//const
|
7815
|
+
const int64_t ne10 = src1->ne[0];
|
7816
|
+
const int64_t ne11 = src1->ne[1];
|
7817
|
+
//const int64_t ne12 = src1->ne[2];
|
7818
|
+
//const int64_t ne13 = src1->ne[3];
|
7415
7819
|
|
7416
|
-
//const
|
7417
|
-
//const
|
7418
|
-
//const
|
7419
|
-
//const
|
7420
|
-
//const
|
7820
|
+
//const int64_t ne0 = dst->ne[0];
|
7821
|
+
//const int64_t ne1 = dst->ne[1];
|
7822
|
+
//const int64_t ne2 = dst->ne[2];
|
7823
|
+
//const int64_t ne3 = dst->ne[3];
|
7824
|
+
//const int64_t ne = ne0*ne1*ne2*ne3;
|
7421
7825
|
|
7422
7826
|
const int nb00 = src0->nb[0];
|
7423
7827
|
const int nb01 = src0->nb[1];
|
@@ -7454,11 +7858,11 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
7454
7858
|
{
|
7455
7859
|
float * const wdata = (float *) params->wdata + 0;
|
7456
7860
|
|
7457
|
-
for (
|
7458
|
-
for (
|
7861
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
7862
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
7459
7863
|
const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
|
7460
7864
|
float * dst_data = wdata + i02*ew0*ne00;
|
7461
|
-
for (
|
7865
|
+
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
7462
7866
|
dst_data[i00*ew0 + i01] = src[i00];
|
7463
7867
|
}
|
7464
7868
|
}
|
@@ -7469,10 +7873,10 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
7469
7873
|
{
|
7470
7874
|
float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
|
7471
7875
|
|
7472
|
-
for (
|
7876
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
7473
7877
|
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
7474
7878
|
float * dst_data = wdata;
|
7475
|
-
for (
|
7879
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
7476
7880
|
dst_data[(i10 + nh)*ew0 + i11] = src[i10];
|
7477
7881
|
}
|
7478
7882
|
}
|
@@ -7497,7 +7901,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
7497
7901
|
|
7498
7902
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
7499
7903
|
float * dst_data = (float *)((char *) dst->data + i1*nb1);
|
7500
|
-
for (
|
7904
|
+
for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
|
7501
7905
|
dst_data[i0/2] = 0;
|
7502
7906
|
for (int k = -nh; k <= nh; k++) {
|
7503
7907
|
float v = 0.0f;
|
@@ -7549,25 +7953,25 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
7549
7953
|
int64_t t0 = ggml_perf_time_us();
|
7550
7954
|
UNUSED(t0);
|
7551
7955
|
|
7552
|
-
const
|
7553
|
-
const
|
7554
|
-
const
|
7555
|
-
const
|
7956
|
+
const int64_t neq0 = q->ne[0];
|
7957
|
+
const int64_t neq1 = q->ne[1];
|
7958
|
+
const int64_t neq2 = q->ne[2];
|
7959
|
+
const int64_t neq3 = q->ne[3];
|
7556
7960
|
|
7557
|
-
const
|
7558
|
-
const
|
7559
|
-
//const
|
7560
|
-
//const
|
7961
|
+
const int64_t nek0 = k->ne[0];
|
7962
|
+
const int64_t nek1 = k->ne[1];
|
7963
|
+
//const int64_t nek2 = k->ne[2];
|
7964
|
+
//const int64_t nek3 = k->ne[3];
|
7561
7965
|
|
7562
|
-
//const
|
7563
|
-
const
|
7564
|
-
//const
|
7565
|
-
//const
|
7966
|
+
//const int64_t nev0 = v->ne[0];
|
7967
|
+
const int64_t nev1 = v->ne[1];
|
7968
|
+
//const int64_t nev2 = v->ne[2];
|
7969
|
+
//const int64_t nev3 = v->ne[3];
|
7566
7970
|
|
7567
|
-
const
|
7568
|
-
const
|
7569
|
-
//const
|
7570
|
-
//const
|
7971
|
+
const int64_t ne0 = dst->ne[0];
|
7972
|
+
const int64_t ne1 = dst->ne[1];
|
7973
|
+
//const int64_t ne2 = dst->ne[2];
|
7974
|
+
//const int64_t ne3 = dst->ne[3];
|
7571
7975
|
|
7572
7976
|
const int nbk0 = k->nb[0];
|
7573
7977
|
const int nbk1 = k->nb[1];
|
@@ -7592,10 +7996,10 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
7592
7996
|
const int ith = params->ith;
|
7593
7997
|
const int nth = params->nth;
|
7594
7998
|
|
7595
|
-
const
|
7596
|
-
const
|
7597
|
-
const
|
7598
|
-
const
|
7999
|
+
const int64_t D = neq0;
|
8000
|
+
const int64_t N = neq1;
|
8001
|
+
const int64_t P = nek1 - N;
|
8002
|
+
const int64_t M = P + N;
|
7599
8003
|
|
7600
8004
|
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
7601
8005
|
|
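One reading of the widened scalars in the flash-attention hunk above (an interpretation, not text from the diff): D is the vector length per position, N the number of query positions handled in this call, P the number of positions already present in the key/value cache, and M = P + N the total number of key positions each query is scored against. With assumed shapes:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        const int64_t neq0 = 64, neq1 = 8, nek1 = 40;   // assumed shapes
        const int64_t D = neq0;       // elements per query vector
        const int64_t N = neq1;       // query positions in this call
        const int64_t P = nek1 - N;   // positions already cached
        const int64_t M = P + N;      // key positions scored per query
        printf("D=%" PRId64 " N=%" PRId64 " P=%" PRId64 " M=%" PRId64 "\n", D, N, P, M);
        return 0;
    }
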
@@ -7657,7 +8061,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
7657
8061
|
S[i] = -INFINITY;
|
7658
8062
|
}
|
7659
8063
|
|
7660
|
-
for (
|
8064
|
+
for (int64_t ic = 0; ic < nek1; ++ic) {
|
7661
8065
|
// k indices
|
7662
8066
|
const int ik3 = iq3;
|
7663
8067
|
const int ik2 = iq2;
|
@@ -7676,7 +8080,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
7676
8080
|
ggml_vec_scale_f32(nek1, S, scale);
|
7677
8081
|
|
7678
8082
|
if (masked) {
|
7679
|
-
for (
|
8083
|
+
for (int64_t i = P; i < M; i++) {
|
7680
8084
|
if (i > P + iq1) {
|
7681
8085
|
S[i] = -INFINITY;
|
7682
8086
|
}
|
@@ -7734,7 +8138,7 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
7734
8138
|
#endif
|
7735
8139
|
}
|
7736
8140
|
|
7737
|
-
for (
|
8141
|
+
for (int64_t ic = 0; ic < nev1; ++ic) {
|
7738
8142
|
// dst indices
|
7739
8143
|
const int i1 = iq1;
|
7740
8144
|
const int i2 = iq2;
|
@@ -7758,25 +8162,25 @@ static void ggml_compute_forward_flash_attn_f16(
|
|
7758
8162
|
int64_t t0 = ggml_perf_time_us();
|
7759
8163
|
UNUSED(t0);
|
7760
8164
|
|
7761
|
-
const
|
7762
|
-
const
|
7763
|
-
const
|
7764
|
-
const
|
8165
|
+
const int64_t neq0 = q->ne[0];
|
8166
|
+
const int64_t neq1 = q->ne[1];
|
8167
|
+
const int64_t neq2 = q->ne[2];
|
8168
|
+
const int64_t neq3 = q->ne[3];
|
7765
8169
|
|
7766
|
-
const
|
7767
|
-
const
|
7768
|
-
//const
|
7769
|
-
//const
|
8170
|
+
const int64_t nek0 = k->ne[0];
|
8171
|
+
const int64_t nek1 = k->ne[1];
|
8172
|
+
//const int64_t nek2 = k->ne[2];
|
8173
|
+
//const int64_t nek3 = k->ne[3];
|
7770
8174
|
|
7771
|
-
//const
|
7772
|
-
const
|
7773
|
-
//const
|
7774
|
-
//const
|
8175
|
+
//const int64_t nev0 = v->ne[0];
|
8176
|
+
const int64_t nev1 = v->ne[1];
|
8177
|
+
//const int64_t nev2 = v->ne[2];
|
8178
|
+
//const int64_t nev3 = v->ne[3];
|
7775
8179
|
|
7776
|
-
const
|
7777
|
-
const
|
7778
|
-
//const
|
7779
|
-
//const
|
8180
|
+
const int64_t ne0 = dst->ne[0];
|
8181
|
+
const int64_t ne1 = dst->ne[1];
|
8182
|
+
//const int64_t ne2 = dst->ne[2];
|
8183
|
+
//const int64_t ne3 = dst->ne[3];
|
7780
8184
|
|
7781
8185
|
const int nbk0 = k->nb[0];
|
7782
8186
|
const int nbk1 = k->nb[1];
|
@@ -7801,10 +8205,10 @@ static void ggml_compute_forward_flash_attn_f16(
|
|
7801
8205
|
const int ith = params->ith;
|
7802
8206
|
const int nth = params->nth;
|
7803
8207
|
|
7804
|
-
const
|
7805
|
-
const
|
7806
|
-
const
|
7807
|
-
const
|
8208
|
+
const int64_t D = neq0;
|
8209
|
+
const int64_t N = neq1;
|
8210
|
+
const int64_t P = nek1 - N;
|
8211
|
+
const int64_t M = P + N;
|
7808
8212
|
|
7809
8213
|
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
7810
8214
|
|
@@ -7867,7 +8271,7 @@ static void ggml_compute_forward_flash_attn_f16(
|
|
7867
8271
|
}
|
7868
8272
|
|
7869
8273
|
if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
|
7870
|
-
for (
|
8274
|
+
for (int64_t ic = 0; ic < nek1; ++ic) {
|
7871
8275
|
// k indices
|
7872
8276
|
const int ik3 = iq3;
|
7873
8277
|
const int ik2 = iq2;
|
@@ -7882,7 +8286,7 @@ static void ggml_compute_forward_flash_attn_f16(
|
|
7882
8286
|
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
7883
8287
|
}
|
7884
8288
|
} else {
|
7885
|
-
for (
|
8289
|
+
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
7886
8290
|
// k indices
|
7887
8291
|
const int ik3 = iq3;
|
7888
8292
|
const int ik2 = iq2;
|
@@ -7902,7 +8306,7 @@ static void ggml_compute_forward_flash_attn_f16(
|
|
7902
8306
|
ggml_vec_scale_f32(nek1, S, scale);
|
7903
8307
|
|
7904
8308
|
if (masked) {
|
7905
|
-
for (
|
8309
|
+
for (int64_t i = P; i < M; i++) {
|
7906
8310
|
if (i > P + iq1) {
|
7907
8311
|
S[i] = -INFINITY;
|
7908
8312
|
}
|
@@ -7962,12 +8366,12 @@ static void ggml_compute_forward_flash_attn_f16(
|
|
7962
8366
|
|
7963
8367
|
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
|
7964
8368
|
|
7965
|
-
for (
|
8369
|
+
for (int64_t i = 0; i < M; i++) {
|
7966
8370
|
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
7967
8371
|
}
|
7968
8372
|
|
7969
8373
|
if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
|
7970
|
-
for (
|
8374
|
+
for (int64_t ic = 0; ic < nev1; ++ic) {
|
7971
8375
|
// dst indices
|
7972
8376
|
const int i1 = iq1;
|
7973
8377
|
const int i2 = iq2;
|
@@ -7979,7 +8383,7 @@ static void ggml_compute_forward_flash_attn_f16(
|
|
7979
8383
|
S16);
|
7980
8384
|
}
|
7981
8385
|
} else {
|
7982
|
-
for (
|
8386
|
+
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
7983
8387
|
// dst indices
|
7984
8388
|
const int i1 = iq1;
|
7985
8389
|
const int i2 = iq2;
|
@@ -8035,35 +8439,35 @@ static void ggml_compute_forward_flash_ff_f16(
|
|
8035
8439
|
int64_t t0 = ggml_perf_time_us();
|
8036
8440
|
UNUSED(t0);
|
8037
8441
|
|
8038
|
-
const
|
8039
|
-
const
|
8040
|
-
const
|
8041
|
-
const
|
8442
|
+
const int64_t nea0 = a->ne[0];
|
8443
|
+
const int64_t nea1 = a->ne[1];
|
8444
|
+
const int64_t nea2 = a->ne[2];
|
8445
|
+
const int64_t nea3 = a->ne[3];
|
8042
8446
|
|
8043
|
-
const
|
8044
|
-
const
|
8045
|
-
//const
|
8046
|
-
//const
|
8447
|
+
const int64_t neb00 = b0->ne[0];
|
8448
|
+
const int64_t neb01 = b0->ne[1];
|
8449
|
+
//const int64_t neb02 = b0->ne[2];
|
8450
|
+
//const int64_t neb03 = b0->ne[3];
|
8047
8451
|
|
8048
|
-
const
|
8049
|
-
const
|
8050
|
-
//const
|
8051
|
-
//const
|
8452
|
+
const int64_t neb10 = b1->ne[0];
|
8453
|
+
const int64_t neb11 = b1->ne[1];
|
8454
|
+
//const int64_t neb12 = b1->ne[2];
|
8455
|
+
//const int64_t neb13 = b1->ne[3];
|
8052
8456
|
|
8053
|
-
const
|
8054
|
-
const
|
8055
|
-
//const
|
8056
|
-
//const
|
8457
|
+
const int64_t nec00 = c0->ne[0];
|
8458
|
+
const int64_t nec01 = c0->ne[1];
|
8459
|
+
//const int64_t nec02 = c0->ne[2];
|
8460
|
+
//const int64_t nec03 = c0->ne[3];
|
8057
8461
|
|
8058
|
-
const
|
8059
|
-
const
|
8060
|
-
//const
|
8061
|
-
//const
|
8462
|
+
const int64_t nec10 = c1->ne[0];
|
8463
|
+
const int64_t nec11 = c1->ne[1];
|
8464
|
+
//const int64_t nec12 = c1->ne[2];
|
8465
|
+
//const int64_t nec13 = c1->ne[3];
|
8062
8466
|
|
8063
|
-
const
|
8064
|
-
const
|
8065
|
-
const
|
8066
|
-
//const
|
8467
|
+
const int64_t ne0 = dst->ne[0];
|
8468
|
+
const int64_t ne1 = dst->ne[1];
|
8469
|
+
const int64_t ne2 = dst->ne[2];
|
8470
|
+
//const int64_t ne3 = dst->ne[3];
|
8067
8471
|
|
8068
8472
|
const int nba0 = a->nb[0];
|
8069
8473
|
const int nba1 = a->nb[1];
|
@@ -8098,9 +8502,9 @@ static void ggml_compute_forward_flash_ff_f16(
|
|
8098
8502
|
const int ith = params->ith;
|
8099
8503
|
const int nth = params->nth;
|
8100
8504
|
|
8101
|
-
const
|
8102
|
-
//const
|
8103
|
-
const
|
8505
|
+
const int64_t D = nea0;
|
8506
|
+
//const int64_t N = nea1;
|
8507
|
+
const int64_t M = neb01;
|
8104
8508
|
|
8105
8509
|
GGML_ASSERT(ne0 == nea0);
|
8106
8510
|
GGML_ASSERT(ne1 == nea1);
|
@@ -8156,7 +8560,7 @@ static void ggml_compute_forward_flash_ff_f16(
|
|
8156
8560
|
|
8157
8561
|
float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
|
8158
8562
|
|
8159
|
-
for (
|
8563
|
+
for (int64_t ic = 0; ic < neb01; ++ic) {
|
8160
8564
|
// b0 indices
|
8161
8565
|
const int ib03 = ia3;
|
8162
8566
|
const int ib02 = ia2;
|
@@ -8176,7 +8580,7 @@ static void ggml_compute_forward_flash_ff_f16(
|
|
8176
8580
|
|
8177
8581
|
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
8178
8582
|
|
8179
|
-
for (
|
8583
|
+
for (int64_t i = 0; i < M; i++) {
|
8180
8584
|
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
8181
8585
|
}
|
8182
8586
|
|
@@ -8188,7 +8592,7 @@ static void ggml_compute_forward_flash_ff_f16(
|
|
8188
8592
|
const int i2 = ia2;
|
8189
8593
|
const int i3 = ia3;
|
8190
8594
|
|
8191
|
-
for (
|
8595
|
+
for (int64_t ic = 0; ic < nec01; ++ic) {
|
8192
8596
|
|
8193
8597
|
ggml_vec_dot_f16(neb01,
|
8194
8598
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
@@ -9053,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
9053
9457
|
} break;
|
9054
9458
|
case GGML_OP_ROPE:
|
9055
9459
|
{
|
9056
|
-
node->n_tasks =
|
9460
|
+
node->n_tasks = n_threads;
|
9057
9461
|
} break;
|
9058
9462
|
case GGML_OP_CONV_1D_1S:
|
9059
9463
|
case GGML_OP_CONV_1D_2S:
|
@@ -9091,7 +9495,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
9091
9495
|
|
9092
9496
|
size_t cur = 0;
|
9093
9497
|
|
9094
|
-
const
|
9498
|
+
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
|
9095
9499
|
|
9096
9500
|
if (node->src1->type == GGML_TYPE_F32) {
|
9097
9501
|
cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
|
@@ -9350,7 +9754,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|
9350
9754
|
|
9351
9755
|
perf_total_per_op_us[node->op] += node->perf_time_us;
|
9352
9756
|
|
9353
|
-
GGML_PRINT(" - %3d: [ %
|
9757
|
+
GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 ", %" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
|
9354
9758
|
i,
|
9355
9759
|
node->ne[0], node->ne[1], node->ne[2],
|
9356
9760
|
GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
|
@@ -9364,7 +9768,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|
9364
9768
|
for (int i = 0; i < cgraph->n_leafs; i++) {
|
9365
9769
|
struct ggml_tensor * node = cgraph->leafs[i];
|
9366
9770
|
|
9367
|
-
GGML_PRINT(" - %3d: [ %
|
9771
|
+
GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 "] %8s\n",
|
9368
9772
|
i,
|
9369
9773
|
node->ne[0], node->ne[1],
|
9370
9774
|
GGML_OP_LABEL[node->op]);
|
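The two ggml_graph_print hunks above, and the ggml_graph_dump_dot hunks below, swap the old %d conversions for the PRId64 macros because the ne fields being printed are now int64_t. A minimal example of the idiom, independent of ggml:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne0 = 4096, ne1 = 4096;
        // "%" PRId64 expands to the correct printf conversion for int64_t
        // on each platform (e.g. "ld" on LP64, "lld" on LLP64)
        printf("[ %" PRId64 ", %" PRId64 " ]\n", ne0, ne1);
        return 0;
    }
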
@@ -9435,7 +9839,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
9435
9839
|
|
9436
9840
|
fprintf(fp, " \"%p\" [ \
|
9437
9841
|
style = filled; fillcolor = %s; shape = record; \
|
9438
|
-
label=\"%d [%
|
9842
|
+
label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
|
9439
9843
|
(void *) node, color,
|
9440
9844
|
i, node->ne[0], node->ne[1],
|
9441
9845
|
GGML_OP_SYMBOL[node->op]);
|
@@ -9460,7 +9864,7 @@ label=\"<x>%.1e\"; ]\n",
|
|
9460
9864
|
} else {
|
9461
9865
|
fprintf(fp, " \"%p\" [ \
|
9462
9866
|
style = filled; fillcolor = %s; shape = record; \
|
9463
|
-
label=\"<x>CONST %d [%
|
9867
|
+
label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
|
9464
9868
|
(void *) node, color,
|
9465
9869
|
i, node->ne[0], node->ne[1]);
|
9466
9870
|
}
|
@@ -9524,9 +9928,9 @@ label=\"<x>CONST %d [%d, %d]\"; ]\n",
|
|
9524
9928
|
static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
|
9525
9929
|
int i = 0;
|
9526
9930
|
for (int p = 0; p < np; ++p) {
|
9527
|
-
const
|
9931
|
+
const int64_t ne = ggml_nelements(ps[p]) ;
|
9528
9932
|
// TODO: add function to set tensor from array
|
9529
|
-
for (
|
9933
|
+
for (int64_t j = 0; j < ne; ++j) {
|
9530
9934
|
ggml_set_f32_1d(ps[p], j, x[i++]);
|
9531
9935
|
}
|
9532
9936
|
}
|
@@ -9535,9 +9939,9 @@ static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const f
|
|
9535
9939
|
static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
|
9536
9940
|
int i = 0;
|
9537
9941
|
for (int p = 0; p < np; ++p) {
|
9538
|
-
const
|
9942
|
+
const int64_t ne = ggml_nelements(ps[p]) ;
|
9539
9943
|
// TODO: add function to get all elements at once
|
9540
|
-
for (
|
9944
|
+
for (int64_t j = 0; j < ne; ++j) {
|
9541
9945
|
x[i++] = ggml_get_f32_1d(ps[p], j);
|
9542
9946
|
}
|
9543
9947
|
}
|
@@ -9546,9 +9950,9 @@ static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float *
|
|
9546
9950
|
static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
|
9547
9951
|
int i = 0;
|
9548
9952
|
for (int p = 0; p < np; ++p) {
|
9549
|
-
const
|
9953
|
+
const int64_t ne = ggml_nelements(ps[p]) ;
|
9550
9954
|
// TODO: add function to get all elements at once
|
9551
|
-
for (
|
9955
|
+
for (int64_t j = 0; j < ne; ++j) {
|
9552
9956
|
g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
|
9553
9957
|
}
|
9554
9958
|
}
|
@@ -10146,6 +10550,7 @@ enum ggml_opt_result ggml_opt(
|
|
10146
10550
|
struct ggml_init_params params_ctx = {
|
10147
10551
|
.mem_size = 16*1024*1024,
|
10148
10552
|
.mem_buffer = NULL,
|
10553
|
+
.no_alloc = false,
|
10149
10554
|
};
|
10150
10555
|
|
10151
10556
|
ctx = ggml_init(params_ctx);
|
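The final hunk adds an explicit .no_alloc = false to the ggml_init_params initializer used by ggml_opt, presumably because the struct gained that field in this release. With C designated initializers any field left unnamed is zero-initialized anyway, so naming it mainly keeps the intent visible. A small standalone illustration -- example_params is a stand-in struct, not the real ggml type:

    #include <stdbool.h>
    #include <stddef.h>

    struct example_params {
        size_t mem_size;
        void * mem_buffer;
        bool   no_alloc;
    };

    // fields not mentioned in a designated initializer are zeroed,
    // so these two definitions produce identical objects
    static const struct example_params a = { .mem_size = 16*1024*1024, .mem_buffer = NULL };
    static const struct example_params b = { .mem_size = 16*1024*1024, .mem_buffer = NULL,
                                             .no_alloc = false };

    int main(void) {
        return (a.no_alloc == b.no_alloc) ? 0 : 1;   // both false
    }
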