numkong 7.4.5 → 7.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/binding.gyp +99 -5
- package/c/dispatch_e5m2.c +23 -3
- package/c/dispatch_f16.c +23 -0
- package/c/numkong.c +0 -13
- package/include/numkong/attention/sme.h +34 -31
- package/include/numkong/capabilities.h +2 -15
- package/include/numkong/cast/README.md +3 -0
- package/include/numkong/cast/haswell.h +28 -64
- package/include/numkong/cast/neon.h +15 -0
- package/include/numkong/cast/serial.h +17 -0
- package/include/numkong/cast/skylake.h +67 -52
- package/include/numkong/cast.h +1 -0
- package/include/numkong/curved/smef64.h +82 -62
- package/include/numkong/dot/README.md +1 -0
- package/include/numkong/dot/haswell.h +92 -13
- package/include/numkong/dot/rvvbf16.h +1 -1
- package/include/numkong/dot/rvvhalf.h +1 -1
- package/include/numkong/dot/serial.h +15 -0
- package/include/numkong/dot/skylake.h +61 -14
- package/include/numkong/dot/sve.h +6 -5
- package/include/numkong/dot/svebfdot.h +2 -1
- package/include/numkong/dot/svehalf.h +6 -5
- package/include/numkong/dot/svesdot.h +3 -2
- package/include/numkong/dots/README.md +2 -0
- package/include/numkong/dots/graniteamx.h +1167 -0
- package/include/numkong/dots/haswell.h +28 -28
- package/include/numkong/dots/sapphireamx.h +1 -1
- package/include/numkong/dots/serial.h +33 -11
- package/include/numkong/dots/skylake.h +28 -23
- package/include/numkong/dots/sme.h +172 -140
- package/include/numkong/dots/smebi32.h +14 -11
- package/include/numkong/dots/smef64.h +31 -26
- package/include/numkong/dots.h +41 -3
- package/include/numkong/each/serial.h +39 -0
- package/include/numkong/geospatial/haswell.h +1 -1
- package/include/numkong/geospatial/neon.h +1 -1
- package/include/numkong/geospatial/serial.h +15 -4
- package/include/numkong/geospatial/skylake.h +1 -1
- package/include/numkong/maxsim/serial.h +15 -0
- package/include/numkong/maxsim/sme.h +34 -33
- package/include/numkong/mesh/README.md +50 -44
- package/include/numkong/mesh/genoa.h +462 -0
- package/include/numkong/mesh/haswell.h +806 -933
- package/include/numkong/mesh/neon.h +871 -943
- package/include/numkong/mesh/neonbfdot.h +382 -522
- package/include/numkong/mesh/neonfhm.h +676 -0
- package/include/numkong/mesh/rvv.h +404 -319
- package/include/numkong/mesh/serial.h +225 -161
- package/include/numkong/mesh/skylake.h +1029 -1585
- package/include/numkong/mesh/v128relaxed.h +403 -377
- package/include/numkong/mesh.h +38 -0
- package/include/numkong/reduce/neon.h +29 -0
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +4 -4
- package/include/numkong/reduce/serial.h +15 -1
- package/include/numkong/reduce/sve.h +52 -0
- package/include/numkong/reduce.h +4 -0
- package/include/numkong/set/sve.h +6 -5
- package/include/numkong/sets/smebi32.h +35 -30
- package/include/numkong/sparse/serial.h +17 -2
- package/include/numkong/sparse/sve2.h +3 -2
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +98 -56
- package/include/numkong/spatial/serial.h +15 -0
- package/include/numkong/spatial/skylake.h +114 -54
- package/include/numkong/spatial/sve.h +7 -6
- package/include/numkong/spatial/svebfdot.h +7 -4
- package/include/numkong/spatial/svehalf.h +5 -4
- package/include/numkong/spatial/svesdot.h +9 -8
- package/include/numkong/spatial.h +0 -12
- package/include/numkong/spatials/graniteamx.h +301 -0
- package/include/numkong/spatials/serial.h +39 -0
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +391 -350
- package/include/numkong/spatials/smef64.h +79 -70
- package/include/numkong/spatials.h +54 -4
- package/include/numkong/tensor.hpp +107 -23
- package/include/numkong/types.h +59 -0
- package/javascript/dist/cjs/numkong.js +13 -0
- package/javascript/dist/esm/numkong.js +13 -0
- package/javascript/numkong.c +59 -14
- package/javascript/numkong.ts +13 -0
- package/package.json +7 -7
- package/probes/probe.js +2 -2
- package/wasm/numkong.wasm +0 -0
|
@@ -156,6 +156,36 @@ NK_INTERNAL void nk_dot_through_f32_update_skylake_(nk_dot_through_f32_state_sky
|
|
|
156
156
|
state->sum_f32x16 = _mm512_fmadd_ps(a.zmm_ps, b.zmm_ps, state->sum_f32x16);
|
|
157
157
|
}
|
|
158
158
|
|
|
159
|
+
/**
|
|
160
|
+
* @brief E5M2 byte-batched update: consumes 64 raw E5M2 bytes per call and widens inline.
|
|
161
|
+
* Two independent FMA chains (each 2-deep) merge into the single state accumulator at exit.
|
|
162
|
+
* Keeps register pressure at one __m512 across calls while breaking the FMA dep chain.
|
|
163
|
+
*/
|
|
164
|
+
NK_INTERNAL void nk_dot_e5m2x64_update_skylake_(nk_dot_through_f32_state_skylake_t_ *state, nk_b512_vec_t a_bytes,
|
|
165
|
+
nk_b512_vec_t b_bytes, nk_size_t depth_offset,
|
|
166
|
+
nk_size_t active_dimensions) {
|
|
167
|
+
nk_unused_(depth_offset);
|
|
168
|
+
nk_unused_(active_dimensions);
|
|
169
|
+
__m512i const zero_u8x64 = _mm512_setzero_si512();
|
|
170
|
+
__m512i a_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, a_bytes.zmm);
|
|
171
|
+
__m512i a_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, a_bytes.zmm);
|
|
172
|
+
__m512i b_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, b_bytes.zmm);
|
|
173
|
+
__m512i b_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, b_bytes.zmm);
|
|
174
|
+
__m512 a_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_even_f16x32));
|
|
175
|
+
__m512 a_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_even_f16x32, 1));
|
|
176
|
+
__m512 a_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_odd_f16x32));
|
|
177
|
+
__m512 a_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_odd_f16x32, 1));
|
|
178
|
+
__m512 b_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_even_f16x32));
|
|
179
|
+
__m512 b_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_even_f16x32, 1));
|
|
180
|
+
__m512 b_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_odd_f16x32));
|
|
181
|
+
__m512 b_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_odd_f16x32, 1));
|
|
182
|
+
__m512 first_chain_f32x16 = _mm512_mul_ps(a_first_f32x16, b_first_f32x16);
|
|
183
|
+
__m512 second_chain_f32x16 = _mm512_mul_ps(a_second_f32x16, b_second_f32x16);
|
|
184
|
+
first_chain_f32x16 = _mm512_fmadd_ps(a_third_f32x16, b_third_f32x16, first_chain_f32x16);
|
|
185
|
+
second_chain_f32x16 = _mm512_fmadd_ps(a_fourth_f32x16, b_fourth_f32x16, second_chain_f32x16);
|
|
186
|
+
state->sum_f32x16 = _mm512_add_ps(state->sum_f32x16, _mm512_add_ps(first_chain_f32x16, second_chain_f32x16));
|
|
187
|
+
}
|
|
188
|
+
|
|
159
189
|
/**
|
|
160
190
|
* @brief Finalizes 4x low-precision dot-products placing them into 4x consecutive 32-bit slots.
|
|
161
191
|
* @sa nk_dot_f16x16_update_skylake, nk_dot_bf16x16_update_skylake
|
|
@@ -543,7 +573,7 @@ NK_PUBLIC void nk_dot_e4m3_skylake(nk_e4m3_t const *a_scalars, nk_e4m3_t const *
|
|
|
543
573
|
|
|
544
574
|
nk_dot_e4m3_skylake_cycle:
|
|
545
575
|
if (count_scalars < 16) {
|
|
546
|
-
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, count_scalars);
|
|
576
|
+
__mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, (unsigned int)count_scalars);
|
|
547
577
|
a_e4m3_u8x16 = _mm_maskz_loadu_epi8(mask, a_scalars);
|
|
548
578
|
b_e4m3_u8x16 = _mm_maskz_loadu_epi8(mask, b_scalars);
|
|
549
579
|
count_scalars = 0;
|
|
@@ -563,27 +593,44 @@ nk_dot_e4m3_skylake_cycle:
|
|
|
563
593
|
|
|
564
594
|
NK_PUBLIC void nk_dot_e5m2_skylake(nk_e5m2_t const *a_scalars, nk_e5m2_t const *b_scalars, nk_size_t count_scalars,
|
|
565
595
|
nk_f32_t *result) {
|
|
566
|
-
|
|
567
|
-
|
|
596
|
+
// E5M2 shares F16 bias (15): vpunpck*bw against zero places each byte in the high half of a 16-bit lane, i.e. its F16 encoding,
|
|
597
|
+
// so we inline the widen rather than calling the helper 4× — same ops, cleaner code.
|
|
598
|
+
__m512 first_chain_f32x16 = _mm512_setzero_ps();
|
|
599
|
+
__m512 second_chain_f32x16 = _mm512_setzero_ps();
|
|
600
|
+
__m512i const zero_u8x64 = _mm512_setzero_si512();
|
|
601
|
+
__m512i a_u8x64, b_u8x64;
|
|
568
602
|
|
|
569
603
|
nk_dot_e5m2_skylake_cycle:
|
|
570
|
-
if (count_scalars <
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
604
|
+
if (count_scalars < 64) {
|
|
605
|
+
__mmask64 mask = _bzhi_u64(0xFFFFFFFFFFFFFFFFULL, (unsigned int)count_scalars);
|
|
606
|
+
a_u8x64 = _mm512_maskz_loadu_epi8(mask, a_scalars);
|
|
607
|
+
b_u8x64 = _mm512_maskz_loadu_epi8(mask, b_scalars);
|
|
574
608
|
count_scalars = 0;
|
|
575
609
|
}
|
|
576
610
|
else {
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
a_scalars +=
|
|
611
|
+
a_u8x64 = _mm512_loadu_si512((__m512i const *)a_scalars);
|
|
612
|
+
b_u8x64 = _mm512_loadu_si512((__m512i const *)b_scalars);
|
|
613
|
+
a_scalars += 64, b_scalars += 64, count_scalars -= 64;
|
|
580
614
|
}
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
615
|
+
__m512i a_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, a_u8x64);
|
|
616
|
+
__m512i a_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, a_u8x64);
|
|
617
|
+
__m512i b_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, b_u8x64);
|
|
618
|
+
__m512i b_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, b_u8x64);
|
|
619
|
+
__m512 a_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_even_f16x32));
|
|
620
|
+
__m512 a_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_even_f16x32, 1));
|
|
621
|
+
__m512 a_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_odd_f16x32));
|
|
622
|
+
__m512 a_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_odd_f16x32, 1));
|
|
623
|
+
__m512 b_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_even_f16x32));
|
|
624
|
+
__m512 b_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_even_f16x32, 1));
|
|
625
|
+
__m512 b_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_odd_f16x32));
|
|
626
|
+
__m512 b_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_odd_f16x32, 1));
|
|
627
|
+
first_chain_f32x16 = _mm512_fmadd_ps(a_first_f32x16, b_first_f32x16, first_chain_f32x16);
|
|
628
|
+
second_chain_f32x16 = _mm512_fmadd_ps(a_second_f32x16, b_second_f32x16, second_chain_f32x16);
|
|
629
|
+
first_chain_f32x16 = _mm512_fmadd_ps(a_third_f32x16, b_third_f32x16, first_chain_f32x16);
|
|
630
|
+
second_chain_f32x16 = _mm512_fmadd_ps(a_fourth_f32x16, b_fourth_f32x16, second_chain_f32x16);
|
|
584
631
|
if (count_scalars) goto nk_dot_e5m2_skylake_cycle;
|
|
585
632
|
|
|
586
|
-
*result = nk_reduce_add_f32x16_skylake_(
|
|
633
|
+
*result = nk_reduce_add_f32x16_skylake_(_mm512_add_ps(first_chain_f32x16, second_chain_f32x16));
|
|
587
634
|
}
|
|
588
635
|
|
|
589
636
|
NK_PUBLIC void nk_dot_e2m3_skylake(nk_e2m3_t const *a_scalars, nk_e2m3_t const *b_scalars, nk_size_t count_scalars,
|
|
@@ -39,6 +39,7 @@
|
|
|
39
39
|
#if NK_TARGET_SVE
|
|
40
40
|
|
|
41
41
|
#include "numkong/types.h" // `nk_f32_t`
|
|
42
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
|
|
42
43
|
#include "numkong/dot/serial.h" // `nk_u1x8_popcount_`
|
|
43
44
|
|
|
44
45
|
#if defined(__cplusplus)
|
|
@@ -110,7 +111,7 @@ NK_PUBLIC void nk_dot_f32_sve(nk_f32_t const *a_scalars, nk_f32_t const *b_scala
|
|
|
110
111
|
ab_f64x = svmla_f64_m(pred_odd_b64x, ab_f64x, svcvt_f64_f32_x(pred_odd_b64x, svext_f32(a_f32x, a_f32x, 1)),
|
|
111
112
|
svcvt_f64_f32_x(pred_odd_b64x, svext_f32(b_f32x, b_f32x, 1)));
|
|
112
113
|
}
|
|
113
|
-
*result =
|
|
114
|
+
*result = nk_svaddv_f64_(svptrue_b64(), ab_f64x);
|
|
114
115
|
}
|
|
115
116
|
|
|
116
117
|
NK_PUBLIC void nk_dot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
|
|
@@ -149,8 +150,8 @@ NK_PUBLIC void nk_dot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pair
|
|
|
149
150
|
ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_real_odd_f64x, b_imag_odd_f64x);
|
|
150
151
|
ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_imag_odd_f64x, b_real_odd_f64x);
|
|
151
152
|
}
|
|
152
|
-
results->real =
|
|
153
|
-
results->imag =
|
|
153
|
+
results->real = nk_svaddv_f64_(svptrue_b64(), ab_real_f64x);
|
|
154
|
+
results->imag = nk_svaddv_f64_(svptrue_b64(), ab_imag_f64x);
|
|
154
155
|
}
|
|
155
156
|
|
|
156
157
|
NK_PUBLIC void nk_vdot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
|
|
@@ -189,8 +190,8 @@ NK_PUBLIC void nk_vdot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pai
|
|
|
189
190
|
ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_real_odd_f64x, b_imag_odd_f64x);
|
|
190
191
|
ab_imag_f64x = svmls_f64_m(pred_odd_b64x, ab_imag_f64x, a_imag_odd_f64x, b_real_odd_f64x);
|
|
191
192
|
}
|
|
192
|
-
results->real =
|
|
193
|
-
results->imag =
|
|
193
|
+
results->real = nk_svaddv_f64_(svptrue_b64(), ab_real_f64x);
|
|
194
|
+
results->imag = nk_svaddv_f64_(svptrue_b64(), ab_imag_f64x);
|
|
194
195
|
}
|
|
195
196
|
|
|
196
197
|
NK_PUBLIC void nk_dot_f64_sve(nk_f64_t const *a_scalars, nk_f64_t const *b_scalars, nk_size_t count_scalars,
|
|
@@ -31,6 +31,7 @@
|
|
|
31
31
|
#if NK_TARGET_SVEBFDOT
|
|
32
32
|
|
|
33
33
|
#include "numkong/types.h"
|
|
34
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_f32_`
|
|
34
35
|
|
|
35
36
|
#if defined(__cplusplus)
|
|
36
37
|
extern "C" {
|
|
@@ -56,7 +57,7 @@ NK_PUBLIC void nk_dot_bf16_svebfdot(nk_bf16_t const *a_scalars, nk_bf16_t const
|
|
|
56
57
|
sum_f32x = svbfdot_f32(sum_f32x, a_bf16x, b_bf16x);
|
|
57
58
|
idx_scalars += svcnth();
|
|
58
59
|
} while (idx_scalars < count_scalars);
|
|
59
|
-
*result =
|
|
60
|
+
*result = nk_svaddv_f32_(svptrue_b32(), sum_f32x);
|
|
60
61
|
}
|
|
61
62
|
|
|
62
63
|
#if defined(__clang__)
|
|
@@ -33,6 +33,7 @@
|
|
|
33
33
|
#if NK_TARGET_SVEHALF
|
|
34
34
|
|
|
35
35
|
#include "numkong/types.h" // `nk_f16_t`
|
|
36
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_f32_`
|
|
36
37
|
#include "numkong/dot/serial.h" // `nk_u1x8_popcount_`
|
|
37
38
|
|
|
38
39
|
#if defined(__cplusplus)
|
|
@@ -67,7 +68,7 @@ NK_PUBLIC void nk_dot_f16_svehalf(nk_f16_t const *a_scalars, nk_f16_t const *b_s
|
|
|
67
68
|
|
|
68
69
|
idx_scalars += svcnth();
|
|
69
70
|
} while (idx_scalars < count_scalars);
|
|
70
|
-
*result =
|
|
71
|
+
*result = nk_svaddv_f32_(svptrue_b32(), ab_f32x);
|
|
71
72
|
}
|
|
72
73
|
|
|
73
74
|
NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
|
|
@@ -107,8 +108,8 @@ NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_
|
|
|
107
108
|
|
|
108
109
|
idx_scalars += svcnth();
|
|
109
110
|
} while (idx_scalars < count_pairs);
|
|
110
|
-
results->real =
|
|
111
|
-
results->imag =
|
|
111
|
+
results->real = nk_svaddv_f32_(svptrue_b32(), ab_real_f32x);
|
|
112
|
+
results->imag = nk_svaddv_f32_(svptrue_b32(), ab_imag_f32x);
|
|
112
113
|
}
|
|
113
114
|
|
|
114
115
|
NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
|
|
@@ -148,8 +149,8 @@ NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b
|
|
|
148
149
|
|
|
149
150
|
idx_scalars += svcnth();
|
|
150
151
|
} while (idx_scalars < count_pairs);
|
|
151
|
-
results->real =
|
|
152
|
-
results->imag =
|
|
152
|
+
results->real = nk_svaddv_f32_(svptrue_b32(), ab_real_f32x);
|
|
153
|
+
results->imag = nk_svaddv_f32_(svptrue_b32(), ab_imag_f32x);
|
|
153
154
|
}
|
|
154
155
|
|
|
155
156
|
#if defined(__clang__)
|
|
@@ -34,6 +34,7 @@
|
|
|
34
34
|
#if NK_TARGET_SVESDOT
|
|
35
35
|
|
|
36
36
|
#include "numkong/types.h"
|
|
37
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_s32_`, `nk_svaddv_u32_`
|
|
37
38
|
|
|
38
39
|
#if defined(__cplusplus)
|
|
39
40
|
extern "C" {
|
|
@@ -57,7 +58,7 @@ NK_PUBLIC void nk_dot_i8_svesdot(nk_i8_t const *a_scalars, nk_i8_t const *b_scal
|
|
|
57
58
|
sum_i32x = svdot_s32(sum_i32x, a_i8x, b_i8x);
|
|
58
59
|
idx_scalars += svcntb();
|
|
59
60
|
} while (idx_scalars < count_scalars);
|
|
60
|
-
*result = (nk_i32_t)
|
|
61
|
+
*result = (nk_i32_t)nk_svaddv_s32_(svptrue_b32(), sum_i32x);
|
|
61
62
|
}
|
|
62
63
|
|
|
63
64
|
NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a_scalars, nk_u8_t const *b_scalars, nk_size_t count_scalars,
|
|
@@ -71,7 +72,7 @@ NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a_scalars, nk_u8_t const *b_scal
|
|
|
71
72
|
sum_u32x = svdot_u32(sum_u32x, a_u8x, b_u8x);
|
|
72
73
|
idx_scalars += svcntb();
|
|
73
74
|
} while (idx_scalars < count_scalars);
|
|
74
|
-
*result = (nk_u32_t)
|
|
75
|
+
*result = (nk_u32_t)nk_svaddv_u32_(svptrue_b32(), sum_u32x);
|
|
75
76
|
}
|
|
76
77
|
|
|
77
78
|
#if defined(__clang__)
|
|
@@ -72,6 +72,8 @@ Int8 data is quad-interleaved: [a₀, a₁, a₂, a₃, a₀, a₁, a₂, a₃,
|
|
|
72
72
|
Tile configuration via `LDTILECFG` sets row counts and column byte-widths per tile — allows undersized tiles at matrix edges without masking.
|
|
73
73
|
Morton Z-curve ordering for tile traversal improves cache reuse when both A and B exceed L2.
|
|
74
74
|
This eliminates the explicit M×N×K loop nesting and register file pressure of vector ISAs — the entire dot-product reduction happens inside the tile instruction.
|
|
75
|
+
FP8 inputs on Sapphire AMX go through an on-the-fly E4M3/E5M2 → BF16 pack via the Ice Lake `VPERMI2W` LUT helpers — port-5-bound but the simplest correct route to feed `TDPBF16PS` tiles.
|
|
76
|
+
Granite Rapids adds `TDPFP16PS` (same tile shape, FP16 operands); the E5M2 variant widens inputs with a single `VPUNPCK*BW` against zero into FP16 tiles at pack time and then reuses the native FP16 compute loop. This keeps the intermediate at FP16 precision instead of truncating to BF16 as the Sapphire path does.
|
|
75
77
|
|
|
76
78
|
### SME Outer-Product Streaming
|
|
77
79
|
|