numkong 7.4.5 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/README.md +1 -0
  2. package/binding.gyp +99 -5
  3. package/c/dispatch_e5m2.c +23 -3
  4. package/c/dispatch_f16.c +23 -0
  5. package/c/numkong.c +0 -13
  6. package/include/numkong/attention/sme.h +34 -31
  7. package/include/numkong/capabilities.h +2 -15
  8. package/include/numkong/cast/README.md +3 -0
  9. package/include/numkong/cast/haswell.h +28 -64
  10. package/include/numkong/cast/neon.h +15 -0
  11. package/include/numkong/cast/serial.h +17 -0
  12. package/include/numkong/cast/skylake.h +67 -52
  13. package/include/numkong/cast.h +1 -0
  14. package/include/numkong/curved/smef64.h +82 -62
  15. package/include/numkong/dot/README.md +1 -0
  16. package/include/numkong/dot/haswell.h +92 -13
  17. package/include/numkong/dot/rvvbf16.h +1 -1
  18. package/include/numkong/dot/rvvhalf.h +1 -1
  19. package/include/numkong/dot/serial.h +15 -0
  20. package/include/numkong/dot/skylake.h +61 -14
  21. package/include/numkong/dot/sve.h +6 -5
  22. package/include/numkong/dot/svebfdot.h +2 -1
  23. package/include/numkong/dot/svehalf.h +6 -5
  24. package/include/numkong/dot/svesdot.h +3 -2
  25. package/include/numkong/dots/README.md +2 -0
  26. package/include/numkong/dots/graniteamx.h +1167 -0
  27. package/include/numkong/dots/haswell.h +28 -28
  28. package/include/numkong/dots/sapphireamx.h +1 -1
  29. package/include/numkong/dots/serial.h +33 -11
  30. package/include/numkong/dots/skylake.h +28 -23
  31. package/include/numkong/dots/sme.h +172 -140
  32. package/include/numkong/dots/smebi32.h +14 -11
  33. package/include/numkong/dots/smef64.h +31 -26
  34. package/include/numkong/dots.h +41 -3
  35. package/include/numkong/each/serial.h +39 -0
  36. package/include/numkong/geospatial/haswell.h +1 -1
  37. package/include/numkong/geospatial/neon.h +1 -1
  38. package/include/numkong/geospatial/serial.h +15 -4
  39. package/include/numkong/geospatial/skylake.h +1 -1
  40. package/include/numkong/maxsim/serial.h +15 -0
  41. package/include/numkong/maxsim/sme.h +34 -33
  42. package/include/numkong/mesh/README.md +50 -44
  43. package/include/numkong/mesh/genoa.h +462 -0
  44. package/include/numkong/mesh/haswell.h +806 -933
  45. package/include/numkong/mesh/neon.h +871 -943
  46. package/include/numkong/mesh/neonbfdot.h +382 -522
  47. package/include/numkong/mesh/neonfhm.h +676 -0
  48. package/include/numkong/mesh/rvv.h +404 -319
  49. package/include/numkong/mesh/serial.h +225 -161
  50. package/include/numkong/mesh/skylake.h +1029 -1585
  51. package/include/numkong/mesh/v128relaxed.h +403 -377
  52. package/include/numkong/mesh.h +38 -0
  53. package/include/numkong/reduce/neon.h +29 -0
  54. package/include/numkong/reduce/neonbfdot.h +2 -2
  55. package/include/numkong/reduce/neonfhm.h +4 -4
  56. package/include/numkong/reduce/serial.h +15 -1
  57. package/include/numkong/reduce/sve.h +52 -0
  58. package/include/numkong/reduce.h +4 -0
  59. package/include/numkong/set/sve.h +6 -5
  60. package/include/numkong/sets/smebi32.h +35 -30
  61. package/include/numkong/sparse/serial.h +17 -2
  62. package/include/numkong/sparse/sve2.h +3 -2
  63. package/include/numkong/spatial/genoa.h +0 -68
  64. package/include/numkong/spatial/haswell.h +98 -56
  65. package/include/numkong/spatial/serial.h +15 -0
  66. package/include/numkong/spatial/skylake.h +114 -54
  67. package/include/numkong/spatial/sve.h +7 -6
  68. package/include/numkong/spatial/svebfdot.h +7 -4
  69. package/include/numkong/spatial/svehalf.h +5 -4
  70. package/include/numkong/spatial/svesdot.h +9 -8
  71. package/include/numkong/spatial.h +0 -12
  72. package/include/numkong/spatials/graniteamx.h +301 -0
  73. package/include/numkong/spatials/serial.h +39 -0
  74. package/include/numkong/spatials/skylake.h +2 -2
  75. package/include/numkong/spatials/sme.h +391 -350
  76. package/include/numkong/spatials/smef64.h +79 -70
  77. package/include/numkong/spatials.h +54 -4
  78. package/include/numkong/tensor.hpp +107 -23
  79. package/include/numkong/types.h +59 -0
  80. package/javascript/dist/cjs/numkong.js +13 -0
  81. package/javascript/dist/esm/numkong.js +13 -0
  82. package/javascript/numkong.c +59 -14
  83. package/javascript/numkong.ts +13 -0
  84. package/package.json +7 -7
  85. package/probes/probe.js +2 -2
  86. package/wasm/numkong.wasm +0 -0
@@ -156,6 +156,36 @@ NK_INTERNAL void nk_dot_through_f32_update_skylake_(nk_dot_through_f32_state_sky
156
156
  state->sum_f32x16 = _mm512_fmadd_ps(a.zmm_ps, b.zmm_ps, state->sum_f32x16);
157
157
  }
158
158
 
159
+ /**
160
+ * @brief E5M2 byte-batched update: consumes 64 raw E5M2 bytes per call and widens inline.
161
+ * Two independent FMA chains (each 2-deep) merge into the single state accumulator at exit.
162
+ * Keeps register pressure at one __m512 across calls while breaking the FMA dep chain.
163
+ */
164
+ NK_INTERNAL void nk_dot_e5m2x64_update_skylake_(nk_dot_through_f32_state_skylake_t_ *state, nk_b512_vec_t a_bytes,
165
+ nk_b512_vec_t b_bytes, nk_size_t depth_offset,
166
+ nk_size_t active_dimensions) {
167
+ nk_unused_(depth_offset);
168
+ nk_unused_(active_dimensions);
169
+ __m512i const zero_u8x64 = _mm512_setzero_si512();
170
+ __m512i a_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, a_bytes.zmm);
171
+ __m512i a_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, a_bytes.zmm);
172
+ __m512i b_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, b_bytes.zmm);
173
+ __m512i b_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, b_bytes.zmm);
174
+ __m512 a_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_even_f16x32));
175
+ __m512 a_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_even_f16x32, 1));
176
+ __m512 a_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_odd_f16x32));
177
+ __m512 a_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_odd_f16x32, 1));
178
+ __m512 b_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_even_f16x32));
179
+ __m512 b_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_even_f16x32, 1));
180
+ __m512 b_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_odd_f16x32));
181
+ __m512 b_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_odd_f16x32, 1));
182
+ __m512 first_chain_f32x16 = _mm512_mul_ps(a_first_f32x16, b_first_f32x16);
183
+ __m512 second_chain_f32x16 = _mm512_mul_ps(a_second_f32x16, b_second_f32x16);
184
+ first_chain_f32x16 = _mm512_fmadd_ps(a_third_f32x16, b_third_f32x16, first_chain_f32x16);
185
+ second_chain_f32x16 = _mm512_fmadd_ps(a_fourth_f32x16, b_fourth_f32x16, second_chain_f32x16);
186
+ state->sum_f32x16 = _mm512_add_ps(state->sum_f32x16, _mm512_add_ps(first_chain_f32x16, second_chain_f32x16));
187
+ }
188
+
159
189
  /**
160
190
  * @brief Finalizes 4x low-precision dot-products placing them into 4x consecutive 32-bit slots.
161
191
  * @sa nk_dot_f16x16_udpate_skylake, nk_dot_bf16x16_udpate_skylake
@@ -543,7 +573,7 @@ NK_PUBLIC void nk_dot_e4m3_skylake(nk_e4m3_t const *a_scalars, nk_e4m3_t const *
543
573
 
544
574
  nk_dot_e4m3_skylake_cycle:
545
575
  if (count_scalars < 16) {
546
- __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, count_scalars);
576
+ __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, (unsigned int)count_scalars);
547
577
  a_e4m3_u8x16 = _mm_maskz_loadu_epi8(mask, a_scalars);
548
578
  b_e4m3_u8x16 = _mm_maskz_loadu_epi8(mask, b_scalars);
549
579
  count_scalars = 0;
@@ -563,27 +593,44 @@ nk_dot_e4m3_skylake_cycle:
563
593
 
564
594
  NK_PUBLIC void nk_dot_e5m2_skylake(nk_e5m2_t const *a_scalars, nk_e5m2_t const *b_scalars, nk_size_t count_scalars,
565
595
  nk_f32_t *result) {
566
- __m128i a_e5m2_u8x16, b_e5m2_u8x16;
567
- __m512 sum_f32x16 = _mm512_setzero_ps();
596
+ // E5M2 shares F16 bias (15): vpunpck*bw against zero places the byte as F16 encoding,
597
+ // so we inline the widen rather than calling the helper 4× — same ops, cleaner code.
598
+ __m512 first_chain_f32x16 = _mm512_setzero_ps();
599
+ __m512 second_chain_f32x16 = _mm512_setzero_ps();
600
+ __m512i const zero_u8x64 = _mm512_setzero_si512();
601
+ __m512i a_u8x64, b_u8x64;
568
602
 
569
603
  nk_dot_e5m2_skylake_cycle:
570
- if (count_scalars < 16) {
571
- __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFF, count_scalars);
572
- a_e5m2_u8x16 = _mm_maskz_loadu_epi8(mask, a_scalars);
573
- b_e5m2_u8x16 = _mm_maskz_loadu_epi8(mask, b_scalars);
604
+ if (count_scalars < 64) {
605
+ __mmask64 mask = _bzhi_u64(0xFFFFFFFFFFFFFFFFULL, (unsigned int)count_scalars);
606
+ a_u8x64 = _mm512_maskz_loadu_epi8(mask, a_scalars);
607
+ b_u8x64 = _mm512_maskz_loadu_epi8(mask, b_scalars);
574
608
  count_scalars = 0;
575
609
  }
576
610
  else {
577
- a_e5m2_u8x16 = _mm_loadu_si128((__m128i const *)a_scalars);
578
- b_e5m2_u8x16 = _mm_loadu_si128((__m128i const *)b_scalars);
579
- a_scalars += 16, b_scalars += 16, count_scalars -= 16;
611
+ a_u8x64 = _mm512_loadu_si512((__m512i const *)a_scalars);
612
+ b_u8x64 = _mm512_loadu_si512((__m512i const *)b_scalars);
613
+ a_scalars += 64, b_scalars += 64, count_scalars -= 64;
580
614
  }
581
- __m512 a_f32x16 = nk_e5m2x16_to_f32x16_skylake_(a_e5m2_u8x16);
582
- __m512 b_f32x16 = nk_e5m2x16_to_f32x16_skylake_(b_e5m2_u8x16);
583
- sum_f32x16 = _mm512_fmadd_ps(a_f32x16, b_f32x16, sum_f32x16);
615
+ __m512i a_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, a_u8x64);
616
+ __m512i a_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, a_u8x64);
617
+ __m512i b_even_f16x32 = _mm512_unpacklo_epi8(zero_u8x64, b_u8x64);
618
+ __m512i b_odd_f16x32 = _mm512_unpackhi_epi8(zero_u8x64, b_u8x64);
619
+ __m512 a_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_even_f16x32));
620
+ __m512 a_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_even_f16x32, 1));
621
+ __m512 a_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(a_odd_f16x32));
622
+ __m512 a_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a_odd_f16x32, 1));
623
+ __m512 b_first_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_even_f16x32));
624
+ __m512 b_second_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_even_f16x32, 1));
625
+ __m512 b_third_f32x16 = _mm512_cvtph_ps(_mm512_castsi512_si256(b_odd_f16x32));
626
+ __m512 b_fourth_f32x16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b_odd_f16x32, 1));
627
+ first_chain_f32x16 = _mm512_fmadd_ps(a_first_f32x16, b_first_f32x16, first_chain_f32x16);
628
+ second_chain_f32x16 = _mm512_fmadd_ps(a_second_f32x16, b_second_f32x16, second_chain_f32x16);
629
+ first_chain_f32x16 = _mm512_fmadd_ps(a_third_f32x16, b_third_f32x16, first_chain_f32x16);
630
+ second_chain_f32x16 = _mm512_fmadd_ps(a_fourth_f32x16, b_fourth_f32x16, second_chain_f32x16);
584
631
  if (count_scalars) goto nk_dot_e5m2_skylake_cycle;
585
632
 
586
- *result = nk_reduce_add_f32x16_skylake_(sum_f32x16);
633
+ *result = nk_reduce_add_f32x16_skylake_(_mm512_add_ps(first_chain_f32x16, second_chain_f32x16));
587
634
  }
588
635
 
589
636
  NK_PUBLIC void nk_dot_e2m3_skylake(nk_e2m3_t const *a_scalars, nk_e2m3_t const *b_scalars, nk_size_t count_scalars,
@@ -39,6 +39,7 @@
39
39
  #if NK_TARGET_SVE
40
40
 
41
41
  #include "numkong/types.h" // `nk_f32_t`
42
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
42
43
  #include "numkong/dot/serial.h" // `nk_u1x8_popcount_`
43
44
 
44
45
  #if defined(__cplusplus)
@@ -110,7 +111,7 @@ NK_PUBLIC void nk_dot_f32_sve(nk_f32_t const *a_scalars, nk_f32_t const *b_scala
110
111
  ab_f64x = svmla_f64_m(pred_odd_b64x, ab_f64x, svcvt_f64_f32_x(pred_odd_b64x, svext_f32(a_f32x, a_f32x, 1)),
111
112
  svcvt_f64_f32_x(pred_odd_b64x, svext_f32(b_f32x, b_f32x, 1)));
112
113
  }
113
- *result = svaddv_f64(svptrue_b64(), ab_f64x);
114
+ *result = nk_svaddv_f64_(svptrue_b64(), ab_f64x);
114
115
  }
115
116
 
116
117
  NK_PUBLIC void nk_dot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
@@ -149,8 +150,8 @@ NK_PUBLIC void nk_dot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pair
149
150
  ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_real_odd_f64x, b_imag_odd_f64x);
150
151
  ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_imag_odd_f64x, b_real_odd_f64x);
151
152
  }
152
- results->real = svaddv_f64(svptrue_b64(), ab_real_f64x);
153
- results->imag = svaddv_f64(svptrue_b64(), ab_imag_f64x);
153
+ results->real = nk_svaddv_f64_(svptrue_b64(), ab_real_f64x);
154
+ results->imag = nk_svaddv_f64_(svptrue_b64(), ab_imag_f64x);
154
155
  }
155
156
 
156
157
  NK_PUBLIC void nk_vdot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
@@ -189,8 +190,8 @@ NK_PUBLIC void nk_vdot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pai
189
190
  ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_real_odd_f64x, b_imag_odd_f64x);
190
191
  ab_imag_f64x = svmls_f64_m(pred_odd_b64x, ab_imag_f64x, a_imag_odd_f64x, b_real_odd_f64x);
191
192
  }
192
- results->real = svaddv_f64(svptrue_b64(), ab_real_f64x);
193
- results->imag = svaddv_f64(svptrue_b64(), ab_imag_f64x);
193
+ results->real = nk_svaddv_f64_(svptrue_b64(), ab_real_f64x);
194
+ results->imag = nk_svaddv_f64_(svptrue_b64(), ab_imag_f64x);
194
195
  }
195
196
 
196
197
  NK_PUBLIC void nk_dot_f64_sve(nk_f64_t const *a_scalars, nk_f64_t const *b_scalars, nk_size_t count_scalars,
@@ -31,6 +31,7 @@
31
31
  #if NK_TARGET_SVEBFDOT
32
32
 
33
33
  #include "numkong/types.h"
34
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
34
35
 
35
36
  #if defined(__cplusplus)
36
37
  extern "C" {
@@ -56,7 +57,7 @@ NK_PUBLIC void nk_dot_bf16_svebfdot(nk_bf16_t const *a_scalars, nk_bf16_t const
56
57
  sum_f32x = svbfdot_f32(sum_f32x, a_bf16x, b_bf16x);
57
58
  idx_scalars += svcnth();
58
59
  } while (idx_scalars < count_scalars);
59
- *result = svaddv_f32(svptrue_b32(), sum_f32x);
60
+ *result = nk_svaddv_f32_(svptrue_b32(), sum_f32x);
60
61
  }
61
62
 
62
63
  #if defined(__clang__)
@@ -33,6 +33,7 @@
33
33
  #if NK_TARGET_SVEHALF
34
34
 
35
35
  #include "numkong/types.h" // `nk_f16_t`
36
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
36
37
  #include "numkong/dot/serial.h" // `nk_u1x8_popcount_`
37
38
 
38
39
  #if defined(__cplusplus)
@@ -67,7 +68,7 @@ NK_PUBLIC void nk_dot_f16_svehalf(nk_f16_t const *a_scalars, nk_f16_t const *b_s
67
68
 
68
69
  idx_scalars += svcnth();
69
70
  } while (idx_scalars < count_scalars);
70
- *result = svaddv_f32(svptrue_b32(), ab_f32x);
71
+ *result = nk_svaddv_f32_(svptrue_b32(), ab_f32x);
71
72
  }
72
73
 
73
74
  NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
@@ -107,8 +108,8 @@ NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_
107
108
 
108
109
  idx_scalars += svcnth();
109
110
  } while (idx_scalars < count_pairs);
110
- results->real = svaddv_f32(svptrue_b32(), ab_real_f32x);
111
- results->imag = svaddv_f32(svptrue_b32(), ab_imag_f32x);
111
+ results->real = nk_svaddv_f32_(svptrue_b32(), ab_real_f32x);
112
+ results->imag = nk_svaddv_f32_(svptrue_b32(), ab_imag_f32x);
112
113
  }
113
114
 
114
115
  NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
@@ -148,8 +149,8 @@ NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b
148
149
 
149
150
  idx_scalars += svcnth();
150
151
  } while (idx_scalars < count_pairs);
151
- results->real = svaddv_f32(svptrue_b32(), ab_real_f32x);
152
- results->imag = svaddv_f32(svptrue_b32(), ab_imag_f32x);
152
+ results->real = nk_svaddv_f32_(svptrue_b32(), ab_real_f32x);
153
+ results->imag = nk_svaddv_f32_(svptrue_b32(), ab_imag_f32x);
153
154
  }
154
155
 
155
156
  #if defined(__clang__)
@@ -34,6 +34,7 @@
34
34
  #if NK_TARGET_SVESDOT
35
35
 
36
36
  #include "numkong/types.h"
37
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
37
38
 
38
39
  #if defined(__cplusplus)
39
40
  extern "C" {
@@ -57,7 +58,7 @@ NK_PUBLIC void nk_dot_i8_svesdot(nk_i8_t const *a_scalars, nk_i8_t const *b_scal
57
58
  sum_i32x = svdot_s32(sum_i32x, a_i8x, b_i8x);
58
59
  idx_scalars += svcntb();
59
60
  } while (idx_scalars < count_scalars);
60
- *result = (nk_i32_t)svaddv_s32(svptrue_b32(), sum_i32x);
61
+ *result = (nk_i32_t)nk_svaddv_s32_(svptrue_b32(), sum_i32x);
61
62
  }
62
63
 
63
64
  NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a_scalars, nk_u8_t const *b_scalars, nk_size_t count_scalars,
@@ -71,7 +72,7 @@ NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a_scalars, nk_u8_t const *b_scal
71
72
  sum_u32x = svdot_u32(sum_u32x, a_u8x, b_u8x);
72
73
  idx_scalars += svcntb();
73
74
  } while (idx_scalars < count_scalars);
74
- *result = (nk_u32_t)svaddv_u32(svptrue_b32(), sum_u32x);
75
+ *result = (nk_u32_t)nk_svaddv_u32_(svptrue_b32(), sum_u32x);
75
76
  }
76
77
 
77
78
  #if defined(__clang__)
@@ -72,6 +72,8 @@ Int8 data is quad-interleaved: [a₀, a₁, a₂, a₃, a₀, a₁, a₂, a₃,
72
72
  Tile configuration via `LDTILECFG` sets row counts and column byte-widths per tile — allows undersized tiles at matrix edges without masking.
73
73
  Morton Z-curve ordering for tile traversal improves cache reuse when both A and B exceed L2.
74
74
  This eliminates the explicit M×N×K loop nesting and register file pressure of vector ISAs — the entire dot-product reduction happens inside the tile instruction.
75
+ FP8 inputs on Sapphire AMX go through an on-the-fly E4M3/E5M2 → BF16 pack via the Ice Lake `VPERMI2W` LUT helpers — port-5-bound but the simplest correct route to feed `TDPBF16PS` tiles.
76
+ Granite Rapids adds `TDPFP16PS` (same tile shape, FP16 operands); the E5M2 variant widens inputs with a single `VPUNPCK*BW` against zero into FP16 tiles at pack time and then reuses the native FP16 compute loop — keeps the intermediate at FP16 precision instead of truncating to BF16 like the Sapphire path.
75
77
 
76
78
  ### SME Outer-Product Streaming
77
79