npm - numkong - Versions diffs - 7.0.0 → 7.4.2 - Mend

numkong 7.0.0 → 7.4.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (315) hide show

package/README.md +197 -124
package/binding.gyp +34 -484
package/c/dispatch_bf16.c +59 -1
package/c/dispatch_e2m3.c +41 -8
package/c/dispatch_e3m2.c +49 -8
package/c/dispatch_e4m3.c +51 -9
package/c/dispatch_e5m2.c +45 -1
package/c/dispatch_f16.c +79 -26
package/c/dispatch_f16c.c +5 -5
package/c/dispatch_f32.c +56 -0
package/c/dispatch_f64.c +52 -0
package/c/dispatch_i4.c +3 -0
package/c/dispatch_i8.c +62 -3
package/c/dispatch_other.c +18 -0
package/c/dispatch_u1.c +54 -9
package/c/dispatch_u4.c +3 -0
package/c/dispatch_u8.c +64 -3
package/c/numkong.c +3 -0
package/include/README.md +79 -9
package/include/numkong/attention/sapphireamx.h +278 -276
package/include/numkong/attention/sme.h +983 -977
package/include/numkong/attention.h +1 -1
package/include/numkong/capabilities.h +289 -94
package/include/numkong/cast/README.md +40 -40
package/include/numkong/cast/diamond.h +64 -0
package/include/numkong/cast/haswell.h +42 -194
package/include/numkong/cast/icelake.h +42 -37
package/include/numkong/cast/loongsonasx.h +252 -0
package/include/numkong/cast/neon.h +216 -249
package/include/numkong/cast/powervsx.h +449 -0
package/include/numkong/cast/rvv.h +223 -274
package/include/numkong/cast/sapphire.h +18 -18
package/include/numkong/cast/serial.h +1018 -944
package/include/numkong/cast/skylake.h +82 -23
package/include/numkong/cast/v128relaxed.h +462 -105
package/include/numkong/cast.h +24 -0
package/include/numkong/cast.hpp +44 -0
package/include/numkong/curved/README.md +17 -17
package/include/numkong/curved/neon.h +131 -7
package/include/numkong/curved/neonbfdot.h +6 -7
package/include/numkong/curved/rvv.h +26 -26
package/include/numkong/curved/smef64.h +186 -182
package/include/numkong/curved.h +14 -18
package/include/numkong/dot/README.md +154 -137
package/include/numkong/dot/alder.h +43 -43
package/include/numkong/dot/diamond.h +158 -0
package/include/numkong/dot/genoa.h +4 -30
package/include/numkong/dot/haswell.h +215 -180
package/include/numkong/dot/icelake.h +190 -76
package/include/numkong/dot/loongsonasx.h +671 -0
package/include/numkong/dot/neon.h +124 -73
package/include/numkong/dot/neonbfdot.h +11 -12
package/include/numkong/dot/neonfhm.h +44 -46
package/include/numkong/dot/neonfp8.h +323 -0
package/include/numkong/dot/neonsdot.h +190 -76
package/include/numkong/dot/powervsx.h +752 -0
package/include/numkong/dot/rvv.h +92 -84
package/include/numkong/dot/rvvbf16.h +12 -12
package/include/numkong/dot/rvvhalf.h +12 -12
package/include/numkong/dot/sapphire.h +4 -4
package/include/numkong/dot/serial.h +66 -30
package/include/numkong/dot/sierra.h +31 -31
package/include/numkong/dot/skylake.h +142 -110
package/include/numkong/dot/sve.h +217 -177
package/include/numkong/dot/svebfdot.h +10 -10
package/include/numkong/dot/svehalf.h +85 -41
package/include/numkong/dot/svesdot.h +89 -0
package/include/numkong/dot/v128relaxed.h +124 -89
package/include/numkong/dot.h +114 -48
package/include/numkong/dots/README.md +203 -203
package/include/numkong/dots/alder.h +12 -9
package/include/numkong/dots/diamond.h +86 -0
package/include/numkong/dots/genoa.h +10 -4
package/include/numkong/dots/haswell.h +63 -48
package/include/numkong/dots/icelake.h +27 -18
package/include/numkong/dots/loongsonasx.h +176 -0
package/include/numkong/dots/neon.h +14 -11
package/include/numkong/dots/neonbfdot.h +4 -3
package/include/numkong/dots/neonfhm.h +11 -9
package/include/numkong/dots/neonfp8.h +99 -0
package/include/numkong/dots/neonsdot.h +48 -12
package/include/numkong/dots/powervsx.h +194 -0
package/include/numkong/dots/rvv.h +451 -344
package/include/numkong/dots/sapphireamx.h +1028 -984
package/include/numkong/dots/serial.h +213 -197
package/include/numkong/dots/sierra.h +10 -7
package/include/numkong/dots/skylake.h +47 -36
package/include/numkong/dots/sme.h +2001 -2364
package/include/numkong/dots/smebi32.h +175 -162
package/include/numkong/dots/smef64.h +328 -323
package/include/numkong/dots/v128relaxed.h +64 -41
package/include/numkong/dots.h +573 -293
package/include/numkong/dots.hpp +45 -43
package/include/numkong/each/README.md +133 -137
package/include/numkong/each/haswell.h +6 -6
package/include/numkong/each/icelake.h +7 -7
package/include/numkong/each/neon.h +76 -42
package/include/numkong/each/neonbfdot.h +11 -12
package/include/numkong/each/neonhalf.h +24 -116
package/include/numkong/each/rvv.h +28 -28
package/include/numkong/each/sapphire.h +27 -161
package/include/numkong/each/serial.h +6 -6
package/include/numkong/each/skylake.h +7 -7
package/include/numkong/each/v128relaxed.h +562 -0
package/include/numkong/each.h +148 -62
package/include/numkong/each.hpp +2 -2
package/include/numkong/geospatial/README.md +18 -18
package/include/numkong/geospatial/haswell.h +365 -325
package/include/numkong/geospatial/neon.h +350 -306
package/include/numkong/geospatial/rvv.h +4 -4
package/include/numkong/geospatial/skylake.h +376 -340
package/include/numkong/geospatial/v128relaxed.h +366 -327
package/include/numkong/geospatial.h +17 -17
package/include/numkong/matrix.hpp +4 -4
package/include/numkong/maxsim/README.md +14 -14
package/include/numkong/maxsim/alder.h +6 -6
package/include/numkong/maxsim/genoa.h +4 -4
package/include/numkong/maxsim/haswell.h +6 -6
package/include/numkong/maxsim/icelake.h +18 -18
package/include/numkong/maxsim/neonsdot.h +21 -21
package/include/numkong/maxsim/sapphireamx.h +14 -14
package/include/numkong/maxsim/serial.h +6 -6
package/include/numkong/maxsim/sme.h +221 -196
package/include/numkong/maxsim/v128relaxed.h +6 -6
package/include/numkong/mesh/README.md +62 -56
package/include/numkong/mesh/haswell.h +339 -464
package/include/numkong/mesh/neon.h +1100 -519
package/include/numkong/mesh/neonbfdot.h +36 -68
package/include/numkong/mesh/rvv.h +530 -435
package/include/numkong/mesh/serial.h +75 -91
package/include/numkong/mesh/skylake.h +1627 -302
package/include/numkong/mesh/v128relaxed.h +443 -330
package/include/numkong/mesh.h +63 -49
package/include/numkong/mesh.hpp +4 -4
package/include/numkong/numkong.h +3 -3
package/include/numkong/numkong.hpp +1 -0
package/include/numkong/probability/README.md +23 -19
package/include/numkong/probability/neon.h +82 -52
package/include/numkong/probability/rvv.h +28 -23
package/include/numkong/probability/serial.h +51 -39
package/include/numkong/probability.h +20 -23
package/include/numkong/random.h +1 -1
package/include/numkong/reduce/README.md +143 -138
package/include/numkong/reduce/alder.h +81 -77
package/include/numkong/reduce/haswell.h +222 -220
package/include/numkong/reduce/neon.h +629 -519
package/include/numkong/reduce/neonbfdot.h +7 -218
package/include/numkong/reduce/neonfhm.h +9 -381
package/include/numkong/reduce/neonsdot.h +9 -9
package/include/numkong/reduce/rvv.h +928 -802
package/include/numkong/reduce/serial.h +23 -27
package/include/numkong/reduce/sierra.h +20 -20
package/include/numkong/reduce/skylake.h +326 -324
package/include/numkong/reduce/v128relaxed.h +52 -52
package/include/numkong/reduce.h +4 -23
package/include/numkong/reduce.hpp +156 -11
package/include/numkong/scalar/README.md +6 -6
package/include/numkong/scalar/haswell.h +26 -17
package/include/numkong/scalar/loongsonasx.h +74 -0
package/include/numkong/scalar/neon.h +9 -9
package/include/numkong/scalar/powervsx.h +96 -0
package/include/numkong/scalar/rvv.h +2 -2
package/include/numkong/scalar/sapphire.h +21 -10
package/include/numkong/scalar/serial.h +21 -21
package/include/numkong/scalar.h +13 -0
package/include/numkong/set/README.md +28 -28
package/include/numkong/set/haswell.h +12 -12
package/include/numkong/set/icelake.h +14 -14
package/include/numkong/set/loongsonasx.h +181 -0
package/include/numkong/set/neon.h +17 -18
package/include/numkong/set/powervsx.h +326 -0
package/include/numkong/set/rvv.h +4 -4
package/include/numkong/set/serial.h +6 -6
package/include/numkong/set/sve.h +60 -59
package/include/numkong/set/v128relaxed.h +6 -6
package/include/numkong/set.h +21 -7
package/include/numkong/sets/README.md +26 -26
package/include/numkong/sets/loongsonasx.h +52 -0
package/include/numkong/sets/powervsx.h +65 -0
package/include/numkong/sets/smebi32.h +395 -364
package/include/numkong/sets.h +83 -40
package/include/numkong/sparse/README.md +4 -4
package/include/numkong/sparse/icelake.h +101 -101
package/include/numkong/sparse/serial.h +1 -1
package/include/numkong/sparse/sve2.h +137 -141
package/include/numkong/sparse/turin.h +12 -12
package/include/numkong/sparse.h +10 -10
package/include/numkong/spatial/README.md +230 -226
package/include/numkong/spatial/alder.h +113 -116
package/include/numkong/spatial/diamond.h +240 -0
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +74 -55
package/include/numkong/spatial/icelake.h +539 -58
package/include/numkong/spatial/loongsonasx.h +483 -0
package/include/numkong/spatial/neon.h +125 -52
package/include/numkong/spatial/neonbfdot.h +8 -9
package/include/numkong/spatial/neonfp8.h +258 -0
package/include/numkong/spatial/neonsdot.h +180 -12
package/include/numkong/spatial/powervsx.h +738 -0
package/include/numkong/spatial/rvv.h +146 -139
package/include/numkong/spatial/rvvbf16.h +17 -12
package/include/numkong/spatial/rvvhalf.h +13 -10
package/include/numkong/spatial/serial.h +13 -12
package/include/numkong/spatial/sierra.h +232 -39
package/include/numkong/spatial/skylake.h +73 -74
package/include/numkong/spatial/sve.h +93 -72
package/include/numkong/spatial/svebfdot.h +29 -29
package/include/numkong/spatial/svehalf.h +52 -26
package/include/numkong/spatial/svesdot.h +142 -0
package/include/numkong/spatial/v128relaxed.h +293 -41
package/include/numkong/spatial.h +338 -82
package/include/numkong/spatials/README.md +194 -194
package/include/numkong/spatials/diamond.h +82 -0
package/include/numkong/spatials/haswell.h +2 -2
package/include/numkong/spatials/loongsonasx.h +153 -0
package/include/numkong/spatials/neonfp8.h +111 -0
package/include/numkong/spatials/neonsdot.h +34 -0
package/include/numkong/spatials/powervsx.h +153 -0
package/include/numkong/spatials/rvv.h +259 -243
package/include/numkong/spatials/sapphireamx.h +173 -173
package/include/numkong/spatials/serial.h +2 -2
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +590 -605
package/include/numkong/spatials/smef64.h +139 -130
package/include/numkong/spatials/v128relaxed.h +2 -2
package/include/numkong/spatials.h +820 -500
package/include/numkong/spatials.hpp +49 -48
package/include/numkong/tensor.hpp +406 -17
package/include/numkong/trigonometry/README.md +19 -19
package/include/numkong/trigonometry/haswell.h +402 -401
package/include/numkong/trigonometry/neon.h +386 -387
package/include/numkong/trigonometry/rvv.h +52 -51
package/include/numkong/trigonometry/serial.h +13 -13
package/include/numkong/trigonometry/skylake.h +373 -369
package/include/numkong/trigonometry/v128relaxed.h +375 -374
package/include/numkong/trigonometry.h +13 -13
package/include/numkong/trigonometry.hpp +2 -2
package/include/numkong/types.h +287 -49
package/include/numkong/types.hpp +436 -12
package/include/numkong/vector.hpp +82 -14
package/javascript/dist/cjs/numkong-wasm.js +6 -12
package/javascript/dist/cjs/numkong.d.ts +7 -1
package/javascript/dist/cjs/numkong.js +37 -11
package/javascript/dist/cjs/types.d.ts +9 -0
package/javascript/dist/cjs/types.js +96 -0
package/javascript/dist/esm/numkong-browser.d.ts +14 -0
package/javascript/dist/esm/numkong-browser.js +23 -0
package/javascript/dist/esm/numkong-wasm.js +6 -12
package/javascript/dist/esm/numkong.d.ts +7 -1
package/javascript/dist/esm/numkong.js +37 -11
package/javascript/dist/esm/types.d.ts +9 -0
package/javascript/dist/esm/types.js +96 -0
package/javascript/node-gyp-build.d.ts +4 -1
package/javascript/numkong-browser.ts +40 -0
package/javascript/numkong-wasm.ts +7 -13
package/javascript/numkong.c +5 -26
package/javascript/numkong.ts +36 -11
package/javascript/tsconfig-base.json +1 -0
package/javascript/tsconfig-cjs.json +6 -1
package/javascript/types.ts +110 -0
package/numkong.gypi +101 -0
package/package.json +34 -13
package/probes/arm_neon.c +8 -0
package/probes/arm_neon_bfdot.c +9 -0
package/probes/arm_neon_fhm.c +9 -0
package/probes/arm_neon_half.c +8 -0
package/probes/arm_neon_sdot.c +9 -0
package/probes/arm_neonfp8.c +9 -0
package/probes/arm_sme.c +16 -0
package/probes/arm_sme2.c +16 -0
package/probes/arm_sme2p1.c +16 -0
package/probes/arm_sme_bf16.c +16 -0
package/probes/arm_sme_bi32.c +16 -0
package/probes/arm_sme_f64.c +16 -0
package/probes/arm_sme_fa64.c +14 -0
package/probes/arm_sme_half.c +16 -0
package/probes/arm_sme_lut2.c +15 -0
package/probes/arm_sve.c +18 -0
package/probes/arm_sve2.c +20 -0
package/probes/arm_sve2p1.c +18 -0
package/probes/arm_sve_bfdot.c +20 -0
package/probes/arm_sve_half.c +18 -0
package/probes/arm_sve_sdot.c +21 -0
package/probes/loongarch_lasx.c +12 -0
package/probes/power_vsx.c +12 -0
package/probes/probe.js +127 -0
package/probes/riscv_rvv.c +14 -0
package/probes/riscv_rvv_bb.c +15 -0
package/probes/riscv_rvv_bf16.c +17 -0
package/probes/riscv_rvv_half.c +14 -0
package/probes/wasm_v128relaxed.c +11 -0
package/probes/x86_alder.c +17 -0
package/probes/x86_diamond.c +17 -0
package/probes/x86_genoa.c +17 -0
package/probes/x86_graniteamx.c +19 -0
package/probes/x86_haswell.c +11 -0
package/probes/x86_icelake.c +17 -0
package/probes/x86_sapphire.c +16 -0
package/probes/x86_sapphireamx.c +18 -0
package/probes/x86_sierra.c +17 -0
package/probes/x86_skylake.c +15 -0
package/probes/x86_turin.c +17 -0
package/wasm/numkong-emscripten.js +2 -0
package/wasm/numkong.d.ts +14 -0
package/wasm/numkong.js +1124 -0
package/wasm/numkong.wasm +0 -0
package/include/numkong/curved/neonhalf.h +0 -212
package/include/numkong/dot/neonhalf.h +0 -198
package/include/numkong/dots/neonhalf.h +0 -57
package/include/numkong/mesh/neonhalf.h +0 -616
package/include/numkong/reduce/neonhalf.h +0 -157
package/include/numkong/spatial/neonhalf.h +0 -118
package/include/numkong/spatial/sapphire.h +0 -343
package/include/numkong/spatials/neonhalf.h +0 -58
package/javascript/README.md +0 -246

package/include/numkong/sparse/sve2.h CHANGED Viewed

@@ -60,31 +60,31 @@ NK_PUBLIC void nk_sparse_intersect_u16_sve2( //
     while (a_idx < a_length && b_idx < b_length) {
         // Load `a_member` and broadcast it, load `b_members_vec` from memory
-        svbool_t a_progress_u16x = svwhilelt_b16_u64(a_idx, a_length);
-        svbool_t b_progress_u16x = svwhilelt_b16_u64(b_idx, b_length);
-        svuint16_t a_u16x = svld1_u16(a_progress_u16x, a + a_idx);
-        svuint16_t b_u16x = svld1_u16(b_progress_u16x, b + b_idx);
+        svbool_t a_progress_b16x = svwhilelt_b16_u64(a_idx, a_length);
+        svbool_t b_progress_b16x = svwhilelt_b16_u64(b_idx, b_length);
+        svuint16_t a_u16x = svld1_u16(a_progress_b16x, a + a_idx);
+        svuint16_t b_u16x = svld1_u16(b_progress_b16x, b + b_idx);
         // Intersecting registers with `svmatch_u16` involves a lot of shuffling
         // and comparisons, so we want to avoid it if the slices don't overlap at all..
         nk_u16_t a_min;
-        nk_u16_t a_max = svlastb(a_progress_u16x, a_u16x);
+        nk_u16_t a_max = svlastb(a_progress_b16x, a_u16x);
         nk_u16_t b_min = svlasta(svpfalse_b(), b_u16x);
-        nk_u16_t b_max = svlastb(b_progress_u16x, b_u16x);
+        nk_u16_t b_max = svlastb(b_progress_b16x, b_u16x);
         // If the slices don't overlap, advance the appropriate pointer
         while (a_max < b_min && (a_idx + register_size) <= a_length) {
             a_idx += register_size;
-            a_progress_u16x = svwhilelt_b16_u64(a_idx, a_length);
-            a_u16x = svld1_u16(a_progress_u16x, a + a_idx);
-            a_max = svlastb(a_progress_u16x, a_u16x);
+            a_progress_b16x = svwhilelt_b16_u64(a_idx, a_length);
+            a_u16x = svld1_u16(a_progress_b16x, a + a_idx);
+            a_max = svlastb(a_progress_b16x, a_u16x);
         }
         a_min = svlasta(svpfalse_b(), a_u16x);
         while (b_max < a_min && (b_idx + register_size) <= b_length) {
             b_idx += register_size;
-            b_progress_u16x = svwhilelt_b16_u64(b_idx, b_length);
-            b_u16x = svld1_u16(b_progress_u16x, b + b_idx);
-            b_max = svlastb(b_progress_u16x, b_u16x);
+            b_progress_b16x = svwhilelt_b16_u64(b_idx, b_length);
+            b_u16x = svld1_u16(b_progress_b16x, b + b_idx);
+            b_max = svlastb(b_progress_b16x, b_u16x);
         }
         b_min = svlasta(svpfalse_b(), b_u16x);
@@ -95,18 +95,18 @@ NK_PUBLIC void nk_sparse_intersect_u16_sve2( //
         //
         //      svuint16_t a_last_broadcasted =  svdup_n_u16(a_max);
         //      svuint16_t b_last_broadcasted =  svdup_n_u16(b_max);
-        svbool_t a_mask_u16x = svcmple_n_u16(a_progress_u16x, a_u16x, b_max);
-        svbool_t b_mask_u16x = svcmple_n_u16(b_progress_u16x, b_u16x, a_max);
-        nk_u64_t a_step = svcntp_b16(a_progress_u16x, a_mask_u16x);
-        nk_u64_t b_step = svcntp_b16(b_progress_u16x, b_mask_u16x);
+        svbool_t a_mask_b16x = svcmple_n_u16(a_progress_b16x, a_u16x, b_max);
+        svbool_t b_mask_b16x = svcmple_n_u16(b_progress_b16x, b_u16x, a_max);
+        nk_u64_t a_step = svcntp_b16(a_progress_b16x, a_mask_b16x);
+        nk_u64_t b_step = svcntp_b16(b_progress_b16x, b_mask_b16x);
         // Compare `a_u16x` with each lane of `b_u16x`
-        svbool_t equal_mask = svmatch_u16(a_progress_u16x, a_u16x, b_u16x);
+        svbool_t equal_mask_b16x = svmatch_u16(a_progress_b16x, a_u16x, b_u16x);
         for (nk_size_t i = 1; i < lanes_count; i++) {
             b_u16x = svext_u16(b_u16x, b_u16x, 8);
-            equal_mask = svorr_z(svptrue_b16(), equal_mask, svmatch_u16(a_progress_u16x, a_u16x, b_u16x));
+            equal_mask_b16x = svorr_z(svptrue_b16(), equal_mask_b16x, svmatch_u16(a_progress_b16x, a_u16x, b_u16x));
         }
-        nk_size_t equal_count = svcntp_b16(svptrue_b16(), equal_mask);
+        nk_size_t equal_count = svcntp_b16(svptrue_b16(), equal_mask_b16x);
         // Manually compact and store matching elements (svcompact_u16 is not defined)
         if (result) {
@@ -114,7 +114,7 @@ NK_PUBLIC void nk_sparse_intersect_u16_sve2( //
             nk_u16_t mask_data[16];
             svst1_u16(svptrue_b16(), a_data, a_u16x);
-            svst1_u16(svptrue_b16(), mask_data, svdup_n_u16_z(equal_mask, 1));
+            svst1_u16(svptrue_b16(), mask_data, svdup_n_u16_z(equal_mask_b16x, 1));
             for (nk_size_t i = 0; i < svcnth(); i++)
                 if (mask_data[i]) result[c++] = a_data[i];
@@ -142,31 +142,31 @@ NK_PUBLIC void nk_sparse_intersect_u32_sve2( //
     while (a_idx < a_length && b_idx < b_length) {
         // Load `a_member` and broadcast it, load `b_members_vec` from memory
-        svbool_t a_progress_u32x = svwhilelt_b32_u64(a_idx, a_length);
-        svbool_t b_progress_u32x = svwhilelt_b32_u64(b_idx, b_length);
-        svuint32_t a_u32x = svld1_u32(a_progress_u32x, a + a_idx);
-        svuint32_t b_u32x = svld1_u32(b_progress_u32x, b + b_idx);
+        svbool_t a_progress_b32x = svwhilelt_b32_u64(a_idx, a_length);
+        svbool_t b_progress_b32x = svwhilelt_b32_u64(b_idx, b_length);
+        svuint32_t a_u32x = svld1_u32(a_progress_b32x, a + a_idx);
+        svuint32_t b_u32x = svld1_u32(b_progress_b32x, b + b_idx);
         // Intersecting registers with `svmatch_u16` involves a lot of shuffling
         // and comparisons, so we want to avoid it if the slices don't overlap at all..
         nk_u32_t a_min;
-        nk_u32_t a_max = svlastb(a_progress_u32x, a_u32x);
+        nk_u32_t a_max = svlastb(a_progress_b32x, a_u32x);
         nk_u32_t b_min = svlasta(svpfalse_b(), b_u32x);
-        nk_u32_t b_max = svlastb(b_progress_u32x, b_u32x);
+        nk_u32_t b_max = svlastb(b_progress_b32x, b_u32x);
         // If the slices don't overlap, advance the appropriate pointer
         while (a_max < b_min && (a_idx + register_size) <= a_length) {
             a_idx += register_size;
-            a_progress_u32x = svwhilelt_b32_u64(a_idx, a_length);
-            a_u32x = svld1_u32(a_progress_u32x, a + a_idx);
-            a_max = svlastb(a_progress_u32x, a_u32x);
+            a_progress_b32x = svwhilelt_b32_u64(a_idx, a_length);
+            a_u32x = svld1_u32(a_progress_b32x, a + a_idx);
+            a_max = svlastb(a_progress_b32x, a_u32x);
         }
         a_min = svlasta(svpfalse_b(), a_u32x);
         while (b_max < a_min && (b_idx + register_size) <= b_length) {
             b_idx += register_size;
-            b_progress_u32x = svwhilelt_b32_u64(b_idx, b_length);
-            b_u32x = svld1_u32(b_progress_u32x, b + b_idx);
-            b_max = svlastb(b_progress_u32x, b_u32x);
+            b_progress_b32x = svwhilelt_b32_u64(b_idx, b_length);
+            b_u32x = svld1_u32(b_progress_b32x, b + b_idx);
+            b_max = svlastb(b_progress_b32x, b_u32x);
         }
         b_min = svlasta(svpfalse_b(), b_u32x);
@@ -177,21 +177,21 @@ NK_PUBLIC void nk_sparse_intersect_u32_sve2( //
         //
         //      svuint32_t a_last_broadcasted =  svdup_n_u32(a_max);
         //      svuint32_t b_last_broadcasted =  svdup_n_u32(b_max);
-        svbool_t a_mask_u32x = svcmple_n_u32(a_progress_u32x, a_u32x, b_max);
-        svbool_t b_mask_u32x = svcmple_n_u32(b_progress_u32x, b_u32x, a_max);
-        nk_u64_t a_step = svcntp_b32(a_progress_u32x, a_mask_u32x);
-        nk_u64_t b_step = svcntp_b32(b_progress_u32x, b_mask_u32x);
+        svbool_t a_mask_b32x = svcmple_n_u32(a_progress_b32x, a_u32x, b_max);
+        svbool_t b_mask_b32x = svcmple_n_u32(b_progress_b32x, b_u32x, a_max);
+        nk_u64_t a_step = svcntp_b32(a_progress_b32x, a_mask_b32x);
+        nk_u64_t b_step = svcntp_b32(b_progress_b32x, b_mask_b32x);
         // Comparing `a_u32x` with each lane of `b_u32x` can't be done with `svmatch`,
         // the same way as in `nk_sparse_intersect_u16_sve2`, as that instruction is only
         // available for 8-bit and 16-bit integers.
         //
-        //      svbool_t equal_mask = svpfalse_b();
+        //      svbool_t equal_mask_b32x = svpfalse_b();
         //      for (nk_size_t i = 0; i < register_size; i++) {
-        //          equal_mask = svorr_z(svptrue_b32(), equal_mask, svcmpeq_u32(a_progress, a_u32x, b_u32x));
+        //          equal_mask_b32x = svorr_z(svptrue_b32(), equal_mask_b32x, svcmpeq_u32(a_progress, a_u32x, b_u32x));
         //          b_u32x = svext_u32(b_u32x, b_u32x, 1);
         //      }
-        //      nk_size_t equal_count = svcntp_b32(a_progress, equal_mask);
+        //      nk_size_t equal_count = svcntp_b32(a_progress, equal_mask_b32x);
         //
         // Alternatively, one can use histogram instructions, like `svhistcnt_u32_z`.
         // They practically compute the prefix-matching count, which is equivalent to
@@ -210,19 +210,19 @@ NK_PUBLIC void nk_sparse_intersect_u32_sve2( //
         //      C 1 1 1 0             B 1 1 1 0
         //      D 1 1 1 1             A 1 1 1 1
         //
-        svuint32_t hist_lower = svhistcnt_u32_z(a_progress_u32x, a_u32x, b_u32x);
+        svuint32_t hist_low_u32x = svhistcnt_u32_z(a_progress_b32x, a_u32x, b_u32x);
         svuint32_t a_rev_u32x = svrev_u32(a_u32x);
         svuint32_t b_rev_u32x = svrev_u32(b_u32x);
-        svuint32_t hist_upper = svrev_u32(svhistcnt_u32_z(svptrue_b32(), a_rev_u32x, b_rev_u32x));
-        svuint32_t hist = svorr_u32_x(a_progress_u32x, hist_lower, hist_upper);
-        svbool_t equal_mask = svcmpne_n_u32(a_progress_u32x, hist, 0);
-        nk_size_t equal_count = svcntp_b32(a_progress_u32x, equal_mask);
+        svuint32_t hist_high_u32x = svrev_u32(svhistcnt_u32_z(svptrue_b32(), a_rev_u32x, b_rev_u32x));
+        svuint32_t hist_u32x = svorr_u32_x(a_progress_b32x, hist_low_u32x, hist_high_u32x);
+        svbool_t equal_mask_b32x = svcmpne_n_u32(a_progress_b32x, hist_u32x, 0);
+        nk_size_t equal_count = svcntp_b32(a_progress_b32x, equal_mask_b32x);
         // Use SVE2 svcompact to compress matching elements and store to result buffer
         if (result) {
-            svuint32_t compacted = svcompact_u32(equal_mask, a_u32x);
-            svbool_t store_predicate = svwhilelt_b32_u64(0, equal_count);
-            svst1_u32(store_predicate, result + c, compacted);
+            svuint32_t compacted_u32x = svcompact_u32(equal_mask_b32x, a_u32x);
+            svbool_t store_predicate_b32x = svwhilelt_b32_u64(0u, equal_count);
+            svst1_u32(store_predicate_b32x, result + c, compacted_u32x);
         }
         // Advance
@@ -246,56 +246,56 @@ NK_PUBLIC void nk_sparse_intersect_u64_sve2( //
     while (a_idx < a_length && b_idx < b_length) {
         // Load `a_member` and broadcast it, load `b_members_vec` from memory
-        svbool_t a_progress_u64x = svwhilelt_b64_u64(a_idx, a_length);
-        svbool_t b_progress_u64x = svwhilelt_b64_u64(b_idx, b_length);
-        svuint64_t a_u64x = svld1_u64(a_progress_u64x, a + a_idx);
-        svuint64_t b_u64x = svld1_u64(b_progress_u64x, b + b_idx);
+        svbool_t a_progress_b64x = svwhilelt_b64_u64(a_idx, a_length);
+        svbool_t b_progress_b64x = svwhilelt_b64_u64(b_idx, b_length);
+        svuint64_t a_u64x = svld1_u64(a_progress_b64x, a + a_idx);
+        svuint64_t b_u64x = svld1_u64(b_progress_b64x, b + b_idx);
         // Intersecting registers involves comparisons,
         // so we want to avoid it if the slices don't overlap at all.
         nk_u64_t a_min;
-        nk_u64_t a_max = svlastb(a_progress_u64x, a_u64x);
+        nk_u64_t a_max = svlastb(a_progress_b64x, a_u64x);
         nk_u64_t b_min = svlasta(svpfalse_b(), b_u64x);
-        nk_u64_t b_max = svlastb(b_progress_u64x, b_u64x);
+        nk_u64_t b_max = svlastb(b_progress_b64x, b_u64x);
         // If the slices don't overlap, advance the appropriate pointer
         while (a_max < b_min && (a_idx + register_size) <= a_length) {
             a_idx += register_size;
-            a_progress_u64x = svwhilelt_b64_u64(a_idx, a_length);
-            a_u64x = svld1_u64(a_progress_u64x, a + a_idx);
-            a_max = svlastb(a_progress_u64x, a_u64x);
+            a_progress_b64x = svwhilelt_b64_u64(a_idx, a_length);
+            a_u64x = svld1_u64(a_progress_b64x, a + a_idx);
+            a_max = svlastb(a_progress_b64x, a_u64x);
         }
         a_min = svlasta(svpfalse_b(), a_u64x);
         while (b_max < a_min && (b_idx + register_size) <= b_length) {
             b_idx += register_size;
-            b_progress_u64x = svwhilelt_b64_u64(b_idx, b_length);
-            b_u64x = svld1_u64(b_progress_u64x, b + b_idx);
-            b_max = svlastb(b_progress_u64x, b_u64x);
+            b_progress_b64x = svwhilelt_b64_u64(b_idx, b_length);
+            b_u64x = svld1_u64(b_progress_b64x, b + b_idx);
+            b_max = svlastb(b_progress_b64x, b_u64x);
         }
         b_min = svlasta(svpfalse_b(), b_u64x);
         // Estimate how much we will need to advance the pointers afterwards.
-        svbool_t a_mask_u64x = svcmple_n_u64(a_progress_u64x, a_u64x, b_max);
-        svbool_t b_mask_u64x = svcmple_n_u64(b_progress_u64x, b_u64x, a_max);
-        nk_u64_t a_step = svcntp_b64(a_progress_u64x, a_mask_u64x);
-        nk_u64_t b_step = svcntp_b64(b_progress_u64x, b_mask_u64x);
+        svbool_t a_mask_b64x = svcmple_n_u64(a_progress_b64x, a_u64x, b_max);
+        svbool_t b_mask_b64x = svcmple_n_u64(b_progress_b64x, b_u64x, a_max);
+        nk_u64_t a_step = svcntp_b64(a_progress_b64x, a_mask_b64x);
+        nk_u64_t b_step = svcntp_b64(b_progress_b64x, b_mask_b64x);
         // Use histogram instructions like `svhistcnt_u64_z` to compute intersection.
         // They compute the prefix-matching count, equivalent to the lower triangle
         // of the row-major intersection matrix.
-        svuint64_t hist_lower = svhistcnt_u64_z(a_progress_u64x, a_u64x, b_u64x);
+        svuint64_t hist_low_u64x = svhistcnt_u64_z(a_progress_b64x, a_u64x, b_u64x);
         svuint64_t a_rev_u64x = svrev_u64(a_u64x);
         svuint64_t b_rev_u64x = svrev_u64(b_u64x);
-        svuint64_t hist_upper = svrev_u64(svhistcnt_u64_z(svptrue_b64(), a_rev_u64x, b_rev_u64x));
-        svuint64_t hist = svorr_u64_x(a_progress_u64x, hist_lower, hist_upper);
-        svbool_t equal_mask = svcmpne_n_u64(a_progress_u64x, hist, 0);
-        nk_size_t equal_count = svcntp_b64(a_progress_u64x, equal_mask);
+        svuint64_t hist_high_u64x = svrev_u64(svhistcnt_u64_z(svptrue_b64(), a_rev_u64x, b_rev_u64x));
+        svuint64_t hist_u64x = svorr_u64_x(a_progress_b64x, hist_low_u64x, hist_high_u64x);
+        svbool_t equal_mask_b64x = svcmpne_n_u64(a_progress_b64x, hist_u64x, 0);
+        nk_size_t equal_count = svcntp_b64(a_progress_b64x, equal_mask_b64x);
         // Use SVE2 svcompact to compress matching elements and store to result buffer
         if (result) {
-            svuint64_t compacted = svcompact_u64(equal_mask, a_u64x);
-            svbool_t store_predicate = svwhilelt_b64_u64(0, equal_count);
-            svst1_u64(store_predicate, result + c, compacted);
+            svuint64_t compacted_u64x = svcompact_u64(equal_mask_b64x, a_u64x);
+            svbool_t store_predicate_b64x = svwhilelt_b64_u64(0u, equal_count);
+            svst1_u64(store_predicate_b64x, result + c, compacted_u64x);
         }
         // Advance
@@ -312,94 +312,90 @@ NK_PUBLIC void nk_sparse_dot_u32f32_sve2(                 //
     nk_size_t a_length, nk_size_t b_length,               //
     nk_f64_t *product) {
-    // A single SVE lane is 128 bits wide, so one lane fits 4 values.
     nk_size_t const register_size = svcntw();
     nk_size_t const vector_length_f64 = svcntd();
-    nk_size_t const lanes_count = register_size / 4;
     nk_size_t a_idx = 0, b_idx = 0;
-    svbool_t const predicate_all_f32x = svptrue_b32();
-    svbool_t const predicate_all_f64x = svptrue_b64();
+    svbool_t const predicate_all_b32x = svptrue_b32();
+    svbool_t const predicate_all_b64x = svptrue_b64();
     svfloat64_t product_f64x = svdup_f64(0.0);
     while (a_idx < a_length && b_idx < b_length) {
         // Load indices with progress predicates
-        svbool_t a_progress_u32x = svwhilelt_b32_u64(a_idx, a_length);
-        svbool_t b_progress_u32x = svwhilelt_b32_u64(b_idx, b_length);
-        svuint32_t a_u32x = svld1_u32(a_progress_u32x, a + a_idx);
-        svuint32_t b_u32x = svld1_u32(b_progress_u32x, b + b_idx);
+        svbool_t a_progress_b32x = svwhilelt_b32_u64(a_idx, a_length);
+        svbool_t b_progress_b32x = svwhilelt_b32_u64(b_idx, b_length);
+        svuint32_t a_u32x = svld1_u32(a_progress_b32x, a + a_idx);
+        svuint32_t b_u32x = svld1_u32(b_progress_b32x, b + b_idx);
         // Avoid expensive intersection if slices don't overlap at all
         nk_u32_t a_min;
-        nk_u32_t a_max = svlastb(a_progress_u32x, a_u32x);
+        nk_u32_t a_max = svlastb(a_progress_b32x, a_u32x);
         nk_u32_t b_min = svlasta(svpfalse_b(), b_u32x);
-        nk_u32_t b_max = svlastb(b_progress_u32x, b_u32x);
+        nk_u32_t b_max = svlastb(b_progress_b32x, b_u32x);
         // If the slices don't overlap, advance the appropriate pointer
         while (a_max < b_min && (a_idx + register_size) <= a_length) {
             a_idx += register_size;
-            a_progress_u32x = svwhilelt_b32_u64(a_idx, a_length);
-            a_u32x = svld1_u32(a_progress_u32x, a + a_idx);
-            a_max = svlastb(a_progress_u32x, a_u32x);
+            a_progress_b32x = svwhilelt_b32_u64(a_idx, a_length);
+            a_u32x = svld1_u32(a_progress_b32x, a + a_idx);
+            a_max = svlastb(a_progress_b32x, a_u32x);
         }
         a_min = svlasta(svpfalse_b(), a_u32x);
         while (b_max < a_min && (b_idx + register_size) <= b_length) {
             b_idx += register_size;
-            b_progress_u32x = svwhilelt_b32_u64(b_idx, b_length);
-            b_u32x = svld1_u32(b_progress_u32x, b + b_idx);
-            b_max = svlastb(b_progress_u32x, b_u32x);
+            b_progress_b32x = svwhilelt_b32_u64(b_idx, b_length);
+            b_u32x = svld1_u32(b_progress_b32x, b + b_idx);
+            b_max = svlastb(b_progress_b32x, b_u32x);
         }
         b_min = svlasta(svpfalse_b(), b_u32x);
         // Calculate step sizes before modifying vectors
-        svbool_t a_mask_u32x = svcmple_n_u32(a_progress_u32x, a_u32x, b_max);
-        svbool_t b_mask_u32x = svcmple_n_u32(b_progress_u32x, b_u32x, a_max);
-        nk_u64_t a_step = svcntp_b32(a_progress_u32x, a_mask_u32x);
-        nk_u64_t b_step = svcntp_b32(b_progress_u32x, b_mask_u32x);
+        svbool_t a_mask_b32x = svcmple_n_u32(a_progress_b32x, a_u32x, b_max);
+        svbool_t b_mask_b32x = svcmple_n_u32(b_progress_b32x, b_u32x, a_max);
+        nk_u64_t a_step = svcntp_b32(a_progress_b32x, a_mask_b32x);
+        nk_u64_t b_step = svcntp_b32(b_progress_b32x, b_mask_b32x);
         // Use histogram-based intersection (svmatch_u32 doesn't exist)
-        svuint32_t hist_lower_u32x = svhistcnt_u32_z(a_progress_u32x, a_u32x, b_u32x);
+        svuint32_t hist_low_u32x = svhistcnt_u32_z(a_progress_b32x, a_u32x, b_u32x);
         svuint32_t a_rev_u32x = svrev_u32(a_u32x);
         svuint32_t b_rev_u32x = svrev_u32(b_u32x);
-        svuint32_t hist_upper_u32x = svrev_u32(svhistcnt_u32_z(predicate_all_f32x, a_rev_u32x, b_rev_u32x));
-        svuint32_t hist_u32x = svorr_u32_x(a_progress_u32x, hist_lower_u32x, hist_upper_u32x);
-        svbool_t a_equal_mask_u32x = svcmpne_n_u32(a_progress_u32x, hist_u32x, 0);
-        svbool_t a_overlap_mask_u32x = svand_b_z(predicate_all_f32x, a_progress_u32x, a_equal_mask_u32x);
+        svuint32_t hist_high_u32x = svrev_u32(svhistcnt_u32_z(predicate_all_b32x, a_rev_u32x, b_rev_u32x));
+        svuint32_t hist_u32x = svorr_u32_x(a_progress_b32x, hist_low_u32x, hist_high_u32x);
+        svbool_t a_equal_mask_b32x = svcmpne_n_u32(a_progress_b32x, hist_u32x, 0);
+        svbool_t a_overlap_mask_b32x = svand_b_z(predicate_all_b32x, a_progress_b32x, a_equal_mask_b32x);
-        if (!svptest_any(a_progress_u32x, a_overlap_mask_u32x)) {
+        if (!svptest_any(a_progress_b32x, a_overlap_mask_b32x)) {
             a_idx += a_step;
             b_idx += b_step;
             continue;
         }
-        // Load weights and mask by intersection
-        svfloat32_t a_weights_f32x = svsel_f32(a_overlap_mask_u32x, svld1_f32(a_progress_u32x, a_weights + a_idx),
-                                               svdup_f32(0.f));
-        svfloat32_t b_weights_f32x = svld1_f32(b_progress_u32x, b_weights + b_idx);
-        svbool_t predicate_low_f64x = svwhilelt_b64_u64(a_idx, a_length);
-        svbool_t predicate_high_f64x = svwhilelt_b64_u64(a_idx + vector_length_f64, a_length);
-        svfloat64_t a_low_f64x = svcvt_f64_f32_x(predicate_low_f64x, a_weights_f32x);
-        svfloat64_t a_high_f64x = svcvtlt_f64_f32_x(predicate_high_f64x, a_weights_f32x);
-        // For each position in a that matches something in b, we need the corresponding b weight.
-        // Use lane-by-lane matching for dot product.
-        for (nk_size_t i = 0; i < lanes_count; i++) {
-            // Check which elements of a match the current rotation of b
-            svbool_t equal_lane_u32x = svcmpeq_u32(a_progress_u32x, a_u32x, b_u32x);
-            svfloat32_t b_equal_weights_f32x = svsel_f32(equal_lane_u32x, b_weights_f32x, svdup_f32(0.f));
-            svfloat64_t b_low_f64x = svcvt_f64_f32_x(predicate_low_f64x, b_equal_weights_f32x);
-            svfloat64_t b_high_f64x = svcvtlt_f64_f32_x(predicate_high_f64x, b_equal_weights_f32x);
-            product_f64x = svmla_f64_x(predicate_low_f64x, product_f64x, a_low_f64x, b_low_f64x);
-            product_f64x = svmla_f64_x(predicate_high_f64x, product_f64x, a_high_f64x, b_high_f64x);
-            // Rotate b vectors
-            b_u32x = svext_u32(b_u32x, b_u32x, 4);
-            b_weights_f32x = svext_f32(b_weights_f32x, b_weights_f32x, 4);
-        }
+        // Compute b overlap mask (symmetric histogram: which b elements match something in a)
+        svuint32_t b_hist_low_u32x = svhistcnt_u32_z(b_progress_b32x, b_u32x, a_u32x);
+        svuint32_t b_hist_high_u32x = svrev_u32(svhistcnt_u32_z(predicate_all_b32x, b_rev_u32x, a_rev_u32x));
+        svuint32_t b_hist_u32x = svorr_u32_x(b_progress_b32x, b_hist_low_u32x, b_hist_high_u32x);
+        svbool_t b_overlap_mask_b32x = svand_b_z(predicate_all_b32x, b_progress_b32x,
+                                                 svcmpne_n_u32(b_progress_b32x, b_hist_u32x, 0));
+        // Compact matching weights — both arrays are sorted, so svcompact
+        // preserves relative order and aligns corresponding intersection pairs.
+        svfloat32_t a_matched_f32x = svcompact_f32(a_overlap_mask_b32x, svld1_f32(a_progress_b32x, a_weights + a_idx));
+        svfloat32_t b_matched_f32x = svcompact_f32(b_overlap_mask_b32x, svld1_f32(b_progress_b32x, b_weights + b_idx));
+        // Widen to f64 and accumulate. svcvt_f64_f32 converts even-indexed f32
+        // elements; svcvtlt_f64_f32 converts odd-indexed f32 elements.
+        nk_size_t match_count = svcntp_b32(a_progress_b32x, a_overlap_mask_b32x);
+        svbool_t pred_even_b64x = svwhilelt_b64_u64(0u, (match_count + 1) / 2);
+        svbool_t pred_odd_b64x = svwhilelt_b64_u64(0u, match_count / 2);
+        product_f64x = svmla_f64_x(pred_even_b64x, product_f64x, svcvt_f64_f32_x(pred_even_b64x, a_matched_f32x),
+                                   svcvt_f64_f32_x(pred_even_b64x, b_matched_f32x));
+        product_f64x = svmla_f64_x(pred_odd_b64x, product_f64x, svcvtlt_f64_f32_x(pred_odd_b64x, a_matched_f32x),
+                                   svcvtlt_f64_f32_x(pred_odd_b64x, b_matched_f32x));
         // Advance
         a_idx += a_step;
         b_idx += b_step;
     }
-    *product = svaddv_f64(predicate_all_f64x, product_f64x);
+    *product = svaddv_f64(predicate_all_b64x, product_f64x);
 }
 #if defined(__clang__)
@@ -431,31 +427,31 @@ NK_PUBLIC void nk_sparse_dot_u16bf16_sve2(                  //
     while (a_idx < a_length && b_idx < b_length) {
         // Load the next slices of `a` and `b` indices under their progress predicates
-        svbool_t a_progress_u16x = svwhilelt_b16_u64(a_idx, a_length);
-        svbool_t b_progress_u16x = svwhilelt_b16_u64(b_idx, b_length);
-        svuint16_t a_u16x = svld1_u16(a_progress_u16x, a + a_idx);
-        svuint16_t b_u16x = svld1_u16(b_progress_u16x, b + b_idx);
+        svbool_t a_progress_b16x = svwhilelt_b16_u64(a_idx, a_length);
+        svbool_t b_progress_b16x = svwhilelt_b16_u64(b_idx, b_length);
+        svuint16_t a_u16x = svld1_u16(a_progress_b16x, a + a_idx);
+        svuint16_t b_u16x = svld1_u16(b_progress_b16x, b + b_idx);
         // Intersecting registers with `svmatch_u16` involves a lot of shuffling
         // and comparisons, so we want to avoid it if the slices don't overlap at all.
         nk_u16_t a_min;
-        nk_u16_t a_max = svlastb(a_progress_u16x, a_u16x);
+        nk_u16_t a_max = svlastb(a_progress_b16x, a_u16x);
         nk_u16_t b_min = svlasta(svpfalse_b(), b_u16x);
-        nk_u16_t b_max = svlastb(b_progress_u16x, b_u16x);
+        nk_u16_t b_max = svlastb(b_progress_b16x, b_u16x);
         // If the slices don't overlap, advance the appropriate pointer
         while (a_max < b_min && (a_idx + register_size) <= a_length) {
             a_idx += register_size;
-            a_progress_u16x = svwhilelt_b16_u64(a_idx, a_length);
-            a_u16x = svld1_u16(a_progress_u16x, a + a_idx);
-            a_max = svlastb(a_progress_u16x, a_u16x);
+            a_progress_b16x = svwhilelt_b16_u64(a_idx, a_length);
+            a_u16x = svld1_u16(a_progress_b16x, a + a_idx);
+            a_max = svlastb(a_progress_b16x, a_u16x);
         }
         a_min = svlasta(svpfalse_b(), a_u16x);
         while (b_max < a_min && (b_idx + register_size) <= b_length) {
             b_idx += register_size;
-            b_progress_u16x = svwhilelt_b16_u64(b_idx, b_length);
-            b_u16x = svld1_u16(b_progress_u16x, b + b_idx);
-            b_max = svlastb(b_progress_u16x, b_u16x);
+            b_progress_b16x = svwhilelt_b16_u64(b_idx, b_length);
+            b_u16x = svld1_u16(b_progress_b16x, b + b_idx);
+            b_max = svlastb(b_progress_b16x, b_u16x);
         }
         b_min = svlasta(svpfalse_b(), b_u16x);
@@ -466,20 +462,20 @@ NK_PUBLIC void nk_sparse_dot_u16bf16_sve2(                  //
         //
         //      svuint16_t a_last_broadcasted =  svdup_n_u16(a_max);
         //      svuint16_t b_last_broadcasted =  svdup_n_u16(b_max);
-        svbool_t a_mask_u16x = svcmple_n_u16(a_progress_u16x, a_u16x, b_max);
-        svbool_t b_mask_u16x = svcmple_n_u16(b_progress_u16x, b_u16x, a_max);
-        nk_u64_t a_step = svcntp_b16(a_progress_u16x, a_mask_u16x);
-        nk_u64_t b_step = svcntp_b16(b_progress_u16x, b_mask_u16x);
+        svbool_t a_mask_b16x = svcmple_n_u16(a_progress_b16x, a_u16x, b_max);
+        svbool_t b_mask_b16x = svcmple_n_u16(b_progress_b16x, b_u16x, a_max);
+        nk_u64_t a_step = svcntp_b16(a_progress_b16x, a_mask_b16x);
+        nk_u64_t b_step = svcntp_b16(b_progress_b16x, b_mask_b16x);
         // Compare `a_u16x` with each lane of `b_u16x`
-        svbfloat16_t a_weights_bf16x = svld1_bf16(a_progress_u16x, (__bf16 const *)a_weights + a_idx);
-        svbfloat16_t b_weights_bf16x = svld1_bf16(b_progress_u16x, (__bf16 const *)b_weights + b_idx);
+        svbfloat16_t a_weights_bf16x = svld1_bf16(a_progress_b16x, (__bf16 const *)a_weights + a_idx);
+        svbfloat16_t b_weights_bf16x = svld1_bf16(b_progress_b16x, (__bf16 const *)b_weights + b_idx);
         for (nk_size_t i = 0; i < lanes_count; i++) {
-            svbool_t equal_mask_u16x = svmatch_u16(a_progress_u16x, a_u16x, b_u16x);
+            svbool_t equal_mask_b16x = svmatch_u16(a_progress_b16x, a_u16x, b_u16x);
             //! The `svsel_bf16` intrinsic is broken in many compilers, not returning the correct type.
             //! So we reinterpret floats as integers and apply `svsel_s16`, but the `svreinterpret_s16_bf16`
             //! and `svreinterpret_bf16_s16` are not always properly defined!
-            svint16_t b_equal_weights_s16x = svsel_s16(equal_mask_u16x, svreinterpret_s16_bf16(b_weights_bf16x),
+            svint16_t b_equal_weights_s16x = svsel_s16(equal_mask_b16x, svreinterpret_s16_bf16(b_weights_bf16x),
                                                        svdup_n_s16(0));
             product_f32x = svbfdot_f32(product_f32x, a_weights_bf16x, svreinterpret_bf16_s16(b_equal_weights_s16x));
             b_u16x = svext_u16(b_u16x, b_u16x, 8);

package/include/numkong/sparse/turin.h CHANGED Viewed

@@ -243,8 +243,8 @@ NK_PUBLIC void nk_sparse_dot_u32f32_turin(                //
     // Native VP2INTERSECTD works directly on u32 - no conversion needed!
     nk_u32_t const *const a_end = a + a_length;
     nk_u32_t const *const b_end = b + b_length;
-    __m512d product_lower_f64x8 = _mm512_setzero_pd();
-    __m512d product_upper_f64x8 = _mm512_setzero_pd();
+    __m512d product_low_f64x8 = _mm512_setzero_pd();
+    __m512d product_high_f64x8 = _mm512_setzero_pd();
     nk_b512_vec_t a_vec, b_vec;
     while (a + 16 <= a_end && b + 16 <= b_end) {
@@ -281,15 +281,15 @@ NK_PUBLIC void nk_sparse_dot_u32f32_turin(                //
             __m512 b_weights_f32x16 = _mm512_loadu_ps(b_weights);
             __m512 a_matched_f32x16 = _mm512_maskz_compress_ps(a_matches, a_weights_f32x16);
             __m512 b_matched_f32x16 = _mm512_maskz_compress_ps(b_matches, b_weights_f32x16);
-            __m256 a_matched_lower_f32x8 = _mm512_castps512_ps256(a_matched_f32x16);
-            __m256 a_matched_upper_f32x8 = _mm512_extractf32x8_ps(a_matched_f32x16, 1);
-            __m256 b_matched_lower_f32x8 = _mm512_castps512_ps256(b_matched_f32x16);
-            __m256 b_matched_upper_f32x8 = _mm512_extractf32x8_ps(b_matched_f32x16, 1);
-            product_lower_f64x8 = _mm512_fmadd_pd(_mm512_cvtps_pd(a_matched_lower_f32x8),
-                                                  _mm512_cvtps_pd(b_matched_lower_f32x8), product_lower_f64x8);
-            product_upper_f64x8 = _mm512_fmadd_pd(_mm512_cvtps_pd(a_matched_upper_f32x8),
-                                                  _mm512_cvtps_pd(b_matched_upper_f32x8), product_upper_f64x8);
+            __m256 a_matched_low_f32x8 = _mm512_castps512_ps256(a_matched_f32x16);
+            __m256 a_matched_high_f32x8 = _mm512_extractf32x8_ps(a_matched_f32x16, 1);
+            __m256 b_matched_low_f32x8 = _mm512_castps512_ps256(b_matched_f32x16);
+            __m256 b_matched_high_f32x8 = _mm512_extractf32x8_ps(b_matched_f32x16, 1);
+            product_low_f64x8 = _mm512_fmadd_pd(_mm512_cvtps_pd(a_matched_low_f32x8),
+                                                _mm512_cvtps_pd(b_matched_low_f32x8), product_low_f64x8);
+            product_high_f64x8 = _mm512_fmadd_pd(_mm512_cvtps_pd(a_matched_high_f32x8),
+                                                 _mm512_cvtps_pd(b_matched_high_f32x8), product_high_f64x8);
         }
         __m512i a_max_u32x16 = _mm512_set1_epi32(*(int const *)&a_max);
@@ -304,7 +304,7 @@ NK_PUBLIC void nk_sparse_dot_u32f32_turin(                //
     nk_f64_t tail_product = 0;
     nk_sparse_dot_u32f32_serial(a, b, a_weights, b_weights, a_end - a, b_end - b, &tail_product);
-    *product = _mm512_reduce_add_pd(product_lower_f64x8) + _mm512_reduce_add_pd(product_upper_f64x8) + tail_product;
+    *product = _mm512_reduce_add_pd(product_low_f64x8) + _mm512_reduce_add_pd(product_high_f64x8) + tail_product;
 }
 #if defined(__clang__)

package/include/numkong/sparse.h CHANGED Viewed

@@ -57,22 +57,22 @@
  *  The Ice Lake kernels are shuffle/compare heavy; their throughput is often gated by port 5.
  *  On Genoa, many integer ops dual-issue on FP ports, often improving throughput despite higher latency.
  *
- *      Intrinsic                       Instruction                      Ice           Genoa
- *      _mm512_shuffle_epi32            VPSHUFD (ZMM, ZMM, I8)           1c @ p5       1c @ p123
- *      _mm512_mask_cmpneq_epi32_mask   VPCMPD (K, ZMM, ZMM, I8)         3c @ p5       5c @ p01
- *      _mm512_alignr_epi32             VALIGND (ZMM, ZMM, ZMM, I8)      3c @ p5       6c @ p12
- *      _mm512_conflict_epi32           VPCONFLICTD (ZMM, ZMM)           26c @ p0/5    7c @ p01/12
- *      _mm256_maskz_compress_epi16     VPCOMPRESSW (YMM, K, YMM)        3-6c @ p5     4-8c @ p01/12
- *      _mm256_dpwssds_epi32            VPDPWSSDS (YMM, K, YMM, YMM)     4-5c @ p01    4c @ p01
- *      _mm256_dpbf16_ps                VDPBF16PS (YMM, YMM, YMM)        n/a           6c @ p01
+ *      Intrinsic                      Instruction                   Icelake           Genoa
+ *      _mm512_shuffle_epi32           VPSHUFD (ZMM, ZMM, I8)        1cy @ p5          1cy @ p123
+ *      _mm512_mask_cmpneq_epi32_mask  VPCMPD (K, ZMM, ZMM, I8)      3cy @ p5          5cy @ p01
+ *      _mm512_alignr_epi32            VALIGND (ZMM, ZMM, ZMM, I8)   3cy @ p5          6cy @ p12
+ *      _mm512_conflict_epi32          VPCONFLICTD (ZMM, ZMM)        26cy @ p0+p05+p5  7cy @ p01+p12
+ *      _mm256_maskz_compress_epi16    VPCOMPRESSW (YMM, K, YMM)     3-6cy @ p5+p5     4-8cy @ p01+p12
+ *      _mm256_dpwssds_epi32           VPDPWSSDS (YMM, K, YMM, YMM)  4-5cy @ p01       4cy @ p01
+ *      _mm256_dpbf16_ps               VDPBF16PS (YMM, YMM, YMM)     n/a               6cy @ p01
  *
  *  VP2INTERSECTD is unsupported on Ice Lake and not yet covered by uops.info for Zen5/Turin.
- *  Tiger Lake measures ~36-41c @ p5 for ZMM variants, which is why we always avoid it on Intel.
+ *  Tiger Lake measures ~36-41cy @ p5 for ZMM variants, which is why we always avoid it on Intel.
  *
  *  @section references References
  *
  *  - uops.info: https://uops.info/
- *  - Intel Intrinsics Guide: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
+ *  - Intel Intrinsics Guide: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
  *  - Arm Intrinsics Reference: https://developer.arm.com/architectures/instruction-sets/intrinsics/
  *  - vp2intersect experiments: https://github.com/mozonaut/vp2intersect
  *  - Diez-Canas "Faster-Than-Native Alternatives for x86 VP2INTERSECT Instructions":