numkong 7.4.4 → 7.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/README.md +1 -0
  2. package/binding.gyp +81 -5
  3. package/c/dispatch_f16.c +23 -0
  4. package/c/numkong.c +0 -13
  5. package/include/numkong/attention/sme.h +34 -31
  6. package/include/numkong/capabilities.h +2 -15
  7. package/include/numkong/cast/neon.h +15 -0
  8. package/include/numkong/curved/smef64.h +82 -62
  9. package/include/numkong/dot/rvvbf16.h +1 -1
  10. package/include/numkong/dot/rvvhalf.h +1 -1
  11. package/include/numkong/dot/sve.h +6 -5
  12. package/include/numkong/dot/svebfdot.h +2 -1
  13. package/include/numkong/dot/svehalf.h +6 -5
  14. package/include/numkong/dot/svesdot.h +3 -2
  15. package/include/numkong/dots/graniteamx.h +733 -0
  16. package/include/numkong/dots/serial.h +11 -4
  17. package/include/numkong/dots/sme.h +172 -140
  18. package/include/numkong/dots/smebi32.h +14 -11
  19. package/include/numkong/dots/smef64.h +31 -26
  20. package/include/numkong/dots.h +29 -3
  21. package/include/numkong/each/serial.h +22 -0
  22. package/include/numkong/geospatial/haswell.h +1 -1
  23. package/include/numkong/geospatial/neon.h +1 -1
  24. package/include/numkong/geospatial/serial.h +1 -1
  25. package/include/numkong/geospatial/skylake.h +1 -1
  26. package/include/numkong/maxsim/sme.h +94 -55
  27. package/include/numkong/mesh/README.md +13 -27
  28. package/include/numkong/mesh/haswell.h +25 -122
  29. package/include/numkong/mesh/neon.h +21 -110
  30. package/include/numkong/mesh/neonbfdot.h +4 -43
  31. package/include/numkong/mesh/rvv.h +7 -82
  32. package/include/numkong/mesh/serial.h +48 -53
  33. package/include/numkong/mesh/skylake.h +7 -123
  34. package/include/numkong/mesh/v128relaxed.h +9 -93
  35. package/include/numkong/mesh.h +2 -2
  36. package/include/numkong/mesh.hpp +35 -96
  37. package/include/numkong/reduce/neon.h +29 -0
  38. package/include/numkong/reduce/neonbfdot.h +2 -2
  39. package/include/numkong/reduce/neonfhm.h +4 -4
  40. package/include/numkong/reduce/sve.h +52 -0
  41. package/include/numkong/reduce.h +4 -0
  42. package/include/numkong/set/sve.h +6 -5
  43. package/include/numkong/sets/smebi32.h +35 -30
  44. package/include/numkong/sparse/sve2.h +3 -2
  45. package/include/numkong/spatial/sve.h +7 -6
  46. package/include/numkong/spatial/svebfdot.h +7 -4
  47. package/include/numkong/spatial/svehalf.h +5 -4
  48. package/include/numkong/spatial/svesdot.h +9 -8
  49. package/include/numkong/spatials/graniteamx.h +173 -0
  50. package/include/numkong/spatials/serial.h +22 -0
  51. package/include/numkong/spatials/sme.h +391 -350
  52. package/include/numkong/spatials/smef64.h +79 -70
  53. package/include/numkong/spatials.h +37 -4
  54. package/include/numkong/types.h +59 -0
  55. package/javascript/dist/cjs/numkong.js +13 -0
  56. package/javascript/dist/esm/numkong.js +13 -0
  57. package/javascript/numkong.c +56 -12
  58. package/javascript/numkong.ts +13 -0
  59. package/package.json +7 -7
  60. package/probes/probe.js +2 -2
  61. package/wasm/numkong.wasm +0 -0
@@ -570,16 +570,10 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
570
570
  rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
571
571
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
572
572
  if (scale) *scale = 1.0f;
573
+ if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
574
+ if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
573
575
 
574
- // Fused single-pass: accumulate centroids and squared differences simultaneously.
575
- // RMSD = √(E[(a−b)²] − (ā − b̄)²)
576
576
  v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
577
- v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
578
- v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
579
- v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
580
- v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
581
- v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
582
- v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
583
577
  v128_t sum_sq_x_low_f64x2 = zero_f64x2, sum_sq_x_high_f64x2 = zero_f64x2;
584
578
  v128_t sum_sq_y_low_f64x2 = zero_f64x2, sum_sq_y_high_f64x2 = zero_f64x2;
585
579
  v128_t sum_sq_z_low_f64x2 = zero_f64x2, sum_sq_z_high_f64x2 = zero_f64x2;
@@ -590,8 +584,7 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
590
584
  nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
591
585
  nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
592
586
 
593
- // Promote lower and upper halves to f64. Deltas computed in f64 to avoid
594
- // f32 cancellation in the single-pass formula RMSD = √(E[(a−b)²] − (ā − b̄)²).
587
+ // Promote lower and upper halves to f64 for precision.
595
588
  v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
596
589
  v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
597
590
  v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
@@ -605,21 +598,7 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
605
598
  v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
606
599
  v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
607
600
 
608
- // Accumulate centroids.
609
- sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2);
610
- sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
611
- sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2);
612
- sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
613
- sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2);
614
- sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
615
- sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2);
616
- sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
617
- sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2);
618
- sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
619
- sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2);
620
- sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
621
-
622
- // Accumulate squared differences in f64 — deltas computed in f64 for precision.
601
+ // Accumulate squared differences in f64.
623
602
  v128_t dx_low_f64x2 = wasm_f64x2_sub(a_x_low_f64x2, b_x_low_f64x2);
624
603
  v128_t dx_high_f64x2 = wasm_f64x2_sub(a_x_high_f64x2, b_x_high_f64x2);
625
604
  v128_t dy_low_f64x2 = wasm_f64x2_sub(a_y_low_f64x2, b_y_low_f64x2);
@@ -635,12 +614,6 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
635
614
  sum_sq_z_high_f64x2 = wasm_f64x2_relaxed_madd(dz_high_f64x2, dz_high_f64x2, sum_sq_z_high_f64x2);
636
615
  }
637
616
 
638
- nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
639
- nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
640
- nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
641
- nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
642
- nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
643
- nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
644
617
  nk_f64_t sum_sq_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_x_low_f64x2, sum_sq_x_high_f64x2));
645
618
  nk_f64_t sum_sq_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_y_low_f64x2, sum_sq_y_high_f64x2));
646
619
  nk_f64_t sum_sq_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_z_low_f64x2, sum_sq_z_high_f64x2));
@@ -649,45 +622,25 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
649
622
  for (; index < n; ++index) {
650
623
  nk_f64_t ax = a[index * 3 + 0], ay = a[index * 3 + 1], az = a[index * 3 + 2];
651
624
  nk_f64_t bx = b[index * 3 + 0], by = b[index * 3 + 1], bz = b[index * 3 + 2];
652
- sum_a_x += ax, sum_a_y += ay, sum_a_z += az;
653
- sum_b_x += bx, sum_b_y += by, sum_b_z += bz;
654
625
  nk_f64_t dx = ax - bx, dy = ay - by, dz = az - bz;
655
626
  sum_sq_x += dx * dx, sum_sq_y += dy * dy, sum_sq_z += dz * dz;
656
627
  }
657
628
 
658
- nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
659
- nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
660
- centroid_a_z = sum_a_z * inv_points_count;
661
- nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
662
- centroid_b_z = sum_b_z * inv_points_count;
663
- if (a_centroid)
664
- a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
665
- a_centroid[2] = (nk_f32_t)centroid_a_z;
666
- if (b_centroid)
667
- b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
668
- b_centroid[2] = (nk_f32_t)centroid_b_z;
669
-
670
- nk_f64_t sum_squared = sum_sq_x + sum_sq_y + sum_sq_z;
671
- nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
672
- nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
673
- nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
674
- nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
675
- *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
629
+ *result = nk_f64_sqrt_v128relaxed((sum_sq_x + sum_sq_y + sum_sq_z) / (nk_f64_t)n);
676
630
  }
677
631
 
678
632
  NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
679
633
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
680
- // RMSD uses identity rotation and scale=1.0
681
634
  if (rotation)
682
635
  rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
683
636
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
684
637
  if (scale) *scale = 1.0;
638
+ if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
639
+ if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
685
640
 
686
641
  v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
687
642
 
688
- // Accumulators for centroids and squared differences
689
- v128_t sum_a_x_f64x2 = zeros_f64x2, sum_a_y_f64x2 = zeros_f64x2, sum_a_z_f64x2 = zeros_f64x2;
690
- v128_t sum_b_x_f64x2 = zeros_f64x2, sum_b_y_f64x2 = zeros_f64x2, sum_b_z_f64x2 = zeros_f64x2;
643
+ // Accumulators for squared differences
691
644
  v128_t sum_squared_x_f64x2 = zeros_f64x2, sum_squared_y_f64x2 = zeros_f64x2, sum_squared_z_f64x2 = zeros_f64x2;
692
645
 
693
646
  v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
@@ -698,13 +651,6 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
698
651
  nk_deinterleave_f64x2_v128relaxed_(a + i * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
699
652
  nk_deinterleave_f64x2_v128relaxed_(b + i * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);
700
653
 
701
- sum_a_x_f64x2 = wasm_f64x2_add(sum_a_x_f64x2, a_x_f64x2);
702
- sum_a_y_f64x2 = wasm_f64x2_add(sum_a_y_f64x2, a_y_f64x2);
703
- sum_a_z_f64x2 = wasm_f64x2_add(sum_a_z_f64x2, a_z_f64x2);
704
- sum_b_x_f64x2 = wasm_f64x2_add(sum_b_x_f64x2, b_x_f64x2);
705
- sum_b_y_f64x2 = wasm_f64x2_add(sum_b_y_f64x2, b_y_f64x2);
706
- sum_b_z_f64x2 = wasm_f64x2_add(sum_b_z_f64x2, b_z_f64x2);
707
-
708
654
  v128_t delta_x_f64x2 = wasm_f64x2_sub(a_x_f64x2, b_x_f64x2);
709
655
  v128_t delta_y_f64x2 = wasm_f64x2_sub(a_y_f64x2, b_y_f64x2);
710
656
  v128_t delta_z_f64x2 = wasm_f64x2_sub(a_z_f64x2, b_z_f64x2);
@@ -715,12 +661,6 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
715
661
  }
716
662
 
717
663
  // Reduce vectors to scalars.
718
- nk_f64_t total_ax = nk_reduce_stable_f64x2_v128relaxed_(sum_a_x_f64x2), total_ax_compensation = 0.0;
719
- nk_f64_t total_ay = nk_reduce_stable_f64x2_v128relaxed_(sum_a_y_f64x2), total_ay_compensation = 0.0;
720
- nk_f64_t total_az = nk_reduce_stable_f64x2_v128relaxed_(sum_a_z_f64x2), total_az_compensation = 0.0;
721
- nk_f64_t total_bx = nk_reduce_stable_f64x2_v128relaxed_(sum_b_x_f64x2), total_bx_compensation = 0.0;
722
- nk_f64_t total_by = nk_reduce_stable_f64x2_v128relaxed_(sum_b_y_f64x2), total_by_compensation = 0.0;
723
- nk_f64_t total_bz = nk_reduce_stable_f64x2_v128relaxed_(sum_b_z_f64x2), total_bz_compensation = 0.0;
724
664
  nk_f64_t total_squared_x = nk_reduce_stable_f64x2_v128relaxed_(sum_squared_x_f64x2),
725
665
  total_squared_x_compensation = 0.0;
726
666
  nk_f64_t total_squared_y = nk_reduce_stable_f64x2_v128relaxed_(sum_squared_y_f64x2),
@@ -732,40 +672,16 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
732
672
  for (; i < n; ++i) {
733
673
  nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
734
674
  nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
735
- nk_accumulate_sum_f64_(&total_ax, &total_ax_compensation, ax);
736
- nk_accumulate_sum_f64_(&total_ay, &total_ay_compensation, ay);
737
- nk_accumulate_sum_f64_(&total_az, &total_az_compensation, az);
738
- nk_accumulate_sum_f64_(&total_bx, &total_bx_compensation, bx);
739
- nk_accumulate_sum_f64_(&total_by, &total_by_compensation, by);
740
- nk_accumulate_sum_f64_(&total_bz, &total_bz_compensation, bz);
741
675
  nk_f64_t delta_x = ax - bx, delta_y = ay - by, delta_z = az - bz;
742
676
  nk_accumulate_square_f64_(&total_squared_x, &total_squared_x_compensation, delta_x);
743
677
  nk_accumulate_square_f64_(&total_squared_y, &total_squared_y_compensation, delta_y);
744
678
  nk_accumulate_square_f64_(&total_squared_z, &total_squared_z_compensation, delta_z);
745
679
  }
746
680
 
747
- total_ax += total_ax_compensation, total_ay += total_ay_compensation, total_az += total_az_compensation;
748
- total_bx += total_bx_compensation, total_by += total_by_compensation, total_bz += total_bz_compensation;
749
681
  total_squared_x += total_squared_x_compensation, total_squared_y += total_squared_y_compensation,
750
682
  total_squared_z += total_squared_z_compensation;
751
683
 
752
- // Compute centroids
753
- nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
754
- nk_f64_t centroid_a_x = total_ax * inv_points_count, centroid_a_y = total_ay * inv_points_count,
755
- centroid_a_z = total_az * inv_points_count;
756
- nk_f64_t centroid_b_x = total_bx * inv_points_count, centroid_b_y = total_by * inv_points_count,
757
- centroid_b_z = total_bz * inv_points_count;
758
- if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
759
- if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
760
-
761
- // Compute RMSD
762
- nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
763
- nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
764
- nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
765
- nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
766
- nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
767
-
768
- *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
684
+ *result = nk_f64_sqrt_v128relaxed((total_squared_x + total_squared_y + total_squared_z) / (nk_f64_t)n);
769
685
  }
770
686
 
771
687
  NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
@@ -6,7 +6,7 @@
6
6
  *
7
7
  * Contains:
8
8
  *
9
- * - Root Mean Square Deviation (RMSD) for rigid body superposition
9
+ * - Root Mean Square Deviation (RMSD) of raw point differences
10
10
  * - Kabsch algorithm for optimal rigid body alignment (rotation only)
11
11
  * - Umeyama algorithm for similarity transform (rotation + uniform scaling)
12
12
  *
@@ -48,7 +48,7 @@
48
48
  *
49
49
  * @section algorithm_overview Algorithm Overview
50
50
  *
51
- * - RMSD: Simple root mean square deviation without alignment. R = identity, scale = 1.0
51
+ * - RMSD: Raw √(Σ‖aᵢ − bᵢ‖² / n) without centering or alignment. R = identity, scale = 1.0, centroids zeroed
52
52
  * - Kabsch: Finds optimal rotation R minimizing ‖R × (a - ā) - (b - b̄)‖. scale = 1.0
53
53
  * - Umeyama: Finds optimal rotation R and scale c minimizing ‖c × R × (a - ā) - (b - b̄)‖
54
54
  *
@@ -354,74 +354,30 @@ void rmsd( //
354
354
  else if constexpr (std::is_same_v<in_type_, bf16_t> && simd)
355
355
  nk_rmsd_bf16(&a->raw_, &b->raw_, n, &a_centroid->raw_, &b_centroid->raw_, &rotation->raw_,
356
356
  scale ? &scale->raw_ : nullptr, &metric->raw_);
357
- // Scalar fallback
357
+ // Scalar fallback: raw √(Σ‖aᵢ − bᵢ‖² / n), no centering
358
358
  else {
359
- // Step 1: Compute centroids
360
- metric_type_ sum_a_x {}, sum_a_y {}, sum_a_z {};
361
- metric_type_ sum_b_x {}, sum_b_y {}, sum_b_z {};
362
- metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
363
-
364
- for (std::size_t i = 0; i < n; i++) {
365
- val_a_x = metric_type_(a[i * 3 + 0]);
366
- val_a_y = metric_type_(a[i * 3 + 1]);
367
- val_a_z = metric_type_(a[i * 3 + 2]);
368
- val_b_x = metric_type_(b[i * 3 + 0]);
369
- val_b_y = metric_type_(b[i * 3 + 1]);
370
- val_b_z = metric_type_(b[i * 3 + 2]);
371
- sum_a_x = sum_a_x + val_a_x;
372
- sum_a_y = sum_a_y + val_a_y;
373
- sum_a_z = sum_a_z + val_a_z;
374
- sum_b_x = sum_b_x + val_b_x;
375
- sum_b_y = sum_b_y + val_b_y;
376
- sum_b_z = sum_b_z + val_b_z;
377
- }
378
-
379
- metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
380
- metric_type_ centroid_a_x = sum_a_x * inv_n;
381
- metric_type_ centroid_a_y = sum_a_y * inv_n;
382
- metric_type_ centroid_a_z = sum_a_z * inv_n;
383
- metric_type_ centroid_b_x = sum_b_x * inv_n;
384
- metric_type_ centroid_b_y = sum_b_y * inv_n;
385
- metric_type_ centroid_b_z = sum_b_z * inv_n;
386
-
387
- // Step 2: Store centroids if requested
388
359
  if (a_centroid)
389
- a_centroid[0] = transform_type_(centroid_a_x), a_centroid[1] = transform_type_(centroid_a_y),
390
- a_centroid[2] = transform_type_(centroid_a_z);
360
+ a_centroid[0] = transform_type_(0.0), a_centroid[1] = transform_type_(0.0),
361
+ a_centroid[2] = transform_type_(0.0);
391
362
  if (b_centroid)
392
- b_centroid[0] = transform_type_(centroid_b_x), b_centroid[1] = transform_type_(centroid_b_y),
393
- b_centroid[2] = transform_type_(centroid_b_z);
394
-
395
- // Step 3: RMSD uses identity rotation and scale=1.0
363
+ b_centroid[0] = transform_type_(0.0), b_centroid[1] = transform_type_(0.0),
364
+ b_centroid[2] = transform_type_(0.0);
396
365
  if (rotation) {
397
- rotation[0] = transform_type_(1.0);
398
- rotation[1] = transform_type_(0.0);
399
- rotation[2] = transform_type_(0.0);
400
- rotation[3] = transform_type_(0.0);
401
- rotation[4] = transform_type_(1.0);
402
- rotation[5] = transform_type_(0.0);
403
- rotation[6] = transform_type_(0.0);
404
- rotation[7] = transform_type_(0.0);
405
- rotation[8] = transform_type_(1.0);
366
+ rotation[0] = transform_type_(1.0), rotation[1] = transform_type_(0.0), rotation[2] = transform_type_(0.0);
367
+ rotation[3] = transform_type_(0.0), rotation[4] = transform_type_(1.0), rotation[5] = transform_type_(0.0);
368
+ rotation[6] = transform_type_(0.0), rotation[7] = transform_type_(0.0), rotation[8] = transform_type_(1.0);
406
369
  }
407
370
  if (scale) *scale = transform_type_(1.0);
408
371
 
409
- // Step 4: Compute RMSD between centered point clouds
410
372
  metric_type_ sum_squared {};
411
373
  for (std::size_t i = 0; i < n; i++) {
412
- val_a_x = metric_type_(a[i * 3 + 0]);
413
- val_a_y = metric_type_(a[i * 3 + 1]);
414
- val_a_z = metric_type_(a[i * 3 + 2]);
415
- val_b_x = metric_type_(b[i * 3 + 0]);
416
- val_b_y = metric_type_(b[i * 3 + 1]);
417
- val_b_z = metric_type_(b[i * 3 + 2]);
418
- metric_type_ dx = (val_a_x - centroid_a_x) - (val_b_x - centroid_b_x);
419
- metric_type_ dy = (val_a_y - centroid_a_y) - (val_b_y - centroid_b_y);
420
- metric_type_ dz = (val_a_z - centroid_a_z) - (val_b_z - centroid_b_z);
374
+ metric_type_ dx = metric_type_(a[i * 3 + 0]) - metric_type_(b[i * 3 + 0]);
375
+ metric_type_ dy = metric_type_(a[i * 3 + 1]) - metric_type_(b[i * 3 + 1]);
376
+ metric_type_ dz = metric_type_(a[i * 3 + 2]) - metric_type_(b[i * 3 + 2]);
421
377
  sum_squared = sum_squared + dx * dx + dy * dy + dz * dz;
422
378
  }
423
379
 
424
- *metric = (sum_squared * inv_n).sqrt();
380
+ *metric = (sum_squared / metric_type_(static_cast<double>(n))).sqrt();
425
381
  }
426
382
  }
427
383
 
@@ -470,18 +426,12 @@ void kabsch( //
470
426
  metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
471
427
 
472
428
  for (std::size_t i = 0; i < n; i++) {
473
- val_a_x = metric_type_(a[i * 3 + 0]);
474
- val_a_y = metric_type_(a[i * 3 + 1]);
429
+ val_a_x = metric_type_(a[i * 3 + 0]), val_a_y = metric_type_(a[i * 3 + 1]),
475
430
  val_a_z = metric_type_(a[i * 3 + 2]);
476
- val_b_x = metric_type_(b[i * 3 + 0]);
477
- val_b_y = metric_type_(b[i * 3 + 1]);
431
+ val_b_x = metric_type_(b[i * 3 + 0]), val_b_y = metric_type_(b[i * 3 + 1]),
478
432
  val_b_z = metric_type_(b[i * 3 + 2]);
479
- sum_a_x = sum_a_x + val_a_x;
480
- sum_a_y = sum_a_y + val_a_y;
481
- sum_a_z = sum_a_z + val_a_z;
482
- sum_b_x = sum_b_x + val_b_x;
483
- sum_b_y = sum_b_y + val_b_y;
484
- sum_b_z = sum_b_z + val_b_z;
433
+ sum_a_x = sum_a_x + val_a_x, sum_a_y = sum_a_y + val_a_y, sum_a_z = sum_a_z + val_a_z;
434
+ sum_b_x = sum_b_x + val_b_x, sum_b_y = sum_b_y + val_b_y, sum_b_z = sum_b_z + val_b_z;
485
435
  }
486
436
 
487
437
  metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
@@ -503,11 +453,9 @@ void kabsch( //
503
453
  // Step 2: Build 3x3 covariance matrix H = (A - A_bar)^T x (B - B_bar)
504
454
  metric_type_ cross_covariance[9] = {};
505
455
  for (std::size_t i = 0; i < n; i++) {
506
- val_a_x = metric_type_(a[i * 3 + 0]) - centroid_a_x;
507
- val_a_y = metric_type_(a[i * 3 + 1]) - centroid_a_y;
456
+ val_a_x = metric_type_(a[i * 3 + 0]) - centroid_a_x, val_a_y = metric_type_(a[i * 3 + 1]) - centroid_a_y,
508
457
  val_a_z = metric_type_(a[i * 3 + 2]) - centroid_a_z;
509
- val_b_x = metric_type_(b[i * 3 + 0]) - centroid_b_x;
510
- val_b_y = metric_type_(b[i * 3 + 1]) - centroid_b_y;
458
+ val_b_x = metric_type_(b[i * 3 + 0]) - centroid_b_x, val_b_y = metric_type_(b[i * 3 + 1]) - centroid_b_y,
511
459
  val_b_z = metric_type_(b[i * 3 + 2]) - centroid_b_z;
512
460
  cross_covariance[0] = cross_covariance[0] + val_a_x * val_b_x;
513
461
  cross_covariance[1] = cross_covariance[1] + val_a_x * val_b_y;
@@ -563,11 +511,11 @@ void kabsch( //
563
511
  metric_type_ sum_squared {};
564
512
  for (std::size_t i = 0; i < n; i++) {
565
513
  metric_type_ point_a[3], point_b[3], rotated_point_a[3];
566
- point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x;
567
- point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y;
514
+ point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x,
515
+ point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y,
568
516
  point_a[2] = metric_type_(a[i * 3 + 2]) - centroid_a_z;
569
- point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x;
570
- point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y;
517
+ point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x,
518
+ point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y,
571
519
  point_b[2] = metric_type_(b[i * 3 + 2]) - centroid_b_z;
572
520
  rotated_point_a[0] = rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] +
573
521
  rotation_matrix[2] * point_a[2];
@@ -628,18 +576,12 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
628
576
  metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
629
577
 
630
578
  for (std::size_t i = 0; i < n; i++) {
631
- val_a_x = metric_type_(a[i * 3 + 0]);
632
- val_a_y = metric_type_(a[i * 3 + 1]);
579
+ val_a_x = metric_type_(a[i * 3 + 0]), val_a_y = metric_type_(a[i * 3 + 1]),
633
580
  val_a_z = metric_type_(a[i * 3 + 2]);
634
- val_b_x = metric_type_(b[i * 3 + 0]);
635
- val_b_y = metric_type_(b[i * 3 + 1]);
581
+ val_b_x = metric_type_(b[i * 3 + 0]), val_b_y = metric_type_(b[i * 3 + 1]),
636
582
  val_b_z = metric_type_(b[i * 3 + 2]);
637
- sum_a_x = sum_a_x + val_a_x;
638
- sum_a_y = sum_a_y + val_a_y;
639
- sum_a_z = sum_a_z + val_a_z;
640
- sum_b_x = sum_b_x + val_b_x;
641
- sum_b_y = sum_b_y + val_b_y;
642
- sum_b_z = sum_b_z + val_b_z;
583
+ sum_a_x = sum_a_x + val_a_x, sum_a_y = sum_a_y + val_a_y, sum_a_z = sum_a_z + val_a_z;
584
+ sum_b_x = sum_b_x + val_b_x, sum_b_y = sum_b_y + val_b_y, sum_b_z = sum_b_z + val_b_z;
643
585
  }
644
586
 
645
587
  metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
@@ -650,16 +592,13 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
650
592
  metric_type_ centroid_b_y = sum_b_y * inv_n;
651
593
  metric_type_ centroid_b_z = sum_b_z * inv_n;
652
594
 
653
- if (a_centroid) {
654
- a_centroid[0] = transform_type_(centroid_a_x);
655
- a_centroid[1] = transform_type_(centroid_a_y);
595
+ if (a_centroid)
596
+ a_centroid[0] = transform_type_(centroid_a_x), a_centroid[1] = transform_type_(centroid_a_y),
656
597
  a_centroid[2] = transform_type_(centroid_a_z);
657
- }
658
- if (b_centroid) {
659
- b_centroid[0] = transform_type_(centroid_b_x);
660
- b_centroid[1] = transform_type_(centroid_b_y);
598
+
599
+ if (b_centroid)
600
+ b_centroid[0] = transform_type_(centroid_b_x), b_centroid[1] = transform_type_(centroid_b_y),
661
601
  b_centroid[2] = transform_type_(centroid_b_z);
662
- }
663
602
 
664
603
  // Step 2: Build covariance matrix H and compute variance of A
665
604
  metric_type_ cross_covariance[9] = {};
@@ -733,11 +672,11 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
733
672
  metric_type_ sum_squared {};
734
673
  for (std::size_t i = 0; i < n; i++) {
735
674
  metric_type_ point_a[3], point_b[3], rotated_point_a[3];
736
- point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x;
737
- point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y;
675
+ point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x,
676
+ point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y,
738
677
  point_a[2] = metric_type_(a[i * 3 + 2]) - centroid_a_z;
739
- point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x;
740
- point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y;
678
+ point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x,
679
+ point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y,
741
680
  point_b[2] = metric_type_(b[i * 3 + 2]) - centroid_b_z;
742
681
  rotated_point_a[0] = scale_factor * (rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] +
743
682
  rotation_matrix[2] * point_a[2]);
@@ -3936,6 +3936,35 @@ NK_PUBLIC void nk_reduce_moments_f16_neon( //
3936
3936
  else nk_reduce_moments_f16_serial(data_ptr, count, stride_bytes, sum_ptr, sumsq_ptr);
3937
3937
  }
3938
3938
 
3939
+ NK_INTERNAL void nk_reduce_moments_u1_neon_contiguous_( //
3940
+ nk_u1x8_t const *data_ptr, nk_size_t count, //
3941
+ nk_u64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
3942
+ nk_size_t byte_count = nk_size_divide_round_up_(count, NK_BITS_PER_BYTE);
3943
+ nk_u64_t sum = 0;
3944
+ nk_size_t idx = 0;
3945
+ // Each vcntq_u8 produces values 0-8 per lane; accumulate at u8 level
3946
+ // for up to 31 iterations (31 × 8 = 248, fits in u8) before widening.
3947
+ while (idx + 16 <= byte_count) {
3948
+ uint8x16_t popcount_u8x16 = vdupq_n_u8(0);
3949
+ for (nk_size_t cycle = 0; cycle < 31 && idx + 16 <= byte_count; ++cycle, idx += 16) {
3950
+ uint8x16_t data_u8x16 = vld1q_u8((nk_u8_t const *)data_ptr + idx);
3951
+ popcount_u8x16 = vaddq_u8(popcount_u8x16, vcntq_u8(data_u8x16));
3952
+ }
3953
+ sum += (nk_u64_t)vaddlvq_u8(popcount_u8x16);
3954
+ }
3955
+ for (; idx < byte_count; ++idx) sum += nk_u1x8_popcount_(((nk_u8_t const *)data_ptr)[idx]);
3956
+ *sum_ptr = sum, *sumsq_ptr = sum;
3957
+ }
3958
+
3959
+ NK_PUBLIC void nk_reduce_moments_u1_neon( //
3960
+ nk_u1x8_t const *data_ptr, nk_size_t count, nk_size_t stride_bytes, //
3961
+ nk_u64_t *sum_ptr, nk_u64_t *sumsq_ptr) {
3962
+ count = nk_size_round_up_to_multiple_(count, 8);
3963
+ if (count == 0) *sum_ptr = 0, *sumsq_ptr = 0;
3964
+ else if (stride_bytes == 1) nk_reduce_moments_u1_neon_contiguous_(data_ptr, count, sum_ptr, sumsq_ptr);
3965
+ else nk_reduce_moments_u1_serial(data_ptr, count, stride_bytes, sum_ptr, sumsq_ptr);
3966
+ }
3967
+
3939
3968
  #if defined(__clang__)
3940
3969
  #pragma clang attribute pop
3941
3970
  #elif defined(__GNUC__)
@@ -33,7 +33,7 @@ NK_INTERNAL void nk_reduce_moments_bf16_neonbfdot_contiguous_( //
33
33
  nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
34
34
 
35
35
  // bf16 representation of 1.0 is 0x3F80 (same as upper 16 bits of f32 1.0)
36
- bfloat16x8_t ones_bf16x8 = vreinterpretq_bf16_u16(vdupq_n_u16(0x3F80));
36
+ bfloat16x8_t ones_bf16x8 = vreinterpretq_bf16_u16(nk_u16x8_splat_(0x3F80));
37
37
  float32x4_t sum_f32x4 = vdupq_n_f32(0);
38
38
  float32x4_t sumsq_f32x4 = vdupq_n_f32(0);
39
39
  nk_size_t idx = 0;
@@ -61,7 +61,7 @@ NK_INTERNAL void nk_reduce_moments_bf16_neonbfdot_strided_( //
61
61
  nk_bf16_t const *data_ptr, nk_size_t count, nk_size_t stride_elements, //
62
62
  nk_f32_t *sum_ptr, nk_f32_t *sumsq_ptr) {
63
63
 
64
- bfloat16x8_t ones_bf16x8 = vreinterpretq_bf16_u16(vdupq_n_u16(0x3F80));
64
+ bfloat16x8_t ones_bf16x8 = vreinterpretq_bf16_u16(nk_u16x8_splat_(0x3F80));
65
65
  float32x4_t sum_f32x4 = vdupq_n_f32(0);
66
66
  float32x4_t sumsq_f32x4 = vdupq_n_f32(0);
67
67
  nk_size_t idx = 0;
@@ -34,7 +34,7 @@ NK_INTERNAL void nk_reduce_moments_e4m3_neonfhm_contiguous_( //
34
34
 
35
35
  float32x4_t sum_f32x4 = vdupq_n_f32(0);
36
36
  float32x4_t sumsq_f32x4 = vdupq_n_f32(0);
37
- float16x8_t ones_f16x8 = vreinterpretq_f16_u16(vdupq_n_u16(0x3C00));
37
+ float16x8_t ones_f16x8 = vreinterpretq_f16_u16(nk_u16x8_splat_(0x3C00));
38
38
  nk_size_t idx = 0;
39
39
 
40
40
  for (; idx + 8 <= count; idx += 8) {
@@ -67,7 +67,7 @@ NK_INTERNAL void nk_reduce_moments_e4m3_neonfhm_strided_( //
67
67
 
68
68
  float32x4_t sum_f32x4 = vdupq_n_f32(0);
69
69
  float32x4_t sumsq_f32x4 = vdupq_n_f32(0);
70
- float16x8_t ones_f16x8 = vreinterpretq_f16_u16(vdupq_n_u16(0x3C00));
70
+ float16x8_t ones_f16x8 = vreinterpretq_f16_u16(nk_u16x8_splat_(0x3C00));
71
71
  nk_size_t idx = 0;
72
72
 
73
73
  if (stride_elements == 2) {
@@ -159,7 +159,7 @@ NK_INTERNAL void nk_reduce_moments_e5m2_neonfhm_contiguous_( //
159
159
 
160
160
  float32x4_t sum_f32x4 = vdupq_n_f32(0);
161
161
  float32x4_t sumsq_f32x4 = vdupq_n_f32(0);
162
- float16x8_t ones_f16x8 = vreinterpretq_f16_u16(vdupq_n_u16(0x3C00));
162
+ float16x8_t ones_f16x8 = vreinterpretq_f16_u16(nk_u16x8_splat_(0x3C00));
163
163
  nk_size_t idx = 0;
164
164
 
165
165
  for (; idx + 8 <= count; idx += 8) {
@@ -192,7 +192,7 @@ NK_INTERNAL void nk_reduce_moments_e5m2_neonfhm_strided_( //
192
192
 
193
193
  float32x4_t sum_f32x4 = vdupq_n_f32(0);
194
194
  float32x4_t sumsq_f32x4 = vdupq_n_f32(0);
195
- float16x8_t ones_f16x8 = vreinterpretq_f16_u16(vdupq_n_u16(0x3C00));
195
+ float16x8_t ones_f16x8 = vreinterpretq_f16_u16(nk_u16x8_splat_(0x3C00));
196
196
  nk_size_t idx = 0;
197
197
 
198
198
  if (stride_elements == 2) {
@@ -0,0 +1,52 @@
1
+ /**
2
+ * @brief SVE horizontal reduction helpers with MSan unpoisoning.
3
+ * @file include/numkong/reduce/sve.h
4
+ * @author Ash Vardanian
5
+ * @date April 12, 2026
6
+ *
7
+ * LLVM's MSan does not instrument ARM SVE intrinsics — `svaddv` moves data
8
+ * from vector to scalar registers via architecture-specific paths invisible
9
+ * to the compiler, causing false-positive uninitialized-value reports.
10
+ * These macros wrap the reduction and unpoison the scalar result.
11
+ *
12
+ * The `svaddv` intrinsic stays inside a macro so it expands in the caller's
13
+ * target context — SVE and SME streaming translation units carry incompatible
14
+ * target attributes. The unpoisoning runs on the already-reduced scalar, so it
15
+ * lives in a target-agnostic `NK_INTERNAL` helper called from the macro.
16
+ *
17
+ * @sa include/numkong/reduce.h
18
+ */
19
+ #ifndef NK_REDUCE_SVE_H
20
+ #define NK_REDUCE_SVE_H
21
+
22
+ #if NK_TARGET_ARM64_
23
+ #if NK_TARGET_SVE || NK_TARGET_SVE2 || NK_TARGET_SME
24
+
25
+ #include "numkong/types.h"
26
+
27
+ NK_INTERNAL nk_f64_t nk_unpoison_f64_(nk_f64_t v) NK_STREAMING_COMPATIBLE_ {
28
+ nk_unpoison_(&v, sizeof(v));
29
+ return v;
30
+ }
31
+ NK_INTERNAL nk_f32_t nk_unpoison_f32_(nk_f32_t v) NK_STREAMING_COMPATIBLE_ {
32
+ nk_unpoison_(&v, sizeof(v));
33
+ return v;
34
+ }
35
+ NK_INTERNAL nk_u64_t nk_unpoison_u64_(nk_u64_t v) NK_STREAMING_COMPATIBLE_ {
36
+ nk_unpoison_(&v, sizeof(v));
37
+ return v;
38
+ }
39
+ NK_INTERNAL nk_i64_t nk_unpoison_i64_(nk_i64_t v) NK_STREAMING_COMPATIBLE_ {
40
+ nk_unpoison_(&v, sizeof(v));
41
+ return v;
42
+ }
43
+
44
+ #define nk_svaddv_f64_(predicate, vector) nk_unpoison_f64_(svaddv_f64((predicate), (vector)))
45
+ #define nk_svaddv_f32_(predicate, vector) nk_unpoison_f32_(svaddv_f32((predicate), (vector)))
46
+ #define nk_svaddv_u32_(predicate, vector) nk_unpoison_u64_(svaddv_u32((predicate), (vector)))
47
+ #define nk_svaddv_s32_(predicate, vector) nk_unpoison_i64_(svaddv_s32((predicate), (vector)))
48
+ #define nk_svaddv_u8_(predicate, vector) nk_unpoison_u64_(svaddv_u8((predicate), (vector)))
49
+
50
+ #endif // NK_TARGET_SVE || NK_TARGET_SVE2 || NK_TARGET_SME
51
+ #endif // NK_TARGET_ARM64_
52
+ #endif // NK_REDUCE_SVE_H
@@ -389,6 +389,8 @@ NK_PUBLIC void nk_reduce_moments_i16_neon(nk_i16_t const *, nk_size_t, nk_size_t
389
389
  /** @copydoc nk_reduce_moments_f64 */
390
390
  NK_PUBLIC void nk_reduce_moments_u16_neon(nk_u16_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
391
391
  /** @copydoc nk_reduce_moments_f64 */
392
+ NK_PUBLIC void nk_reduce_moments_u1_neon(nk_u1x8_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
393
+ /** @copydoc nk_reduce_moments_f64 */
392
394
  NK_PUBLIC void nk_reduce_moments_i32_neon(nk_i32_t const *, nk_size_t, nk_size_t, nk_i64_t *, nk_u64_t *);
393
395
  /** @copydoc nk_reduce_moments_f64 */
394
396
  NK_PUBLIC void nk_reduce_moments_u32_neon(nk_u32_t const *, nk_size_t, nk_size_t, nk_u64_t *, nk_u64_t *);
@@ -1559,6 +1561,8 @@ NK_PUBLIC void nk_reduce_moments_u1(nk_u1x8_t const *d, nk_size_t n, nk_size_t s
1559
1561
  nk_reduce_moments_u1_skylake(d, n, s, sum, sumsq);
1560
1562
  #elif NK_TARGET_HASWELL
1561
1563
  nk_reduce_moments_u1_haswell(d, n, s, sum, sumsq);
1564
+ #elif NK_TARGET_NEON
1565
+ nk_reduce_moments_u1_neon(d, n, s, sum, sumsq);
1562
1566
  #else
1563
1567
  nk_reduce_moments_u1_serial(d, n, s, sum, sumsq);
1564
1568
  #endif
@@ -32,8 +32,9 @@
32
32
  #if NK_TARGET_ARM64_
33
33
  #if NK_TARGET_SVE
34
34
 
35
- #include "numkong/types.h" // `nk_u1x8_t`
36
- #include "numkong/set/neon.h" // `nk_hamming_u1_neon`
35
+ #include "numkong/types.h" // `nk_u1x8_t`
36
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
37
+ #include "numkong/set/neon.h" // `nk_hamming_u1_neon`
37
38
 
38
39
  #if defined(__cplusplus)
39
40
  extern "C" {
@@ -73,7 +74,7 @@ NK_PUBLIC void nk_hamming_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size
73
74
  i += words_per_register;
74
75
  ++cycle;
75
76
  } while (i < n_bytes && cycle < 31);
76
- differences += svaddv_u8(all_predicate_b8x, popcount_u8x);
77
+ differences += nk_svaddv_u8_(all_predicate_b8x, popcount_u8x);
77
78
  popcount_u8x = svdup_n_u8(0);
78
79
  cycle = 0; // Reset the cycle counter.
79
80
  }
@@ -110,9 +111,9 @@ NK_PUBLIC void nk_jaccard_u1_sve(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size
110
111
  i += words_per_register;
111
112
  ++cycle;
112
113
  } while (i < n_bytes && cycle < 31);
113
- intersection_count += svaddv_u8(all_predicate_b8x, intersection_popcount_u8x);
114
+ intersection_count += nk_svaddv_u8_(all_predicate_b8x, intersection_popcount_u8x);
114
115
  intersection_popcount_u8x = svdup_n_u8(0);
115
- union_count += svaddv_u8(all_predicate_b8x, union_popcount_u8x);
116
+ union_count += nk_svaddv_u8_(all_predicate_b8x, union_popcount_u8x);
116
117
  union_popcount_u8x = svdup_n_u8(0);
117
118
  cycle = 0; // Reset the cycle counter.
118
119
  }