numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294):
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,1099 @@
1
+ /**
2
+ * @brief SIMD-accelerated Batched Set Distances for SME.
3
+ * @file include/numkong/sets/smebi32.h
4
+ * @author Ash Vardanian
5
+ * @date February 6, 2026
6
+ * @sa include/numkong/sets.h
7
+ *
8
+ * Uses ARM Scalable Matrix Extension (SME) for efficient binary set operations.
9
+ * Leverages streaming mode's wider vectors (512-bit on Apple M4) for fast
10
+ * XOR+POPCNT operations on binary vectors.
11
+ *
12
+ * @section smebi32_math Mathematical Foundation
13
+ *
14
+ * Hamming distance: popcount(a XOR b) = number of differing bits
15
+ *
16
+ * Jaccard distance using intersection:
17
+ * intersection = popcount(a AND b)
18
+ * union = popcount(a) + popcount(b) - intersection
19
+ * jaccard = 1 - intersection / union
20
+ *
21
+ * @section smebi32_tiles SME Dimensions (512-bit SVL)
22
+ *
23
+ * - svcntw(): 16 (number of 32-bit elements per vector)
24
+ * - svcntb(): 64 (number of bytes per SVE vector)
25
+ * - Tile blocking: 16x16 output tiles for cache efficiency
26
+ * - Depth processing: 64 bytes (512 bits) per iteration
27
+ *
28
+ * @section smebi32_perf Performance Characteristics (Apple M4)
29
+ *
30
+ * - SVL: 512 bits (64 bytes)
31
+ * - Streaming mode provides dedicated register file
32
+ * - Streaming mode overhead: ~50-100 cycles for SMSTART/SMSTOP
33
+ */
34
+
35
+ #ifndef NK_SETS_SMEBI32_H
36
+ #define NK_SETS_SMEBI32_H
37
+
38
+ #if NK_TARGET_ARM_
39
+ #if NK_TARGET_SMEBI32
40
+
41
+ #include "numkong/types.h"
42
+ #include "numkong/set/serial.h"
43
+ #include "numkong/sets/serial.h"
44
+ #include "numkong/dots/sme.h" // `nk_sme_zero_za32_*` constants
45
+ #include "numkong/reduce.h" // `nk_reduce_moments_u1`
46
+
47
+ #if defined(__cplusplus)
48
+ extern "C" {
49
+ #endif
50
+
51
+ /*
52
+ * Binary set operations using SME BMOPA instruction.
53
+ *
54
+ * BMOPA computes: ZA[i,j] += popcount(~(Zn[i] ^ Zm[j])) = popcount(XNOR)
55
+ * This counts matching bits. Hamming = depth_bits - matching.
56
+ *
57
+ * Tile layout (SVL=512, Apple M4):
58
+ * - ZA32 output tile: 16 × 16 u32 elements (1 KB)
59
+ * - Input vectors: 16 u32 elements (SVL/32)
60
+ * - Each BMOPA processes 32 bits (one u32) across 16×16 pairs
61
+ * - BMOPA predicates: b32 (u32 input granularity)
62
+ * - Packed kernel: 4-tile path (ZA0-ZA3) for 4 B-column tiles simultaneously
63
+ * - Unpacked kernel: ZA transpose (ZA0.S=staging, ZA1-3.S=accumulation, 3-tile fast path)
64
+ * - Packed format: column-major u32 within each tile
65
+ */
66
+
67
+ #if defined(__clang__)
68
+ #pragma clang attribute push(__attribute__((target("sme2,sve2"))), apply_to = function)
69
+ #elif defined(__GNUC__)
70
+ #pragma GCC push_options
71
+ #pragma GCC target("+sme2")
72
+ #endif
73
+
74
+ /* Read SVL in bytes from non-streaming context using RDSVL instruction. */
75
+ NK_INTERNAL nk_size_t nk_smebi32_svl_bytes_(void) {
76
+ nk_size_t svl_bytes;
77
+ __asm__ volatile("rdsvl %0, #1" : "=r"(svl_bytes));
78
+ return svl_bytes;
79
+ }
80
+
81
+ /* Get ZA32 tile dimension (number of f32/u32 elements per row). */
82
+ NK_INTERNAL nk_size_t nk_smebi32_tile_dim_(void) { return nk_smebi32_svl_bytes_() / sizeof(nk_u32_t); }
83
+
84
/**
 * Fixed 64-byte header written at offset 0 of the buffer produced by
 * `nk_dots_pack_u1_smebi32`. Tile payload follows the header immediately;
 * per-row population counts live at byte offset `norms_offset`.
 * Layout must remain stable: pack and compute kernels read it directly.
 */
typedef struct {
    nk_u32_t row_tile_count;   // ceiling(rows / tile_dim)
    nk_u32_t depth_tile_count; // ceiling(depth_bits / depth_tile_bits)
    nk_u32_t rows;             // actual row count
    nk_u32_t depth_bits;       // actual depth in bits
    nk_u32_t svl_bytes;        // SVL at pack time for validation
    nk_u32_t norms_offset;     // byte offset to norms (0 if none)
    nk_u32_t reserved[10];     // padding to 64 bytes
} nk_sets_smebi32_packed_header_t;
93
+
94
+ /** Count total set bits across a byte vector using streaming SVE.
95
+ * Accumulates per-byte popcounts into u32 lanes via svdot; single horizontal reduction at end. */
96
+ NK_PUBLIC nk_u32_t nk_sets_reduce_sumsq_u1_streaming_(nk_u1x8_t const *data,
97
+ nk_size_t n_bytes) NK_STREAMING_COMPATIBLE_ {
98
+ svuint32_t acc_u32x = svdup_u32(0);
99
+ svuint8_t const ones_u8x = svdup_u8(1);
100
+ for (nk_size_t offset = 0; offset < n_bytes; offset += svcntb()) {
101
+ svbool_t predicate_u8x = svwhilelt_b8_u64(offset, n_bytes);
102
+ acc_u32x = svdot_u32(acc_u32x, svcnt_u8_z(predicate_u8x, svld1_u8(predicate_u8x, data + offset)), ones_u8x);
103
+ }
104
+ return (nk_u32_t)svaddv_u32(svptrue_b32(), acc_u32x);
105
+ }
106
+
107
+ #pragma region Hamming Distance
108
+
109
+ NK_PUBLIC nk_size_t nk_dots_packed_size_u1_smebi32(nk_size_t row_count, nk_size_t depth_bits) {
110
+ nk_size_t const tile_dim = nk_smebi32_tile_dim_(); // 16 rows per tile
111
+ nk_size_t const depth_tile_size = nk_smebi32_tile_dim_(); // 16 u32 per depth tile = 512 bits
112
+
113
+ nk_size_t const depth_u32 = nk_size_divide_round_up_(depth_bits, 32);
114
+ nk_size_t const row_tile_count = nk_size_divide_round_up_(row_count, tile_dim);
115
+ nk_size_t const depth_tile_count = nk_size_divide_round_up_(depth_u32, depth_tile_size);
116
+
117
+ nk_size_t const tile_elements = tile_dim * depth_tile_size; // 256 u32 per tile
118
+ nk_size_t size = sizeof(nk_sets_smebi32_packed_header_t);
119
+ size += row_tile_count * depth_tile_count * tile_elements * sizeof(nk_u32_t);
120
+ size += row_count * sizeof(nk_u32_t); // per-row population counts
121
+
122
+ return size;
123
+ }
124
+
125
/**
 * Pack binary matrix `b` (`row_count` rows of `depth_bits` bits) into the tiled
 * layout consumed by the SME Hamming kernels: a 64-byte header, then
 * column-major u32 tiles of `tile_dim` x `tile_dim` words (zero-padded), then
 * one u32 population count per row.
 *
 * @param b                  Source bit matrix, one row per `b_stride_in_bytes`.
 * @param row_count          Number of rows in `b`.
 * @param depth_bits         Bits per row.
 * @param b_stride_in_bytes  Byte stride between consecutive rows of `b`.
 * @param b_packed           Output buffer, at least
 *                           `nk_dots_packed_size_u1_smebi32(row_count, depth_bits)` bytes.
 */
NK_PUBLIC void nk_dots_pack_u1_smebi32(nk_u1x8_t const *b, nk_size_t row_count, nk_size_t depth_bits,
                                       nk_size_t b_stride_in_bytes, void *b_packed) {
    nk_size_t const svl_bytes = nk_smebi32_svl_bytes_();
    nk_size_t const tile_dim = nk_smebi32_tile_dim_();        // 16 rows per tile
    nk_size_t const depth_tile_size = nk_smebi32_tile_dim_(); // 16 u32 per depth tile
    nk_size_t const tile_elements = tile_dim * depth_tile_size;
    nk_size_t const depth_in_bytes = nk_size_divide_round_up_(depth_bits, NK_BITS_PER_BYTE);

    nk_size_t const depth_u32_total = nk_size_divide_round_up_(depth_bits, 32);
    nk_size_t const row_tile_count = nk_size_divide_round_up_(row_count, tile_dim);
    nk_size_t const depth_tile_count = nk_size_divide_round_up_(depth_u32_total, depth_tile_size);
    nk_size_t const total_tiles = row_tile_count * depth_tile_count;
    nk_size_t const data_size = total_tiles * tile_elements * sizeof(nk_u32_t);

    // Header records geometry plus the SVL it was packed at, so the compute
    // kernels can validate that pack-time and run-time vector lengths match.
    nk_sets_smebi32_packed_header_t *header = (nk_sets_smebi32_packed_header_t *)b_packed;
    header->row_tile_count = (nk_u32_t)row_tile_count;
    header->depth_tile_count = (nk_u32_t)depth_tile_count;
    header->rows = (nk_u32_t)row_count;
    header->depth_bits = (nk_u32_t)depth_bits;
    header->svl_bytes = (nk_u32_t)svl_bytes;
    header->norms_offset = (nk_u32_t)(sizeof(nk_sets_smebi32_packed_header_t) + data_size);

    nk_u32_t *tiles_ptr = (nk_u32_t *)((char *)b_packed + sizeof(nk_sets_smebi32_packed_header_t));
    nk_u32_t *norms_ptr = (nk_u32_t *)((char *)b_packed + header->norms_offset);

    // Zero-initialize all tiles (partial tiles stay zero-padded for predicated loads)
    for (nk_size_t i = 0; i < total_tiles * tile_elements; i++) tiles_ptr[i] = 0;

    // Pack tiles: column-major u32 within each tile for efficient SVE loads
    for (nk_size_t row_tile = 0; row_tile < row_tile_count; row_tile++) {
        for (nk_size_t depth_tile = 0; depth_tile < depth_tile_count; depth_tile++) {
            nk_size_t const tile_index = row_tile * depth_tile_count + depth_tile;
            nk_u32_t *tile_output = tiles_ptr + tile_index * tile_elements;

            nk_size_t const src_row_start = row_tile * tile_dim;
            nk_size_t const src_u32_start = depth_tile * depth_tile_size;
            // Clamp both dimensions for the edge tiles of the matrix.
            nk_size_t const rows_to_pack = (src_row_start + tile_dim <= row_count) ? tile_dim
                                                                                   : (row_count - src_row_start);
            nk_size_t const u32s_to_pack = (src_u32_start + depth_tile_size <= depth_u32_total)
                                               ? depth_tile_size
                                               : (depth_u32_total > src_u32_start ? depth_u32_total - src_u32_start
                                                                                  : 0);

            // Column-major packing: tile_output[col * tile_dim + row]
            for (nk_size_t row = 0; row < rows_to_pack; row++) {
                nk_u32_t const *src_row = (nk_u32_t const *)((char const *)b +
                                                             (src_row_start + row) * b_stride_in_bytes);
                // NOTE(review): rows are read as whole u32 words. When depth_bits is not a
                // multiple of 32, the last word read extends past ceil(depth_bits/8) bytes of
                // the row — safe only if rows are padded (e.g. stride rounded up). Confirm
                // the caller contract guarantees this.
                for (nk_size_t col = 0; col < u32s_to_pack; col++) {
                    nk_size_t const dst_idx = col * tile_dim + row; // Column-major!
                    tile_output[dst_idx] = src_row[src_u32_start + col];
                }
            }
        }
    }

    // Compute per-row population counts
    for (nk_size_t row = 0; row < row_count; row++) {
        nk_u1x8_t const *src_row = (nk_u1x8_t const *)((char const *)b + row * b_stride_in_bytes);
        {
            nk_u64_t nk_local_sum_, nk_local_sumsq_;
            // NOTE(review): this counts over depth_in_bytes * 8 bits, which exceeds
            // depth_bits when depth_bits % 8 != 0 — assumes the trailing pad bits of the
            // last byte are zero. Verify against the u1 layout contract.
            nk_reduce_moments_u1(src_row, depth_in_bytes * 8, sizeof(nk_u1x8_t), &nk_local_sum_, &nk_local_sumsq_);
            norms_ptr[row] = (nk_u32_t)nk_local_sum_; // only the sum (popcount) is kept
        }
    }
}
190
+
191
/**
 * SME Hamming kernel using ZA transpose for unpacked A.
 * ZA0.S = staging (A rows loaded horizontally, read vertically for BMOPA).
 * ZA1-3.S = BMOPA accumulation (3 B column tiles in fast path).
 *
 * Each ZA0.S batch covers 16 depth u32 steps (one full depth tile).
 * BMOPA expansion=1 for u32: each u32 contributes 32 bits via XNOR+POPCNT,
 * so ZA accumulates the count of MATCHING bits; Hamming distance is recovered
 * as depth_bits - matching on extraction.
 *
 * @param a                   Unpacked A bit matrix (row-major, strided).
 * @param b_packed            B matrix pre-packed by `nk_dots_pack_u1_smebi32`.
 * @param c                   Output distances, row i of A x all rows of B.
 * @param row_count_a         Rows in A.
 * @param row_count_b         Rows in B (must match the packed header's `rows`).
 * @param depth_bits          Bits per row.
 * @param a_stride_in_bytes   Byte stride between A rows.
 * @param c_stride_in_bytes   Byte stride between C rows.
 */
__arm_locally_streaming __arm_new("za") static void nk_hammings_packed_u1_smebi32_streaming_(
    nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t row_count_a, nk_size_t row_count_b,
    nk_size_t depth_bits, nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {

    nk_sets_smebi32_packed_header_t const *header = (nk_sets_smebi32_packed_header_t const *)b_packed;
    nk_size_t const row_tile_count_b = header->row_tile_count;
    nk_size_t const depth_tile_count = header->depth_tile_count;

    nk_size_t const tile_dim = svcntw();        // 16 for 512-bit SVL
    nk_size_t const depth_tile_size = svcntw(); // 16 u32 per depth tile
    nk_size_t const tile_elements = tile_dim * depth_tile_size;
    nk_size_t const depth_u32_total = nk_size_divide_round_up_(depth_bits, 32);

    nk_u32_t const *b_tiles = (nk_u32_t const *)((char const *)b_packed + sizeof(nk_sets_smebi32_packed_header_t));

    svbool_t const predicate_all_u32x = svptrue_b32();
    svuint32_t const depth_u32x = svdup_u32((nk_u32_t)depth_bits);
    nk_size_t const row_tile_count_a = nk_size_divide_round_up_(row_count_a, tile_dim);

    for (nk_size_t row_tile_a = 0; row_tile_a < row_tile_count_a; row_tile_a++) {
        nk_size_t const row_start_a = row_tile_a * tile_dim;
        nk_size_t const rows_a_remaining = (row_start_a + tile_dim <= row_count_a) ? tile_dim
                                                                                   : (row_count_a - row_start_a);
        svbool_t const row_predicate_u32x = svwhilelt_b32_u64(0u, rows_a_remaining);

        // Fast path: 3 B column tiles using ZA1-ZA3 (ZA0.S = staging)
        nk_size_t row_tile_b = 0;
        for (; row_tile_b + 3 <= row_tile_count_b; row_tile_b += 3) {
            svzero_mask_za(nk_sme_zero_za32_tiles_123_); // reset the three accumulators

            for (nk_size_t d_tile = 0; d_tile < depth_tile_count; d_tile++) {
                nk_size_t const d_start_u32 = d_tile * depth_tile_size;
                // Clamp the final (possibly partial) depth tile to the real word count.
                nk_size_t const u32s_this_tile = (d_start_u32 + depth_tile_size <= depth_u32_total)
                                                     ? depth_tile_size
                                                     : (depth_u32_total > d_start_u32 ? depth_u32_total - d_start_u32
                                                                                      : 0);
                if (u32s_this_tile == 0) break;

                // ZA0 is staging; clear it so unwritten rows/steps read as zero.
                svzero_mask_za(nk_sme_zero_za32_tile_0_);

                svbool_t const batch_predicate_u32x = svwhilelt_b32_u64(0u, u32s_this_tile);

                // Load A rows into ZA0.S horizontally as u32 words
                for (nk_size_t row_in_tile = 0; row_in_tile < rows_a_remaining; row_in_tile++) {
                    nk_u32_t const *a_row_u32 = (nk_u32_t const *)((char const *)a +
                                                                   (row_start_a + row_in_tile) * a_stride_in_bytes) +
                                                d_start_u32;
                    svld1_hor_za32(0, row_in_tile, batch_predicate_u32x, a_row_u32);
                }

                // B tile pointers for 3 column tiles
                nk_u32_t const *b_tile0 = b_tiles + ((row_tile_b + 0) * depth_tile_count + d_tile) * tile_elements;
                nk_u32_t const *b_tile1 = b_tiles + ((row_tile_b + 1) * depth_tile_count + d_tile) * tile_elements;
                nk_u32_t const *b_tile2 = b_tiles + ((row_tile_b + 2) * depth_tile_count + d_tile) * tile_elements;

                // Vertical read + BMOPA for each depth step: reading ZA0 column `step`
                // yields one u32 word from each of the 16 A rows (the transpose trick).
                for (nk_size_t step = 0; step < u32s_this_tile; step++) {
                    svuint32_t a_column_u32x = svread_ver_za32_u32_m(svdup_u32(0), row_predicate_u32x, 0, step);

                    svbmopa_za32_u32_m(1, row_predicate_u32x, predicate_all_u32x, a_column_u32x,
                                       svld1_u32(predicate_all_u32x, b_tile0 + step * tile_dim));
                    svbmopa_za32_u32_m(2, row_predicate_u32x, predicate_all_u32x, a_column_u32x,
                                       svld1_u32(predicate_all_u32x, b_tile1 + step * tile_dim));
                    svbmopa_za32_u32_m(3, row_predicate_u32x, predicate_all_u32x, a_column_u32x,
                                       svld1_u32(predicate_all_u32x, b_tile2 + step * tile_dim));
                }
            }

            // Extract from ZA1-3: Hamming = depth_bits - matching_bits
            // NOTE(review): these stores write a full tile_dim of columns per tile with an
            // all-true predicate, even if the last of the three B tiles is partial — this
            // assumes C rows have capacity for row_tile_count_b * tile_dim entries
            // (zero-padded B tiles make the extra values deterministic). Confirm the
            // caller allocates C accordingly; the remainder path below does predicate.
            for (nk_size_t row = 0; row < rows_a_remaining; row++) {
                nk_u32_t *c_row = (nk_u32_t *)((char *)c + (row_start_a + row) * c_stride_in_bytes);

                svuint32_t za1_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_u32x, 1, row);
                svuint32_t za2_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_u32x, 2, row);
                svuint32_t za3_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_u32x, 3, row);

                svst1_u32(predicate_all_u32x, c_row + (row_tile_b + 0) * tile_dim,
                          svsub_u32_x(predicate_all_u32x, depth_u32x, za1_u32x));
                svst1_u32(predicate_all_u32x, c_row + (row_tile_b + 1) * tile_dim,
                          svsub_u32_x(predicate_all_u32x, depth_u32x, za2_u32x));
                svst1_u32(predicate_all_u32x, c_row + (row_tile_b + 2) * tile_dim,
                          svsub_u32_x(predicate_all_u32x, depth_u32x, za3_u32x));
            }
        }

        // Remainder: 1 B column tile at a time using ZA1
        for (; row_tile_b < row_tile_count_b; row_tile_b++) {
            nk_size_t const row_start_b = row_tile_b * tile_dim;
            nk_size_t const rows_b_remaining = (row_start_b + tile_dim <= row_count_b) ? tile_dim
                                                                                       : (row_count_b - row_start_b);
            svbool_t const column_predicate_u32x = svwhilelt_b32_u64(0u, rows_b_remaining);

            svzero_mask_za(nk_sme_zero_za32_tile_1_);

            for (nk_size_t d_tile = 0; d_tile < depth_tile_count; d_tile++) {
                nk_size_t const d_start_u32 = d_tile * depth_tile_size;
                nk_size_t const u32s_this_tile = (d_start_u32 + depth_tile_size <= depth_u32_total)
                                                     ? depth_tile_size
                                                     : (depth_u32_total > d_start_u32 ? depth_u32_total - d_start_u32
                                                                                      : 0);
                if (u32s_this_tile == 0) break;

                svzero_mask_za(nk_sme_zero_za32_tile_0_);

                svbool_t const batch_predicate_u32x = svwhilelt_b32_u64(0u, u32s_this_tile);

                // Load A rows into ZA0.S horizontally
                for (nk_size_t row_in_tile = 0; row_in_tile < rows_a_remaining; row_in_tile++) {
                    nk_u32_t const *a_row_u32 = (nk_u32_t const *)((char const *)a +
                                                                   (row_start_a + row_in_tile) * a_stride_in_bytes) +
                                                d_start_u32;
                    svld1_hor_za32(0, row_in_tile, batch_predicate_u32x, a_row_u32);
                }

                nk_u32_t const *b_tile = b_tiles + (row_tile_b * depth_tile_count + d_tile) * tile_elements;

                // Vertical read + BMOPA
                for (nk_size_t step = 0; step < u32s_this_tile; step++) {
                    svuint32_t a_column_u32x = svread_ver_za32_u32_m(svdup_u32(0), row_predicate_u32x, 0, step);
                    svuint32_t b_u32x = svld1_u32(predicate_all_u32x, b_tile + step * tile_dim);
                    svbmopa_za32_u32_m(1, row_predicate_u32x, column_predicate_u32x, a_column_u32x, b_u32x);
                }
            }

            // Extract from ZA1: Hamming = depth_bits - matching_bits
            // (stores predicated to rows_b_remaining, so partial tiles stay in bounds)
            for (nk_size_t row = 0; row < rows_a_remaining; row++) {
                svuint32_t za1_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_u32x, 1, row);
                svuint32_t hamming_u32x = svsub_u32_x(predicate_all_u32x, depth_u32x, za1_u32x);
                nk_u32_t *c_row = (nk_u32_t *)((char *)c + (row_start_a + row) * c_stride_in_bytes);
                svst1_u32(column_predicate_u32x, c_row + row_start_b, hamming_u32x);
            }
        }
    }
}
333
+
334
/** Public entry point for batched Hamming distances against pre-packed B.
 *  Thin forwarder: the callee carries __arm_locally_streaming/__arm_new("za"),
 *  so streaming-mode entry/exit and ZA state management happen there. */
NK_PUBLIC void nk_hammings_packed_u1_smebi32(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c,
                                             nk_size_t row_count_a, nk_size_t row_count_b, nk_size_t depth_bits,
                                             nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
    nk_hammings_packed_u1_smebi32_streaming_(a, b_packed, c, row_count_a, row_count_b, depth_bits, a_stride_in_bytes,
                                             c_stride_in_bytes);
}
340
+
341
/**
 * Symmetric Hamming using ZA0 time-sharing + 3-tile fast path.
 * ZA0.S = staging (A rows loaded horizontally, read vertically for BMOPA).
 * ZA1-3.S = BMOPA accumulators (3 B column tiles in fast path).
 * Mirrors the unpacked kernel nk_hammings_packed_u1_smebi32_streaming_ pattern.
 *
 * Computes full result rows (column tiles start at 0, not at the diagonal) for
 * the row slice [row_start, row_start + row_count): result[i][j] = Hamming(v_i, v_j)
 * derived as depth_bits - popcount(XNOR(v_i, v_j)), where the XNOR popcount comes
 * from BMOPA accumulation.
 *
 * @param vectors       Row-major bit-packed u1 vectors (n_vectors rows).
 * @param depth_bits    Logical vector length in bits.
 * @param stride        Byte stride between consecutive input rows.
 * @param result        Output u32 distance matrix.
 * @param result_stride Byte stride between consecutive output rows.
 * @param row_start     First row of the slice this call fills (enables sharding).
 * @param row_count     Number of rows in the slice.
 */
__arm_locally_streaming __arm_new("za") static void nk_hammings_symmetric_u1_smebi32_streaming_(
    nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth_bits, nk_size_t stride, nk_u32_t *result,
    nk_size_t result_stride, nk_size_t row_start, nk_size_t row_count) {

    nk_size_t const tile_dim = svcntw();        // 16 for 512-bit SVL
    nk_size_t const depth_tile_size = svcntw(); // 16 u32 per depth tile
    nk_size_t const depth_u32_total = nk_size_divide_round_up_(depth_bits, 32);
    nk_size_t const depth_tile_count = nk_size_divide_round_up_(depth_u32_total, depth_tile_size);

    svbool_t const predicate_all_u32x = svptrue_b32();
    svuint32_t const depth_u32x = svdup_u32((nk_u32_t)depth_bits);

    // Stack buffer for A column save.
    // NOTE(review): sized for a 512-bit SVL (16x16 u32); assumes svcntw() <= 16 at
    // runtime — TODO confirm larger streaming vector lengths cannot reach this kernel.
    NK_ALIGN64 nk_u32_t a_buffer[16][16];

    nk_size_t const row_end = row_start + row_count;
    nk_size_t const column_tile_count = nk_size_divide_round_up_(n_vectors, tile_dim);

    for (nk_size_t row_tile_start = row_start; row_tile_start < row_end && row_tile_start < n_vectors;
         row_tile_start += tile_dim) {
        // Rows left in the requested slice, then clamped to the matrix height.
        nk_size_t const rows_remaining = (row_tile_start + tile_dim <= row_end) ? tile_dim : (row_end - row_tile_start);
        nk_size_t const rows_clamped = (row_tile_start + rows_remaining <= n_vectors) ? rows_remaining
                                                                                      : (n_vectors - row_tile_start);
        svbool_t const row_predicate_u32x = svwhilelt_b32_u64(0u, rows_clamped);

        nk_size_t column_tile_index = 0;

        // Fast path: 3 column tiles using ZA1-ZA3 (ZA0 = staging)
        // NOTE(review): the extraction below stores a full vector of tile_dim lanes per
        // column tile; if n_vectors is not a multiple of tile_dim and the partial last
        // tile lands inside this fast path, the store runs past the valid columns —
        // presumably the result rows are padded to tile_dim; verify against callers.
        for (; column_tile_index + 3 <= column_tile_count; column_tile_index += 3) {
            svzero_mask_za(nk_sme_zero_za32_tiles_123_);

            for (nk_size_t d_tile = 0; d_tile < depth_tile_count; d_tile++) {
                nk_size_t const d_start_u32 = d_tile * depth_tile_size;
                nk_size_t const u32s_this_tile = (d_start_u32 + depth_tile_size <= depth_u32_total)
                                                     ? depth_tile_size
                                                     : (depth_u32_total > d_start_u32 ? depth_u32_total - d_start_u32
                                                                                      : 0);
                if (u32s_this_tile == 0) break;

                // Load A rows into ZA0 horizontally (zero first so stale tile
                // contents from the previous depth step cannot leak into columns).
                svzero_mask_za(nk_sme_zero_za32_tile_0_);
                svbool_t const batch_predicate_u32x = svwhilelt_b32_u64(0u, u32s_this_tile);

                for (nk_size_t row_in_tile = 0; row_in_tile < rows_clamped; row_in_tile++) {
                    nk_u32_t const *a_row_u32 = (nk_u32_t const *)((char const *)vectors +
                                                                   (row_tile_start + row_in_tile) * stride) +
                                                d_start_u32;
                    svld1_hor_za32(0, row_in_tile, batch_predicate_u32x, a_row_u32);
                }

                // Save A columns from ZA0 to stack buffer — ZA0 is about to be
                // reused as staging for the three B tiles, so the transposed A
                // columns must survive outside ZA.
                for (nk_size_t s = 0; s < u32s_this_tile; s++)
                    svst1_u32(predicate_all_u32x, a_buffer[s],
                              svread_ver_za32_u32_m(svdup_u32(0), row_predicate_u32x, 0, s));

                // B column tile 0: stage rows horizontally in ZA0, read vertically
                // (a free transpose), and accumulate XNOR popcounts into ZA1.
                svzero_mask_za(nk_sme_zero_za32_tile_0_);
                for (nk_size_t col = 0; col < tile_dim; col++) {
                    nk_size_t const col_abs = (column_tile_index + 0) * tile_dim + col;
                    if (col_abs < n_vectors) {
                        nk_u32_t const *b_row = (nk_u32_t const *)((char const *)vectors + col_abs * stride) +
                                                d_start_u32;
                        svld1_hor_za32(0, col, batch_predicate_u32x, b_row);
                    }
                }
                for (nk_size_t step = 0; step < u32s_this_tile; step++) {
                    svuint32_t a_u32x = svld1_u32(predicate_all_u32x, a_buffer[step]);
                    svuint32_t b_u32x = svread_ver_za32_u32_m(svdup_u32(0), predicate_all_u32x, 0, step);
                    svbmopa_za32_u32_m(1, row_predicate_u32x, predicate_all_u32x, a_u32x, b_u32x);
                }

                // B column tile 1 — same staging dance, accumulating into ZA2.
                svzero_mask_za(nk_sme_zero_za32_tile_0_);
                for (nk_size_t col = 0; col < tile_dim; col++) {
                    nk_size_t const col_abs = (column_tile_index + 1) * tile_dim + col;
                    if (col_abs < n_vectors) {
                        nk_u32_t const *b_row = (nk_u32_t const *)((char const *)vectors + col_abs * stride) +
                                                d_start_u32;
                        svld1_hor_za32(0, col, batch_predicate_u32x, b_row);
                    }
                }
                for (nk_size_t step = 0; step < u32s_this_tile; step++) {
                    svuint32_t a_u32x = svld1_u32(predicate_all_u32x, a_buffer[step]);
                    svuint32_t b_u32x = svread_ver_za32_u32_m(svdup_u32(0), predicate_all_u32x, 0, step);
                    svbmopa_za32_u32_m(2, row_predicate_u32x, predicate_all_u32x, a_u32x, b_u32x);
                }

                // B column tile 2 — accumulating into ZA3.
                svzero_mask_za(nk_sme_zero_za32_tile_0_);
                for (nk_size_t col = 0; col < tile_dim; col++) {
                    nk_size_t const col_abs = (column_tile_index + 2) * tile_dim + col;
                    if (col_abs < n_vectors) {
                        nk_u32_t const *b_row = (nk_u32_t const *)((char const *)vectors + col_abs * stride) +
                                                d_start_u32;
                        svld1_hor_za32(0, col, batch_predicate_u32x, b_row);
                    }
                }
                for (nk_size_t step = 0; step < u32s_this_tile; step++) {
                    svuint32_t a_u32x = svld1_u32(predicate_all_u32x, a_buffer[step]);
                    svuint32_t b_u32x = svread_ver_za32_u32_m(svdup_u32(0), predicate_all_u32x, 0, step);
                    svbmopa_za32_u32_m(3, row_predicate_u32x, predicate_all_u32x, a_u32x, b_u32x);
                }
            }

            // Extract ZA1-3: hamming = depth_bits - ZA[i][j]
            // (ZA holds matching-bit counts from BMOPA's XNOR popcount.)
            for (nk_size_t row = 0; row < rows_clamped; row++) {
                nk_u32_t *c_row = (nk_u32_t *)((char *)result + (row_tile_start + row) * result_stride);

                svuint32_t za1_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_u32x, 1, row);
                svuint32_t za2_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_u32x, 2, row);
                svuint32_t za3_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_u32x, 3, row);

                svst1_u32(predicate_all_u32x, c_row + (column_tile_index + 0) * tile_dim,
                          svsub_u32_x(predicate_all_u32x, depth_u32x, za1_u32x));
                svst1_u32(predicate_all_u32x, c_row + (column_tile_index + 1) * tile_dim,
                          svsub_u32_x(predicate_all_u32x, depth_u32x, za2_u32x));
                svst1_u32(predicate_all_u32x, c_row + (column_tile_index + 2) * tile_dim,
                          svsub_u32_x(predicate_all_u32x, depth_u32x, za3_u32x));
            }
        }

        // Remainder: 1 column tile at a time using ZA1 (stores are predicated on
        // cols_remaining, so the partial last tile is safe here).
        for (; column_tile_index < column_tile_count; column_tile_index++) {
            nk_size_t const col_tile_start = column_tile_index * tile_dim;
            nk_size_t const cols_remaining = (col_tile_start + tile_dim <= n_vectors) ? tile_dim
                                                                                      : (n_vectors - col_tile_start);
            svbool_t const column_predicate_u32x = svwhilelt_b32_u64(0u, cols_remaining);

            svzero_mask_za(nk_sme_zero_za32_tile_1_);

            for (nk_size_t d_tile = 0; d_tile < depth_tile_count; d_tile++) {
                nk_size_t const d_start_u32 = d_tile * depth_tile_size;
                nk_size_t const u32s_this_tile = (d_start_u32 + depth_tile_size <= depth_u32_total)
                                                     ? depth_tile_size
                                                     : (depth_u32_total > d_start_u32 ? depth_u32_total - d_start_u32
                                                                                      : 0);
                if (u32s_this_tile == 0) break;

                svzero_mask_za(nk_sme_zero_za32_tile_0_);
                svbool_t const batch_predicate_u32x = svwhilelt_b32_u64(0u, u32s_this_tile);

                // Load A rows into ZA0 horizontally
                for (nk_size_t row_in_tile = 0; row_in_tile < rows_clamped; row_in_tile++) {
                    nk_u32_t const *a_row_u32 = (nk_u32_t const *)((char const *)vectors +
                                                                   (row_tile_start + row_in_tile) * stride) +
                                                d_start_u32;
                    svld1_hor_za32(0, row_in_tile, batch_predicate_u32x, a_row_u32);
                }

                // Save A columns from ZA0 to stack buffer
                for (nk_size_t s = 0; s < u32s_this_tile; s++)
                    svst1_u32(predicate_all_u32x, a_buffer[s],
                              svread_ver_za32_u32_m(svdup_u32(0), row_predicate_u32x, 0, s));

                // Load B column tile into ZA0
                svzero_mask_za(nk_sme_zero_za32_tile_0_);
                for (nk_size_t col = 0; col < tile_dim; col++) {
                    nk_size_t const col_abs = col_tile_start + col;
                    if (col_abs < n_vectors) {
                        nk_u32_t const *b_row = (nk_u32_t const *)((char const *)vectors + col_abs * stride) +
                                                d_start_u32;
                        svld1_hor_za32(0, col, batch_predicate_u32x, b_row);
                    }
                }
                for (nk_size_t step = 0; step < u32s_this_tile; step++) {
                    svuint32_t a_u32x = svld1_u32(predicate_all_u32x, a_buffer[step]);
                    svuint32_t b_u32x = svread_ver_za32_u32_m(svdup_u32(0), column_predicate_u32x, 0, step);
                    svbmopa_za32_u32_m(1, row_predicate_u32x, column_predicate_u32x, a_u32x, b_u32x);
                }
            }

            // Hamming = depth_bits - matching; predicated store keeps the
            // partial last column tile in bounds.
            for (nk_size_t row = 0; row < rows_clamped; row++) {
                svuint32_t za1_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_u32x, 1, row);
                svuint32_t hamming_u32x = svsub_u32_x(predicate_all_u32x, depth_u32x, za1_u32x);
                nk_u32_t *c_row = (nk_u32_t *)((char *)result + (row_tile_start + row) * result_stride);
                svst1_u32(column_predicate_u32x, c_row + col_tile_start, hamming_u32x);
            }
        }
    }
}
526
+
527
/**
 * Public entry point: symmetric all-pairs Hamming distances over bit-packed u1
 * vectors, filling the row slice [row_start, row_start + row_count) of `result`.
 *
 * Thin non-streaming shim around the `__arm_locally_streaming` kernel, which owns
 * SME streaming mode and ZA state. The row slice parameters allow callers to
 * shard the output across threads.
 */
NK_PUBLIC void nk_hammings_symmetric_u1_smebi32(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth_bits,
                                                nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
                                                nk_size_t row_start, nk_size_t row_count) {
    nk_hammings_symmetric_u1_smebi32_streaming_(vectors, n_vectors, depth_bits, stride, result, result_stride,
                                                row_start, row_count);
}
533
+
534
+ #pragma endregion // Hamming Distance
535
+
536
+ /*
537
+ * Jaccard distance via BMOPA matching counts + algebraic normalization.
538
+ *
539
+ * BMOPA gives: matching = popcount(XNOR(a,b))
540
+ * Then:
541
+ * hamming = depth_bits - matching
542
+ * intersection = (norm_a + norm_b - hamming) / 2 = (norm_a + norm_b - depth_bits + matching) / 2
543
+ * union = (norm_a + norm_b + hamming) / 2 = sum_norms - intersection
544
+ * jaccard = 1 - intersection / union (1.0 when union == 0)
545
+ *
546
+ * Inner BMOPA loop is identical to Hamming; only the extraction phase differs.
547
+ * Packed format shares the Hamming tile layout for B operand, plus per-row norms.
548
+ */
549
+
550
+ #pragma region Jaccard Distance
551
+
552
+ /**
553
+ * SME Jaccard kernel using BMOPA for matching-bit counts.
554
+ * Mirrors nk_hammings_packed_u1_smebi32_streaming_ exactly in structure,
555
+ * but derives intersection/union algebraically from the matching counts:
556
+ * matching = popcount(XNOR(a,b)) (from BMOPA)
557
+ * hamming = depth_bits - matching
558
+ * intersection = (norm_a + norm_b - hamming) / 2
559
+ * union = (norm_a + norm_b + hamming) / 2
560
+ * jaccard = 1 - intersection / union (1.0 when union == 0)
561
+ */
562
+ __arm_locally_streaming __arm_new("za") static void nk_jaccards_packed_u1_smebi32_streaming_(
563
+ nk_u1x8_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t row_count_a, nk_size_t row_count_b,
564
+ nk_size_t depth_bits, nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
565
+
566
+ nk_sets_smebi32_packed_header_t const *header = (nk_sets_smebi32_packed_header_t const *)b_packed;
567
+ nk_size_t const row_tile_count_b = header->row_tile_count;
568
+ nk_size_t const depth_tile_count = header->depth_tile_count;
569
+
570
+ nk_size_t const tile_dim = svcntw(); // 16 for 512-bit SVL
571
+ nk_size_t const depth_tile_size = svcntw(); // 16 u32 per depth tile
572
+ nk_size_t const tile_elements = tile_dim * depth_tile_size;
573
+ nk_size_t const depth_u32_total = nk_size_divide_round_up_(depth_bits, 32);
574
+
575
+ nk_u32_t const *b_tiles = (nk_u32_t const *)((char const *)b_packed + sizeof(nk_sets_smebi32_packed_header_t));
576
+ nk_u32_t const *b_norms = header->norms_offset ? (nk_u32_t const *)((char const *)b_packed + header->norms_offset)
577
+ : (nk_u32_t const *)0;
578
+
579
+ svbool_t const predicate_all_f32x = svptrue_b32();
580
+ svfloat32_t const depth_f32x = svdup_f32((nk_f32_t)depth_bits);
581
+ svfloat32_t const half_f32x = svdup_f32(0.5f);
582
+ svfloat32_t const one_f32x = svdup_f32(1.0f);
583
+ svfloat32_t const zero_f32x = svdup_f32(0.0f);
584
+ nk_size_t const depth_in_bytes = nk_size_divide_round_up_(depth_bits, 8);
585
+ nk_size_t const row_tile_count_a = nk_size_divide_round_up_(row_count_a, tile_dim);
586
+
587
+ for (nk_size_t row_tile_a = 0; row_tile_a < row_tile_count_a; row_tile_a++) {
588
+ nk_size_t const row_start_a = row_tile_a * tile_dim;
589
+ nk_size_t const rows_a_remaining = (row_start_a + tile_dim <= row_count_a) ? tile_dim
590
+ : (row_count_a - row_start_a);
591
+ svbool_t const row_predicate_f32x = svwhilelt_b32_u64(0u, rows_a_remaining);
592
+
593
+ // Compute A tile norms using streaming SVE popcount
594
+ NK_ALIGN64 nk_f32_t a_tile_norms[16];
595
+ for (nk_size_t r = 0; r < rows_a_remaining; r++) {
596
+ nk_u1x8_t const *a_row = (nk_u1x8_t const *)((char const *)a + (row_start_a + r) * a_stride_in_bytes);
597
+ a_tile_norms[r] = (nk_f32_t)nk_sets_reduce_sumsq_u1_streaming_(a_row, depth_in_bytes);
598
+ }
599
+
600
+ // Fast path: 3 B column tiles using ZA1-ZA3 (ZA0.S = staging)
601
+ nk_size_t row_tile_b = 0;
602
+ for (; row_tile_b + 3 <= row_tile_count_b; row_tile_b += 3) {
603
+ svzero_mask_za(nk_sme_zero_za32_tiles_123_);
604
+
605
+ for (nk_size_t d_tile = 0; d_tile < depth_tile_count; d_tile++) {
606
+ nk_size_t const d_start_u32 = d_tile * depth_tile_size;
607
+ nk_size_t const u32s_this_tile = (d_start_u32 + depth_tile_size <= depth_u32_total)
608
+ ? depth_tile_size
609
+ : (depth_u32_total > d_start_u32 ? depth_u32_total - d_start_u32
610
+ : 0);
611
+ if (u32s_this_tile == 0) break;
612
+
613
+ svzero_mask_za(nk_sme_zero_za32_tile_0_);
614
+
615
+ svbool_t const batch_predicate_f32x = svwhilelt_b32_u64(0u, u32s_this_tile);
616
+
617
+ // Load A rows into ZA0.S horizontally as u32 words
618
+ for (nk_size_t row_in_tile = 0; row_in_tile < rows_a_remaining; row_in_tile++) {
619
+ nk_u32_t const *a_row_u32 = (nk_u32_t const *)((char const *)a +
620
+ (row_start_a + row_in_tile) * a_stride_in_bytes) +
621
+ d_start_u32;
622
+ svld1_hor_za32(0, row_in_tile, batch_predicate_f32x, a_row_u32);
623
+ }
624
+
625
+ // B tile pointers for 3 column tiles
626
+ nk_u32_t const *b_tile0 = b_tiles + ((row_tile_b + 0) * depth_tile_count + d_tile) * tile_elements;
627
+ nk_u32_t const *b_tile1 = b_tiles + ((row_tile_b + 1) * depth_tile_count + d_tile) * tile_elements;
628
+ nk_u32_t const *b_tile2 = b_tiles + ((row_tile_b + 2) * depth_tile_count + d_tile) * tile_elements;
629
+
630
+ // Vertical read + BMOPA for each depth step
631
+ for (nk_size_t step = 0; step < u32s_this_tile; step++) {
632
+ svuint32_t a_column_u32x = svread_ver_za32_u32_m(svdup_u32(0), row_predicate_f32x, 0, step);
633
+
634
+ svbmopa_za32_u32_m(1, row_predicate_f32x, predicate_all_f32x, a_column_u32x,
635
+ svld1_u32(predicate_all_f32x, b_tile0 + step * tile_dim));
636
+ svbmopa_za32_u32_m(2, row_predicate_f32x, predicate_all_f32x, a_column_u32x,
637
+ svld1_u32(predicate_all_f32x, b_tile1 + step * tile_dim));
638
+ svbmopa_za32_u32_m(3, row_predicate_f32x, predicate_all_f32x, a_column_u32x,
639
+ svld1_u32(predicate_all_f32x, b_tile2 + step * tile_dim));
640
+ }
641
+ }
642
+
643
+ // Extract from ZA1-3: Jaccard normalization via streaming SVE
644
+ // Hoist B norms outside row loop (same for all A rows in this tile-pair)
645
+ svfloat32_t b_norms_0_f32x = svcvt_f32_u32_x(
646
+ predicate_all_f32x, svld1_u32(predicate_all_f32x, b_norms + (row_tile_b + 0) * tile_dim));
647
+ svfloat32_t b_norms_1_f32x = svcvt_f32_u32_x(
648
+ predicate_all_f32x, svld1_u32(predicate_all_f32x, b_norms + (row_tile_b + 1) * tile_dim));
649
+ svfloat32_t b_norms_2_f32x = svcvt_f32_u32_x(
650
+ predicate_all_f32x, svld1_u32(predicate_all_f32x, b_norms + (row_tile_b + 2) * tile_dim));
651
+
652
+ for (nk_size_t row = 0; row < rows_a_remaining; row++) {
653
+ nk_f32_t *c_row = (nk_f32_t *)((char *)c + (row_start_a + row) * c_stride_in_bytes);
654
+ svfloat32_t norm_a_f32x = svdup_f32(a_tile_norms[row]);
655
+
656
+ // ZA1
657
+ {
658
+ svuint32_t za1_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_f32x, 1, row);
659
+ svfloat32_t matching_f32x = svcvt_f32_u32_x(predicate_all_f32x, za1_u32x);
660
+ svfloat32_t sum_norms_f32x = svadd_f32_x(predicate_all_f32x, norm_a_f32x, b_norms_0_f32x);
661
+ svfloat32_t intersection_f32x = svmul_f32_x(
662
+ predicate_all_f32x,
663
+ svadd_f32_x(predicate_all_f32x, svsub_f32_x(predicate_all_f32x, sum_norms_f32x, depth_f32x),
664
+ matching_f32x),
665
+ half_f32x);
666
+ svfloat32_t union_val_f32x = svsub_f32_x(predicate_all_f32x, sum_norms_f32x, intersection_f32x);
667
+ svbool_t nonzero_f32x = svcmpne_f32(predicate_all_f32x, union_val_f32x, zero_f32x);
668
+ svfloat32_t ratio_f32x = svdiv_f32_x(predicate_all_f32x, intersection_f32x, union_val_f32x);
669
+ svfloat32_t jaccard_f32x = svsel_f32(
670
+ nonzero_f32x, svsub_f32_x(predicate_all_f32x, one_f32x, ratio_f32x), one_f32x);
671
+ svst1_f32(predicate_all_f32x, c_row + (row_tile_b + 0) * tile_dim, jaccard_f32x);
672
+ }
673
+ // ZA2
674
+ {
675
+ svuint32_t za2_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_f32x, 2, row);
676
+ svfloat32_t matching_f32x = svcvt_f32_u32_x(predicate_all_f32x, za2_u32x);
677
+ svfloat32_t sum_norms_f32x = svadd_f32_x(predicate_all_f32x, norm_a_f32x, b_norms_1_f32x);
678
+ svfloat32_t intersection_f32x = svmul_f32_x(
679
+ predicate_all_f32x,
680
+ svadd_f32_x(predicate_all_f32x, svsub_f32_x(predicate_all_f32x, sum_norms_f32x, depth_f32x),
681
+ matching_f32x),
682
+ half_f32x);
683
+ svfloat32_t union_val_f32x = svsub_f32_x(predicate_all_f32x, sum_norms_f32x, intersection_f32x);
684
+ svbool_t nonzero_f32x = svcmpne_f32(predicate_all_f32x, union_val_f32x, zero_f32x);
685
+ svfloat32_t ratio_f32x = svdiv_f32_x(predicate_all_f32x, intersection_f32x, union_val_f32x);
686
+ svfloat32_t jaccard_f32x = svsel_f32(
687
+ nonzero_f32x, svsub_f32_x(predicate_all_f32x, one_f32x, ratio_f32x), one_f32x);
688
+ svst1_f32(predicate_all_f32x, c_row + (row_tile_b + 1) * tile_dim, jaccard_f32x);
689
+ }
690
+ // ZA3
691
+ {
692
+ svuint32_t za3_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_f32x, 3, row);
693
+ svfloat32_t matching_f32x = svcvt_f32_u32_x(predicate_all_f32x, za3_u32x);
694
+ svfloat32_t sum_norms_f32x = svadd_f32_x(predicate_all_f32x, norm_a_f32x, b_norms_2_f32x);
695
+ svfloat32_t intersection_f32x = svmul_f32_x(
696
+ predicate_all_f32x,
697
+ svadd_f32_x(predicate_all_f32x, svsub_f32_x(predicate_all_f32x, sum_norms_f32x, depth_f32x),
698
+ matching_f32x),
699
+ half_f32x);
700
+ svfloat32_t union_val_f32x = svsub_f32_x(predicate_all_f32x, sum_norms_f32x, intersection_f32x);
701
+ svbool_t nonzero_f32x = svcmpne_f32(predicate_all_f32x, union_val_f32x, zero_f32x);
702
+ svfloat32_t ratio_f32x = svdiv_f32_x(predicate_all_f32x, intersection_f32x, union_val_f32x);
703
+ svfloat32_t jaccard_f32x = svsel_f32(
704
+ nonzero_f32x, svsub_f32_x(predicate_all_f32x, one_f32x, ratio_f32x), one_f32x);
705
+ svst1_f32(predicate_all_f32x, c_row + (row_tile_b + 2) * tile_dim, jaccard_f32x);
706
+ }
707
+ }
708
+ }
709
+
710
+ // Remainder: 1 B column tile at a time using ZA1
711
+ for (; row_tile_b < row_tile_count_b; row_tile_b++) {
712
+ nk_size_t const row_start_b = row_tile_b * tile_dim;
713
+ nk_size_t const rows_b_remaining = (row_start_b + tile_dim <= row_count_b) ? tile_dim
714
+ : (row_count_b - row_start_b);
715
+ svbool_t const column_predicate_f32x = svwhilelt_b32_u64(0u, rows_b_remaining);
716
+
717
+ svzero_mask_za(nk_sme_zero_za32_tile_1_);
718
+
719
+ for (nk_size_t d_tile = 0; d_tile < depth_tile_count; d_tile++) {
720
+ nk_size_t const d_start_u32 = d_tile * depth_tile_size;
721
+ nk_size_t const u32s_this_tile = (d_start_u32 + depth_tile_size <= depth_u32_total)
722
+ ? depth_tile_size
723
+ : (depth_u32_total > d_start_u32 ? depth_u32_total - d_start_u32
724
+ : 0);
725
+ if (u32s_this_tile == 0) break;
726
+
727
+ svzero_mask_za(nk_sme_zero_za32_tile_0_);
728
+
729
+ svbool_t const batch_predicate_f32x = svwhilelt_b32_u64(0u, u32s_this_tile);
730
+
731
+ // Load A rows into ZA0.S horizontally
732
+ for (nk_size_t row_in_tile = 0; row_in_tile < rows_a_remaining; row_in_tile++) {
733
+ nk_u32_t const *a_row_u32 = (nk_u32_t const *)((char const *)a +
734
+ (row_start_a + row_in_tile) * a_stride_in_bytes) +
735
+ d_start_u32;
736
+ svld1_hor_za32(0, row_in_tile, batch_predicate_f32x, a_row_u32);
737
+ }
738
+
739
+ nk_u32_t const *b_tile = b_tiles + (row_tile_b * depth_tile_count + d_tile) * tile_elements;
740
+
741
+ // Vertical read + BMOPA
742
+ for (nk_size_t step = 0; step < u32s_this_tile; step++) {
743
+ svuint32_t a_column_u32x = svread_ver_za32_u32_m(svdup_u32(0), row_predicate_f32x, 0, step);
744
+ svuint32_t b_u32x = svld1_u32(predicate_all_f32x, b_tile + step * tile_dim);
745
+ svbmopa_za32_u32_m(1, row_predicate_f32x, column_predicate_f32x, a_column_u32x, b_u32x);
746
+ }
747
+ }
748
+
749
+ // Extract from ZA1: Jaccard normalization
750
+ svfloat32_t b_norms_f32x = svcvt_f32_u32_x(predicate_all_f32x,
751
+ svld1_u32(predicate_all_f32x, b_norms + row_start_b));
752
+ for (nk_size_t row = 0; row < rows_a_remaining; row++) {
753
+ svuint32_t za1_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_f32x, 1, row);
754
+ svfloat32_t matching_f32x = svcvt_f32_u32_x(predicate_all_f32x, za1_u32x);
755
+ svfloat32_t norm_a_f32x = svdup_f32(a_tile_norms[row]);
756
+ svfloat32_t sum_norms_f32x = svadd_f32_x(predicate_all_f32x, norm_a_f32x, b_norms_f32x);
757
+ svfloat32_t intersection_f32x = svmul_f32_x(
758
+ predicate_all_f32x,
759
+ svadd_f32_x(predicate_all_f32x, svsub_f32_x(predicate_all_f32x, sum_norms_f32x, depth_f32x),
760
+ matching_f32x),
761
+ half_f32x);
762
+ svfloat32_t union_val_f32x = svsub_f32_x(predicate_all_f32x, sum_norms_f32x, intersection_f32x);
763
+ svbool_t nonzero_f32x = svcmpne_f32(predicate_all_f32x, union_val_f32x, zero_f32x);
764
+ svfloat32_t ratio_f32x = svdiv_f32_x(predicate_all_f32x, intersection_f32x, union_val_f32x);
765
+ svfloat32_t jaccard_f32x = svsel_f32(nonzero_f32x,
766
+ svsub_f32_x(predicate_all_f32x, one_f32x, ratio_f32x), one_f32x);
767
+ nk_f32_t *c_row = (nk_f32_t *)((char *)c + (row_start_a + row) * c_stride_in_bytes);
768
+ svst1_f32(column_predicate_f32x, c_row + row_start_b, jaccard_f32x);
769
+ }
770
+ }
771
+ }
772
+ }
773
+
774
/**
 * Public entry point: one-vs-many Jaccard distances between bit-packed u1 rows of
 * `a` and a pre-packed B operand (`b_packed`), writing f32 distances into `c`.
 *
 * Thin non-streaming shim: all work happens in the `__arm_locally_streaming`
 * helper, which owns SME streaming mode and the ZA state for the call's duration.
 * `b_packed` must come from the matching pack routine (header + tiles + norms).
 */
NK_PUBLIC void nk_jaccards_packed_u1_smebi32(nk_u1x8_t const *a, void const *b_packed, nk_f32_t *c,
                                             nk_size_t row_count_a, nk_size_t row_count_b, nk_size_t depth_bits,
                                             nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
    nk_jaccards_packed_u1_smebi32_streaming_(a, b_packed, c, row_count_a, row_count_b, depth_bits, a_stride_in_bytes,
                                             c_stride_in_bytes);
}
780
+
781
+ /**
782
+ * Symmetric Jaccard kernel using ZA0 time-sharing + 3-tile fast path.
783
+ * Fills upper triangle only (column_tile >= row_tile); caller sees result[i][j] for j >= i.
784
+ * Norms computed on-the-fly using streaming SVE popcount.
785
+ */
786
+ __arm_locally_streaming __arm_new("za") static void nk_jaccards_symmetric_u1_smebi32_streaming_(
787
+ nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth_bits, nk_size_t stride, nk_f32_t *result,
788
+ nk_size_t result_stride, nk_size_t row_start, nk_size_t row_count) {
789
+
790
+ nk_size_t const tile_dim = svcntw(); // 16 for 512-bit SVL
791
+ nk_size_t const depth_tile_size = svcntw(); // 16 u32 per depth tile
792
+ nk_size_t const depth_u32_total = nk_size_divide_round_up_(depth_bits, 32);
793
+ nk_size_t const depth_tile_count = nk_size_divide_round_up_(depth_u32_total, depth_tile_size);
794
+ nk_size_t const depth_in_bytes = nk_size_divide_round_up_(depth_bits, NK_BITS_PER_BYTE);
795
+
796
+ svbool_t const predicate_all_f32x = svptrue_b32();
797
+ svfloat32_t const depth_f32x = svdup_f32((nk_f32_t)depth_bits);
798
+ svfloat32_t const half_f32x = svdup_f32(0.5f);
799
+ svfloat32_t const one_f32x = svdup_f32(1.0f);
800
+ svfloat32_t const zero_f32x = svdup_f32(0.0f);
801
+
802
+ NK_ALIGN64 nk_u32_t a_buffer[16][16]; // Stack buffer for A column save
803
+
804
+ nk_size_t const row_end = row_start + row_count;
805
+ nk_size_t const column_tile_count = nk_size_divide_round_up_(n_vectors, tile_dim);
806
+
807
+ for (nk_size_t row_tile_start = row_start; row_tile_start < row_end && row_tile_start < n_vectors;
808
+ row_tile_start += tile_dim) {
809
+ nk_size_t const rows_remaining = (row_tile_start + tile_dim <= row_end) ? tile_dim : (row_end - row_tile_start);
810
+ nk_size_t const rows_clamped = (row_tile_start + rows_remaining <= n_vectors) ? rows_remaining
811
+ : (n_vectors - row_tile_start);
812
+ svbool_t const row_predicate_f32x = svwhilelt_b32_u64(0u, rows_clamped);
813
+
814
+ // Compute A tile norms
815
+ NK_ALIGN64 nk_f32_t a_tile_norms[16];
816
+ for (nk_size_t r = 0; r < rows_clamped; r++) {
817
+ nk_u1x8_t const *a_row = (nk_u1x8_t const *)((char const *)vectors + (row_tile_start + r) * stride);
818
+ a_tile_norms[r] = (nk_f32_t)nk_sets_reduce_sumsq_u1_streaming_(a_row, depth_in_bytes);
819
+ }
820
+ for (nk_size_t r = rows_clamped; r < tile_dim; r++) a_tile_norms[r] = 0.0f;
821
+
822
+ // Upper triangle: start from this row tile's column
823
+ nk_size_t column_tile_index = row_tile_start / tile_dim;
824
+
825
+ // Fast path: 3 column tiles using ZA1-ZA3 (ZA0 = staging)
826
+ for (; column_tile_index + 3 <= column_tile_count; column_tile_index += 3) {
827
+ svzero_mask_za(nk_sme_zero_za32_tiles_123_);
828
+
829
+ for (nk_size_t d_tile = 0; d_tile < depth_tile_count; d_tile++) {
830
+ nk_size_t const d_start_u32 = d_tile * depth_tile_size;
831
+ nk_size_t const u32s_this_tile = (d_start_u32 + depth_tile_size <= depth_u32_total)
832
+ ? depth_tile_size
833
+ : (depth_u32_total > d_start_u32 ? depth_u32_total - d_start_u32
834
+ : 0);
835
+ if (u32s_this_tile == 0) break;
836
+
837
+ // Load A rows into ZA0 horizontally
838
+ svzero_mask_za(nk_sme_zero_za32_tile_0_);
839
+ svbool_t const batch_predicate_f32x = svwhilelt_b32_u64(0u, u32s_this_tile);
840
+
841
+ for (nk_size_t row_in_tile = 0; row_in_tile < rows_clamped; row_in_tile++) {
842
+ nk_u32_t const *a_row_u32 = (nk_u32_t const *)((char const *)vectors +
843
+ (row_tile_start + row_in_tile) * stride) +
844
+ d_start_u32;
845
+ svld1_hor_za32(0, row_in_tile, batch_predicate_f32x, a_row_u32);
846
+ }
847
+
848
+ // Save A columns from ZA0 to stack buffer
849
+ for (nk_size_t s = 0; s < u32s_this_tile; s++)
850
+ svst1_u32(predicate_all_f32x, a_buffer[s],
851
+ svread_ver_za32_u32_m(svdup_u32(0), row_predicate_f32x, 0, s));
852
+
853
+ // B column tile 0
854
+ svzero_mask_za(nk_sme_zero_za32_tile_0_);
855
+ for (nk_size_t col = 0; col < tile_dim; col++) {
856
+ nk_size_t const col_abs = (column_tile_index + 0) * tile_dim + col;
857
+ if (col_abs < n_vectors) {
858
+ nk_u32_t const *b_row = (nk_u32_t const *)((char const *)vectors + col_abs * stride) +
859
+ d_start_u32;
860
+ svld1_hor_za32(0, col, batch_predicate_f32x, b_row);
861
+ }
862
+ }
863
+ for (nk_size_t step = 0; step < u32s_this_tile; step++) {
864
+ svuint32_t a_u32x = svld1_u32(predicate_all_f32x, a_buffer[step]);
865
+ svuint32_t b_u32x = svread_ver_za32_u32_m(svdup_u32(0), predicate_all_f32x, 0, step);
866
+ svbmopa_za32_u32_m(1, row_predicate_f32x, predicate_all_f32x, a_u32x, b_u32x);
867
+ }
868
+
869
+ // B column tile 1
870
+ svzero_mask_za(nk_sme_zero_za32_tile_0_);
871
+ for (nk_size_t col = 0; col < tile_dim; col++) {
872
+ nk_size_t const col_abs = (column_tile_index + 1) * tile_dim + col;
873
+ if (col_abs < n_vectors) {
874
+ nk_u32_t const *b_row = (nk_u32_t const *)((char const *)vectors + col_abs * stride) +
875
+ d_start_u32;
876
+ svld1_hor_za32(0, col, batch_predicate_f32x, b_row);
877
+ }
878
+ }
879
+ for (nk_size_t step = 0; step < u32s_this_tile; step++) {
880
+ svuint32_t a_u32x = svld1_u32(predicate_all_f32x, a_buffer[step]);
881
+ svuint32_t b_u32x = svread_ver_za32_u32_m(svdup_u32(0), predicate_all_f32x, 0, step);
882
+ svbmopa_za32_u32_m(2, row_predicate_f32x, predicate_all_f32x, a_u32x, b_u32x);
883
+ }
884
+
885
+ // B column tile 2
886
+ svzero_mask_za(nk_sme_zero_za32_tile_0_);
887
+ for (nk_size_t col = 0; col < tile_dim; col++) {
888
+ nk_size_t const col_abs = (column_tile_index + 2) * tile_dim + col;
889
+ if (col_abs < n_vectors) {
890
+ nk_u32_t const *b_row = (nk_u32_t const *)((char const *)vectors + col_abs * stride) +
891
+ d_start_u32;
892
+ svld1_hor_za32(0, col, batch_predicate_f32x, b_row);
893
+ }
894
+ }
895
+ for (nk_size_t step = 0; step < u32s_this_tile; step++) {
896
+ svuint32_t a_u32x = svld1_u32(predicate_all_f32x, a_buffer[step]);
897
+ svuint32_t b_u32x = svread_ver_za32_u32_m(svdup_u32(0), predicate_all_f32x, 0, step);
898
+ svbmopa_za32_u32_m(3, row_predicate_f32x, predicate_all_f32x, a_u32x, b_u32x);
899
+ }
900
+ }
901
+
902
+ // Compute B tile norms for 3 column tiles
903
+ NK_ALIGN64 nk_u32_t b_tile_norms_0[16];
904
+ NK_ALIGN64 nk_u32_t b_tile_norms_1[16];
905
+ NK_ALIGN64 nk_u32_t b_tile_norms_2[16];
906
+ for (nk_size_t col = 0; col < tile_dim; col++) {
907
+ nk_size_t const col_abs_0 = (column_tile_index + 0) * tile_dim + col;
908
+ nk_size_t const col_abs_1 = (column_tile_index + 1) * tile_dim + col;
909
+ nk_size_t const col_abs_2 = (column_tile_index + 2) * tile_dim + col;
910
+ b_tile_norms_0[col] = (col_abs_0 < n_vectors)
911
+ ? nk_sets_reduce_sumsq_u1_streaming_(
912
+ (nk_u1x8_t const *)((char const *)vectors + col_abs_0 * stride),
913
+ depth_in_bytes)
914
+ : 0;
915
+ b_tile_norms_1[col] = (col_abs_1 < n_vectors)
916
+ ? nk_sets_reduce_sumsq_u1_streaming_(
917
+ (nk_u1x8_t const *)((char const *)vectors + col_abs_1 * stride),
918
+ depth_in_bytes)
919
+ : 0;
920
+ b_tile_norms_2[col] = (col_abs_2 < n_vectors)
921
+ ? nk_sets_reduce_sumsq_u1_streaming_(
922
+ (nk_u1x8_t const *)((char const *)vectors + col_abs_2 * stride),
923
+ depth_in_bytes)
924
+ : 0;
925
+ }
926
+
927
+ // Extract ZA1-3: Jaccard normalization
928
+ svfloat32_t b_norms_0_f32x = svcvt_f32_u32_x(predicate_all_f32x,
929
+ svld1_u32(predicate_all_f32x, b_tile_norms_0));
930
+ svfloat32_t b_norms_1_f32x = svcvt_f32_u32_x(predicate_all_f32x,
931
+ svld1_u32(predicate_all_f32x, b_tile_norms_1));
932
+ svfloat32_t b_norms_2_f32x = svcvt_f32_u32_x(predicate_all_f32x,
933
+ svld1_u32(predicate_all_f32x, b_tile_norms_2));
934
+
935
+ for (nk_size_t row = 0; row < rows_clamped; row++) {
936
+ nk_f32_t *c_row = (nk_f32_t *)((char *)result + (row_tile_start + row) * result_stride);
937
+ svfloat32_t norm_a_f32x = svdup_f32(a_tile_norms[row]);
938
+
939
+ // ZA1
940
+ {
941
+ svuint32_t za1_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_f32x, 1, row);
942
+ svfloat32_t matching_f32x = svcvt_f32_u32_x(predicate_all_f32x, za1_u32x);
943
+ svfloat32_t sum_norms_f32x = svadd_f32_x(predicate_all_f32x, norm_a_f32x, b_norms_0_f32x);
944
+ svfloat32_t intersection_f32x = svmul_f32_x(
945
+ predicate_all_f32x,
946
+ svadd_f32_x(predicate_all_f32x, svsub_f32_x(predicate_all_f32x, sum_norms_f32x, depth_f32x),
947
+ matching_f32x),
948
+ half_f32x);
949
+ svfloat32_t union_val_f32x = svsub_f32_x(predicate_all_f32x, sum_norms_f32x, intersection_f32x);
950
+ svbool_t nonzero_f32x = svcmpne_f32(predicate_all_f32x, union_val_f32x, zero_f32x);
951
+ svfloat32_t ratio_f32x = svdiv_f32_x(predicate_all_f32x, intersection_f32x, union_val_f32x);
952
+ svfloat32_t jaccard_f32x = svsel_f32(
953
+ nonzero_f32x, svsub_f32_x(predicate_all_f32x, one_f32x, ratio_f32x), one_f32x);
954
+ svst1_f32(predicate_all_f32x, c_row + (column_tile_index + 0) * tile_dim, jaccard_f32x);
955
+ }
956
+ // ZA2
957
+ {
958
+ svuint32_t za2_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_f32x, 2, row);
959
+ svfloat32_t matching_f32x = svcvt_f32_u32_x(predicate_all_f32x, za2_u32x);
960
+ svfloat32_t sum_norms_f32x = svadd_f32_x(predicate_all_f32x, norm_a_f32x, b_norms_1_f32x);
961
+ svfloat32_t intersection_f32x = svmul_f32_x(
962
+ predicate_all_f32x,
963
+ svadd_f32_x(predicate_all_f32x, svsub_f32_x(predicate_all_f32x, sum_norms_f32x, depth_f32x),
964
+ matching_f32x),
965
+ half_f32x);
966
+ svfloat32_t union_val_f32x = svsub_f32_x(predicate_all_f32x, sum_norms_f32x, intersection_f32x);
967
+ svbool_t nonzero_f32x = svcmpne_f32(predicate_all_f32x, union_val_f32x, zero_f32x);
968
+ svfloat32_t ratio_f32x = svdiv_f32_x(predicate_all_f32x, intersection_f32x, union_val_f32x);
969
+ svfloat32_t jaccard_f32x = svsel_f32(
970
+ nonzero_f32x, svsub_f32_x(predicate_all_f32x, one_f32x, ratio_f32x), one_f32x);
971
+ svst1_f32(predicate_all_f32x, c_row + (column_tile_index + 1) * tile_dim, jaccard_f32x);
972
+ }
973
+ // ZA3
974
+ {
975
+ svuint32_t za3_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_f32x, 3, row);
976
+ svfloat32_t matching_f32x = svcvt_f32_u32_x(predicate_all_f32x, za3_u32x);
977
+ svfloat32_t sum_norms_f32x = svadd_f32_x(predicate_all_f32x, norm_a_f32x, b_norms_2_f32x);
978
+ svfloat32_t intersection_f32x = svmul_f32_x(
979
+ predicate_all_f32x,
980
+ svadd_f32_x(predicate_all_f32x, svsub_f32_x(predicate_all_f32x, sum_norms_f32x, depth_f32x),
981
+ matching_f32x),
982
+ half_f32x);
983
+ svfloat32_t union_val_f32x = svsub_f32_x(predicate_all_f32x, sum_norms_f32x, intersection_f32x);
984
+ svbool_t nonzero_f32x = svcmpne_f32(predicate_all_f32x, union_val_f32x, zero_f32x);
985
+ svfloat32_t ratio_f32x = svdiv_f32_x(predicate_all_f32x, intersection_f32x, union_val_f32x);
986
+ svfloat32_t jaccard_f32x = svsel_f32(
987
+ nonzero_f32x, svsub_f32_x(predicate_all_f32x, one_f32x, ratio_f32x), one_f32x);
988
+ svst1_f32(predicate_all_f32x, c_row + (column_tile_index + 2) * tile_dim, jaccard_f32x);
989
+ }
990
+ }
991
+ }
992
+
993
+ // Remainder: 1 column tile at a time using ZA1
994
+ for (; column_tile_index < column_tile_count; column_tile_index++) {
995
+ nk_size_t const col_tile_start = column_tile_index * tile_dim;
996
+ nk_size_t const cols_remaining = (col_tile_start + tile_dim <= n_vectors) ? tile_dim
997
+ : (n_vectors - col_tile_start);
998
+ svbool_t const column_predicate_f32x = svwhilelt_b32_u64(0u, cols_remaining);
999
+
1000
+ svzero_mask_za(nk_sme_zero_za32_tile_1_);
1001
+
1002
+ for (nk_size_t d_tile = 0; d_tile < depth_tile_count; d_tile++) {
1003
+ nk_size_t const d_start_u32 = d_tile * depth_tile_size;
1004
+ nk_size_t const u32s_this_tile = (d_start_u32 + depth_tile_size <= depth_u32_total)
1005
+ ? depth_tile_size
1006
+ : (depth_u32_total > d_start_u32 ? depth_u32_total - d_start_u32
1007
+ : 0);
1008
+ if (u32s_this_tile == 0) break;
1009
+
1010
+ svzero_mask_za(nk_sme_zero_za32_tile_0_);
1011
+ svbool_t const batch_predicate_f32x = svwhilelt_b32_u64(0u, u32s_this_tile);
1012
+
1013
+ // Load A rows into ZA0 horizontally
1014
+ for (nk_size_t row_in_tile = 0; row_in_tile < rows_clamped; row_in_tile++) {
1015
+ nk_u32_t const *a_row_u32 = (nk_u32_t const *)((char const *)vectors +
1016
+ (row_tile_start + row_in_tile) * stride) +
1017
+ d_start_u32;
1018
+ svld1_hor_za32(0, row_in_tile, batch_predicate_f32x, a_row_u32);
1019
+ }
1020
+
1021
+ // Save A columns from ZA0 to stack buffer
1022
+ for (nk_size_t s = 0; s < u32s_this_tile; s++)
1023
+ svst1_u32(predicate_all_f32x, a_buffer[s],
1024
+ svread_ver_za32_u32_m(svdup_u32(0), row_predicate_f32x, 0, s));
1025
+
1026
+ // Load B column tile into ZA0
1027
+ svzero_mask_za(nk_sme_zero_za32_tile_0_);
1028
+ for (nk_size_t col = 0; col < tile_dim; col++) {
1029
+ nk_size_t const col_abs = col_tile_start + col;
1030
+ if (col_abs < n_vectors) {
1031
+ nk_u32_t const *b_row = (nk_u32_t const *)((char const *)vectors + col_abs * stride) +
1032
+ d_start_u32;
1033
+ svld1_hor_za32(0, col, batch_predicate_f32x, b_row);
1034
+ }
1035
+ }
1036
+ for (nk_size_t step = 0; step < u32s_this_tile; step++) {
1037
+ svuint32_t a_u32x = svld1_u32(predicate_all_f32x, a_buffer[step]);
1038
+ svuint32_t b_u32x = svread_ver_za32_u32_m(svdup_u32(0), column_predicate_f32x, 0, step);
1039
+ svbmopa_za32_u32_m(1, row_predicate_f32x, column_predicate_f32x, a_u32x, b_u32x);
1040
+ }
1041
+ }
1042
+
1043
+ // Compute B tile norms for remainder tile
1044
+ NK_ALIGN64 nk_u32_t b_tile_norms[16];
1045
+ for (nk_size_t col = 0; col < tile_dim; col++) {
1046
+ nk_size_t const col_abs = col_tile_start + col;
1047
+ b_tile_norms[col] = (col_abs < n_vectors)
1048
+ ? nk_sets_reduce_sumsq_u1_streaming_(
1049
+ (nk_u1x8_t const *)((char const *)vectors + col_abs * stride),
1050
+ depth_in_bytes)
1051
+ : 0;
1052
+ }
1053
+
1054
+ svfloat32_t b_norms_f32x = svcvt_f32_u32_x(predicate_all_f32x, svld1_u32(predicate_all_f32x, b_tile_norms));
1055
+ for (nk_size_t row = 0; row < rows_clamped; row++) {
1056
+ svuint32_t za1_u32x = svread_hor_za32_u32_m(svdup_u32(0), predicate_all_f32x, 1, row);
1057
+ svfloat32_t matching_f32x = svcvt_f32_u32_x(predicate_all_f32x, za1_u32x);
1058
+ svfloat32_t norm_a_f32x = svdup_f32(a_tile_norms[row]);
1059
+ svfloat32_t sum_norms_f32x = svadd_f32_x(predicate_all_f32x, norm_a_f32x, b_norms_f32x);
1060
+ svfloat32_t intersection_f32x = svmul_f32_x(
1061
+ predicate_all_f32x,
1062
+ svadd_f32_x(predicate_all_f32x, svsub_f32_x(predicate_all_f32x, sum_norms_f32x, depth_f32x),
1063
+ matching_f32x),
1064
+ half_f32x);
1065
+ svfloat32_t union_val_f32x = svsub_f32_x(predicate_all_f32x, sum_norms_f32x, intersection_f32x);
1066
+ svbool_t nonzero_f32x = svcmpne_f32(predicate_all_f32x, union_val_f32x, zero_f32x);
1067
+ svfloat32_t ratio_f32x = svdiv_f32_x(predicate_all_f32x, intersection_f32x, union_val_f32x);
1068
+ svfloat32_t jaccard_f32x = svsel_f32(nonzero_f32x,
1069
+ svsub_f32_x(predicate_all_f32x, one_f32x, ratio_f32x), one_f32x);
1070
+ nk_f32_t *c_row = (nk_f32_t *)((char *)result + (row_tile_start + row) * result_stride);
1071
+ svst1_f32(column_predicate_f32x, c_row + col_tile_start, jaccard_f32x);
1072
+ }
1073
+ }
1074
+ }
1075
+ }
1076
+
1077
+ NK_PUBLIC void nk_jaccards_symmetric_u1_smebi32(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth_bits,
1078
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1079
+ nk_size_t row_start, nk_size_t row_count) {
1080
+ nk_jaccards_symmetric_u1_smebi32_streaming_(vectors, n_vectors, depth_bits, stride, result, result_stride,
1081
+ row_start, row_count);
1082
+ }
1083
+
1084
+ #pragma endregion // Jaccard Distance
1085
+
1086
+ #if defined(__clang__)
1087
+ #pragma clang attribute pop
1088
+ #elif defined(__GNUC__)
1089
+ #pragma GCC pop_options
1090
+ #endif
1091
+
1092
+ #if defined(__cplusplus)
1093
+ } // extern "C"
1094
+ #endif
1095
+
1096
+ #endif // NK_TARGET_SMEBI32
1097
+ #endif // NK_TARGET_ARM_
1098
+
1099
+ #endif // NK_SETS_SMEBI32_H