npm - numkong - Versions diffs - 7.4.4 → 7.5.0 - Mend

numkong 7.4.4 → 7.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61)

package/README.md +1 -0
package/binding.gyp +81 -5
package/c/dispatch_f16.c +23 -0
package/c/numkong.c +0 -13
package/include/numkong/attention/sme.h +34 -31
package/include/numkong/capabilities.h +2 -15
package/include/numkong/cast/neon.h +15 -0
package/include/numkong/curved/smef64.h +82 -62
package/include/numkong/dot/rvvbf16.h +1 -1
package/include/numkong/dot/rvvhalf.h +1 -1
package/include/numkong/dot/sve.h +6 -5
package/include/numkong/dot/svebfdot.h +2 -1
package/include/numkong/dot/svehalf.h +6 -5
package/include/numkong/dot/svesdot.h +3 -2
package/include/numkong/dots/graniteamx.h +733 -0
package/include/numkong/dots/serial.h +11 -4
package/include/numkong/dots/sme.h +172 -140
package/include/numkong/dots/smebi32.h +14 -11
package/include/numkong/dots/smef64.h +31 -26
package/include/numkong/dots.h +29 -3
package/include/numkong/each/serial.h +22 -0
package/include/numkong/geospatial/haswell.h +1 -1
package/include/numkong/geospatial/neon.h +1 -1
package/include/numkong/geospatial/serial.h +1 -1
package/include/numkong/geospatial/skylake.h +1 -1
package/include/numkong/maxsim/sme.h +94 -55
package/include/numkong/mesh/README.md +13 -27
package/include/numkong/mesh/haswell.h +25 -122
package/include/numkong/mesh/neon.h +21 -110
package/include/numkong/mesh/neonbfdot.h +4 -43
package/include/numkong/mesh/rvv.h +7 -82
package/include/numkong/mesh/serial.h +48 -53
package/include/numkong/mesh/skylake.h +7 -123
package/include/numkong/mesh/v128relaxed.h +9 -93
package/include/numkong/mesh.h +2 -2
package/include/numkong/mesh.hpp +35 -96
package/include/numkong/reduce/neon.h +29 -0
package/include/numkong/reduce/neonbfdot.h +2 -2
package/include/numkong/reduce/neonfhm.h +4 -4
package/include/numkong/reduce/sve.h +52 -0
package/include/numkong/reduce.h +4 -0
package/include/numkong/set/sve.h +6 -5
package/include/numkong/sets/smebi32.h +35 -30
package/include/numkong/sparse/sve2.h +3 -2
package/include/numkong/spatial/sve.h +7 -6
package/include/numkong/spatial/svebfdot.h +7 -4
package/include/numkong/spatial/svehalf.h +5 -4
package/include/numkong/spatial/svesdot.h +9 -8
package/include/numkong/spatials/graniteamx.h +173 -0
package/include/numkong/spatials/serial.h +22 -0
package/include/numkong/spatials/sme.h +391 -350
package/include/numkong/spatials/smef64.h +79 -70
package/include/numkong/spatials.h +37 -4
package/include/numkong/types.h +59 -0
package/javascript/dist/cjs/numkong.js +13 -0
package/javascript/dist/esm/numkong.js +13 -0
package/javascript/numkong.c +56 -12
package/javascript/numkong.ts +13 -0
package/package.json +7 -7
package/probes/probe.js +2 -2
package/wasm/numkong.wasm +0 -0

package/include/numkong/dots/smebi32.h CHANGED Viewed

@@ -39,9 +39,9 @@ extern "C" {
  *  BMOPA gives matching = popcount(XNOR(a,b)).
  *  dot(a,b) = popcount(a AND b) = (pop_a + pop_b - depth_bits + matching) / 2
  */
-__arm_locally_streaming __arm_new("za") static void nk_dots_packed_u1_smebi32_streaming_(
+__arm_new("za") static void nk_dots_packed_u1_smebi32_streaming_( //
     nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t row_count_a, nk_size_t row_count_b,
-    nk_size_t depth_bits, nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
+    nk_size_t depth_bits, nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) NK_STREAMING_ {
     nk_sets_smebi32_packed_header_t const *header = (nk_sets_smebi32_packed_header_t const *)b_packed;
     nk_size_t const row_tile_count_b = header->row_tile_count;
@@ -204,20 +204,22 @@ __arm_locally_streaming __arm_new("za") static void nk_dots_packed_u1_smebi32_st
     }
 }
-NK_PUBLIC void nk_dots_packed_u1_smebi32(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t row_count_a,
-                                         nk_size_t row_count_b, nk_size_t depth_bits, nk_size_t a_stride_in_bytes,
-                                         nk_size_t c_stride_in_bytes) {
+NK_PUBLIC void nk_dots_packed_u1_smebi32( //
+    nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t row_count_a, nk_size_t row_count_b,
+    nk_size_t depth_bits, nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
+    nk_sme_start_streaming_();
     nk_dots_packed_u1_smebi32_streaming_(a, b_packed, c, row_count_a, row_count_b, depth_bits, a_stride_in_bytes,
                                          c_stride_in_bytes);
+    nk_sme_stop_streaming_();
 }
 /**
  *  Symmetric u1 dot-product using ZA0 time-sharing + 3-tile fast path.
  *  Same ZA transpose pattern as hammings_symmetric, but with dot extraction.
  */
-__arm_locally_streaming __arm_new("za") static void nk_dots_symmetric_u1_smebi32_streaming_(
+__arm_new("za") static void nk_dots_symmetric_u1_smebi32_streaming_( //
     nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth_bits, nk_size_t stride_in_bytes,
-    nk_u32_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
+    nk_u32_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
     nk_size_t const tile_dim = svcntw();        // 16 for 512-bit SVL
     nk_size_t const depth_tile_size = svcntw(); // 16 u32 per depth tile
@@ -451,12 +453,13 @@ __arm_locally_streaming __arm_new("za") static void nk_dots_symmetric_u1_smebi32
     }
 }
-NK_PUBLIC void nk_dots_symmetric_u1_smebi32(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth_bits,
-                                            nk_size_t stride_in_bytes, nk_u32_t *result,
-                                            nk_size_t result_stride_in_bytes, nk_size_t row_start,
-                                            nk_size_t row_count) {
+NK_PUBLIC void nk_dots_symmetric_u1_smebi32( //
+    nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth_bits, nk_size_t stride_in_bytes,
+    nk_u32_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
+    nk_sme_start_streaming_();
     nk_dots_symmetric_u1_smebi32_streaming_(vectors, vectors_count, depth_bits, stride_in_bytes, result,
                                             result_stride_in_bytes, row_start, row_count);
+    nk_sme_stop_streaming_();
 }
 #if defined(__clang__)

package/include/numkong/dots/smef64.h CHANGED Viewed

@@ -153,9 +153,9 @@ NK_PUBLIC void nk_dots_pack_f32_smef64(nk_f32_t const *b, nk_size_t columns, nk_
     }
 }
-__arm_locally_streaming __arm_new("za") static void nk_dots_packed_f32_smef64_streaming_(
+__arm_new("za") static void nk_dots_packed_f32_smef64_streaming_( //
     nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
-    nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
+    nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
     nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
     nk_size_t const column_tile_count = header->column_tile_count;
@@ -390,14 +390,16 @@ __arm_locally_streaming __arm_new("za") static void nk_dots_packed_f32_smef64_st
     }
 }
-NK_PUBLIC void nk_dots_packed_f32_smef64(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows,
-                                         nk_size_t columns, nk_size_t depth, nk_size_t a_stride_in_bytes,
-                                         nk_size_t c_stride_in_bytes) {
+NK_PUBLIC void nk_dots_packed_f32_smef64( //
+    nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
+    nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
     nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f32_t);
     nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_packed_f32_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
+    nk_sme_stop_streaming_();
 }
 /**
@@ -406,9 +408,9 @@ NK_PUBLIC void nk_dots_packed_f32_smef64(nk_f32_t const *a, void const *b_packed
  *  pre-reads A columns into Z registers, then reloads ZA0 with widened B data
  *  per column tile. Eliminates all scalar B-packing loops.
  */
-__arm_locally_streaming __arm_new("za") static void nk_dots_symmetric_f32_smef64_streaming_(
+__arm_new("za") static void nk_dots_symmetric_f32_smef64_streaming_( //
     nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
-    nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
+    nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
     nk_size_t const tile_dimension = svcntd();              // 8 for SVL=512
     nk_size_t const depth_tile_size = svcntw();             // 16 for SVL=512
@@ -721,15 +723,16 @@ __arm_locally_streaming __arm_new("za") static void nk_dots_symmetric_f32_smef64
     }
 }
-NK_PUBLIC void nk_dots_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
-                                            nk_size_t stride_in_bytes, nk_f64_t *result,
-                                            nk_size_t result_stride_in_bytes, nk_size_t row_start,
-                                            nk_size_t row_count) {
+NK_PUBLIC void nk_dots_symmetric_f32_smef64( //
+    nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
+    nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
     nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f32_t);
     nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_symmetric_f32_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
                                             result_stride_elements, row_start, row_count);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F32 Floats
@@ -783,17 +786,16 @@ NK_PUBLIC void nk_dots_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t v
  *
  *  All slices fit in f32 (24-bit significand). Products: max 19+19 = 38 ≤ 53, exact in f64.
  */
-NK_PUBLIC nk_u64_t nk_f64_smef64_ozaki_mask_19_bits_(void) NK_STREAMING_ {
+NK_PUBLIC nk_u64_t nk_f64_smef64_ozaki_mask_19_bits_(void) {
     return 0xFFFFFFFC00000000ULL; // keep top 19 sig bits
 }
-NK_PUBLIC nk_u64_t nk_f64_smef64_ozaki_mask_17_bits_(void) NK_STREAMING_ {
+NK_PUBLIC nk_u64_t nk_f64_smef64_ozaki_mask_17_bits_(void) {
     return 0xFFFFFFF000000000ULL; // keep top 17 sig bits
 }
 /*  Split a scalar f64 into 3 non-overlapping Ozaki slices (19+17+17 mantissa bits).
  *  Each slice fits in f32. Outputs stored via pointers. */
-NK_PUBLIC void nk_f64_smef64_ozaki_split_f64_(nk_f64_t val, nk_f64_t *slice_0, nk_f64_t *slice_1,
-                                              nk_f64_t *slice_2) NK_STREAMING_ {
+NK_PUBLIC void nk_f64_smef64_ozaki_split_f64_(nk_f64_t val, nk_f64_t *slice_0, nk_f64_t *slice_1, nk_f64_t *slice_2) {
     nk_fui64_t pun;
     pun.f = val;
     pun.u &= nk_f64_smef64_ozaki_mask_19_bits_();
@@ -805,9 +807,9 @@ NK_PUBLIC void nk_f64_smef64_ozaki_split_f64_(nk_f64_t val, nk_f64_t *slice_0, n
     *slice_2 = residual - *slice_1;
 }
-__arm_locally_streaming __arm_new("za") static void nk_dots_symmetric_f64_smef64_streaming_(
+__arm_new("za") static void nk_dots_symmetric_f64_smef64_streaming_( //
     nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
-    nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
+    nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
     nk_size_t const tile_dimension = svcntd();
     nk_size_t const depth_steps_per_batch = tile_dimension;
@@ -929,15 +931,16 @@ __arm_locally_streaming __arm_new("za") static void nk_dots_symmetric_f64_smef64
     }
 }
-NK_PUBLIC void nk_dots_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
-                                            nk_size_t stride_in_bytes, nk_f64_t *result,
-                                            nk_size_t result_stride_in_bytes, nk_size_t row_start,
-                                            nk_size_t row_count) {
+NK_PUBLIC void nk_dots_symmetric_f64_smef64( //
+    nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
+    nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
     nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f64_t);
     nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_symmetric_f64_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
                                             result_stride_elements, row_start, row_count);
+    nk_sme_stop_streaming_();
 }
 NK_PUBLIC nk_size_t nk_dots_packed_size_f64_smef64(nk_size_t columns, nk_size_t depth) {
@@ -1018,9 +1021,9 @@ NK_PUBLIC void nk_dots_pack_f64_smef64(nk_f64_t const *b, nk_size_t columns, nk_
     }
 }
-__arm_locally_streaming __arm_new("za") static void nk_dots_packed_f64_smef64_streaming_(
+__arm_new("za") static void nk_dots_packed_f64_smef64_streaming_( //
     nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
-    nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
+    nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
     // Read header
     nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
@@ -1296,14 +1299,16 @@ __arm_locally_streaming __arm_new("za") static void nk_dots_packed_f64_smef64_st
     }
 }
-NK_PUBLIC void nk_dots_packed_f64_smef64(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows,
-                                         nk_size_t columns, nk_size_t depth, nk_size_t a_stride_in_bytes,
-                                         nk_size_t c_stride_in_bytes) {
+NK_PUBLIC void nk_dots_packed_f64_smef64( //
+    nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
+    nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
     nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f64_t);
     nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_packed_f64_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F64 Floats

package/include/numkong/dots.h CHANGED Viewed

@@ -681,6 +681,25 @@ NK_PUBLIC void nk_dots_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_
                                                 nk_size_t row_start, nk_size_t row_count);
 #endif // NK_TARGET_SAPPHIREAMX
+/*  Granite Rapids backends using Intel AMX-FP16 (Advanced Matrix Extensions with FP16 support).
+ *  AMX-FP16 adds TDPFP16PS (FP16×FP16→FP32 tile multiply-accumulate), same tile geometry as BF16.
+ *  The F32 Ozaki kernel splits F32 inputs into 2 FP16 halves for ~35-40 bit effective precision.
+ */
+#if NK_TARGET_GRANITEAMX
+/** @copydoc nk_dots_packed_size_f16 */
+NK_PUBLIC nk_size_t nk_dots_packed_size_f16_graniteamx(nk_size_t width, nk_size_t depth);
+/** @copydoc nk_dots_pack_f16 */
+NK_PUBLIC void nk_dots_pack_f16_graniteamx(nk_f16_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
+                                           void *b_packed);
+/** @copydoc nk_dots_packed_f16 */
+NK_PUBLIC void nk_dots_packed_f16_graniteamx(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
+                                             nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
+/** @copydoc nk_dots_symmetric_f16 */
+NK_PUBLIC void nk_dots_symmetric_f16_graniteamx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
+                                                nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
+                                                nk_size_t row_start, nk_size_t row_count);
+#endif // NK_TARGET_GRANITEAMX
 /*  ARM SME backends using Scalable Matrix Extension.
  *  SME provides ZA tile registers for outer product operations.
  *  F16/BF16/I8/U8/E4M3 use ZA32 tiles, F32/F64 use ZA64 tiles (FEAT_SME_F64F64).
@@ -1858,6 +1877,7 @@ NK_PUBLIC void nk_dots_symmetric_u1_loongsonasx(nk_u1x8_t const *vectors, nk_siz
 #include "numkong/dots/genoa.h"
 #include "numkong/dots/diamond.h"
 #include "numkong/dots/sapphireamx.h"
+#include "numkong/dots/graniteamx.h"
 #include "numkong/dots/neon.h"
 #include "numkong/dots/neonsdot.h"
 #include "numkong/dots/neonfhm.h"
@@ -2002,7 +2022,9 @@ NK_PUBLIC void nk_dots_packed_f64(nk_f64_t const *a, void const *b_packed, nk_f6
 }
 NK_PUBLIC nk_size_t nk_dots_packed_size_f16(nk_size_t width, nk_size_t depth) {
-#if NK_TARGET_SME
+#if NK_TARGET_GRANITEAMX
+    return nk_dots_packed_size_f16_graniteamx(width, depth);
+#elif NK_TARGET_SME
     return nk_dots_packed_size_f16_sme(width, depth);
 #elif NK_TARGET_NEONFHM
     return nk_dots_packed_size_f16_neonfhm(width, depth);
@@ -2023,7 +2045,9 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_f16(nk_size_t width, nk_size_t depth) {
 NK_PUBLIC void nk_dots_pack_f16(nk_f16_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
                                 void *b_packed) {
-#if NK_TARGET_SME
+#if NK_TARGET_GRANITEAMX
+    nk_dots_pack_f16_graniteamx(b, width, depth, b_stride, b_packed);
+#elif NK_TARGET_SME
     nk_dots_pack_f16_sme(b, width, depth, b_stride, b_packed);
 #elif NK_TARGET_NEONFHM
     nk_dots_pack_f16_neonfhm(b, width, depth, b_stride, b_packed);
@@ -2044,7 +2068,9 @@ NK_PUBLIC void nk_dots_pack_f16(nk_f16_t const *b, nk_size_t width, nk_size_t de
 NK_PUBLIC void nk_dots_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
                                   nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride) {
-#if NK_TARGET_SME
+#if NK_TARGET_GRANITEAMX
+    nk_dots_packed_f16_graniteamx(a, b_packed, c, height, width, depth, a_stride, c_stride);
+#elif NK_TARGET_SME
     nk_dots_packed_f16_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
 #elif NK_TARGET_NEONFHM
     nk_dots_packed_f16_neonfhm(a, b_packed, c, height, width, depth, a_stride, c_stride);

package/include/numkong/each/serial.h CHANGED Viewed

@@ -76,6 +76,18 @@ extern "C" {
         }                                                                                                             \
     }
+/* Optimize serial fallbacks for size — see dots/serial.h for rationale. */
+#if defined(NDEBUG)
+#if defined(_MSC_VER)
+#pragma optimize("s", on)
+#elif defined(__clang__)
+#pragma clang attribute push(__attribute__((minsize)), apply_to = function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC optimize("Os")
+#endif
+#endif
 nk_define_each_sum_(f64, f64, nk_assign_from_to_, nk_assign_from_to_)        // nk_each_sum_f64_serial
 nk_define_each_sum_(f32, f32, nk_assign_from_to_, nk_assign_from_to_)        // nk_each_sum_f32_serial
 nk_define_each_sum_(f16, f32, nk_f16_to_f32_serial, nk_f32_to_f16_serial)    // nk_each_sum_f16_serial
@@ -253,6 +265,16 @@ NK_PUBLIC void nk_each_fma_f64c_serial(nk_f64c_t const *a, nk_f64c_t const *b, n
     }
 }
+#if defined(NDEBUG)
+#if defined(_MSC_VER)
+#pragma optimize("", on)
+#elif defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+#endif
 #if defined(__cplusplus)
 } // extern "C"
 #endif

package/include/numkong/geospatial/haswell.h CHANGED Viewed

@@ -24,7 +24,7 @@
 #if NK_TARGET_HASWELL
 #include "numkong/types.h"
-#include "numkong/trigonometry/haswell.h" // `nk_sin_f64x4_haswell_`, `nk_cos_f64x4_haswell_`, `nk_atan2_f64x4_haswell_`, etc.
+#include "numkong/trigonometry/haswell.h" // `nk_sin_f64x4_haswell_`, `nk_cos_f64x4_haswell_`, `nk_atan2_f64x4_haswell_`
 #if defined(__cplusplus)
 extern "C" {

package/include/numkong/geospatial/neon.h CHANGED Viewed

@@ -21,7 +21,7 @@
 #if NK_TARGET_NEON
 #include "numkong/types.h"
-#include "numkong/trigonometry/neon.h" // `nk_sin_f64x2_neon_`, `nk_cos_f64x2_neon_`, `nk_atan2_f64x2_neon_`, etc.
+#include "numkong/trigonometry/neon.h" // `nk_sin_f64x2_neon_`, `nk_cos_f64x2_neon_`, `nk_atan2_f64x2_neon_`
 #if defined(__cplusplus)
 extern "C" {

package/include/numkong/geospatial/serial.h CHANGED Viewed

@@ -11,7 +11,7 @@
 #include "numkong/types.h"
 #include "numkong/spatial/serial.h"      // `nk_f64_sqrt_serial`, `nk_f32_sqrt_serial`
-#include "numkong/trigonometry/serial.h" // `nk_f64_sin`, `nk_f64_cos`, `nk_f64_atan2`, etc.
+#include "numkong/trigonometry/serial.h" // `nk_f64_sin`, `nk_f64_cos`, `nk_f64_atan2`
 #if defined(__cplusplus)
 extern "C" {

package/include/numkong/geospatial/skylake.h CHANGED Viewed

@@ -24,7 +24,7 @@
 #if NK_TARGET_SKYLAKE
 #include "numkong/types.h"
-#include "numkong/trigonometry/skylake.h" // `nk_sin_f64x8_skylake_`, `nk_cos_f64x8_skylake_`, `nk_atan2_f64x8_skylake_`, etc.
+#include "numkong/trigonometry/skylake.h" // `nk_sin_f64x8_skylake_`, `nk_cos_f64x8_skylake_`, `nk_atan2_f64x8_skylake_`
 #if defined(__cplusplus)
 extern "C" {

package/include/numkong/maxsim/sme.h CHANGED Viewed

@@ -46,7 +46,8 @@
 #if NK_TARGET_ARM64_
 #if NK_TARGET_SME
-#include "numkong/dots/sme.h" // nk_dots_sme_packed_header_t, nk_dots_pack_{f16,bf16}_sme, nk_dots_packed_size_{f16,bf16}_sme
+#include "numkong/dots/sme.h"   // `nk_dots_sme_packed_header_t`
+#include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
 #if defined(__cplusplus)
 extern "C" {
@@ -90,10 +91,9 @@ NK_STATIC_ASSERT(sizeof(nk_maxsim_sme_packed_header_t) == 64, nk_maxsim_sme_pack
  *
  *  1-tile remainder: uses ZA0 only, with predicated loads for partial tiles.
  */
-__arm_locally_streaming __arm_new("za") static void nk_maxsim_packed_f16_streaming_( //
-    void const *query_packed, void const *document_packed,                           //
-    nk_size_t query_count, nk_size_t document_count,                                 //
-    nk_size_t depth, nk_f32_t *result) {
+__arm_new("za") static void nk_maxsim_packed_f16_streaming_( //
+    void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
+    nk_size_t depth, nk_f32_t *result) NK_STREAMING_ {
     nk_maxsim_sme_packed_header_t const *query_header = (nk_maxsim_sme_packed_header_t const *)query_packed;
     nk_maxsim_sme_packed_header_t const *document_header = (nk_maxsim_sme_packed_header_t const *)document_packed;
@@ -258,18 +258,19 @@ __arm_locally_streaming __arm_new("za") static void nk_maxsim_packed_f16_streami
             document_inverse_norms_f32x);
         svfloat32_t angular_distance_f32x = svmax_f32_x(
             row_predicate_b32x, svsub_f32_x(row_predicate_b32x, svdup_f32(1.0f), cosine_f32x), svdup_f32(0.0f));
-        total_angular_distance += svaddv_f32(row_predicate_b32x, angular_distance_f32x);
+        total_angular_distance += nk_svaddv_f32_(row_predicate_b32x, angular_distance_f32x);
     }
     *result = total_angular_distance;
 }
-NK_PUBLIC void nk_maxsim_packed_f16_sme(                              //
-    void const *query_packed, void const *document_packed,            //
-    nk_size_t query_count, nk_size_t document_count, nk_size_t depth, //
-    nk_f32_t *result) {                                               //
+NK_PUBLIC void nk_maxsim_packed_f16_sme( //
+    void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
+    nk_size_t depth, nk_f32_t *result) {
+    nk_sme_start_streaming_();
     nk_maxsim_packed_f16_streaming_(query_packed, document_packed, query_count, document_count, depth, result);
+    nk_sme_stop_streaming_();
 }
 /**
@@ -281,10 +282,9 @@ NK_PUBLIC void nk_maxsim_packed_f16_sme(                              //
  *
  *  1-tile remainder: uses ZA0 only, with predicated loads for partial tiles.
  */
-__arm_locally_streaming __arm_new("za") static void nk_maxsim_packed_bf16_streaming_( //
-    void const *query_packed, void const *document_packed,                            //
-    nk_size_t query_count, nk_size_t document_count,                                  //
-    nk_size_t depth, nk_f32_t *result) {
+__arm_new("za") static void nk_maxsim_packed_bf16_streaming_( //
+    void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
+    nk_size_t depth, nk_f32_t *result) NK_STREAMING_ {
     nk_maxsim_sme_packed_header_t const *query_header = (nk_maxsim_sme_packed_header_t const *)query_packed;
     nk_maxsim_sme_packed_header_t const *document_header = (nk_maxsim_sme_packed_header_t const *)document_packed;
@@ -454,18 +454,19 @@ __arm_locally_streaming __arm_new("za") static void nk_maxsim_packed_bf16_stream
             document_inverse_norms_f32x);
         svfloat32_t angular_distance_f32x = svmax_f32_x(
             row_predicate_b32x, svsub_f32_x(row_predicate_b32x, svdup_f32(1.0f), cosine_f32x), svdup_f32(0.0f));
-        total_angular_distance += svaddv_f32(row_predicate_b32x, angular_distance_f32x);
+        total_angular_distance += nk_svaddv_f32_(row_predicate_b32x, angular_distance_f32x);
     }
     *result = total_angular_distance;
 }
-NK_PUBLIC void nk_maxsim_packed_bf16_sme(                             //
-    void const *query_packed, void const *document_packed,            //
-    nk_size_t query_count, nk_size_t document_count, nk_size_t depth, //
-    nk_f32_t *result) {                                               //
+NK_PUBLIC void nk_maxsim_packed_bf16_sme( //
+    void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
+    nk_size_t depth, nk_f32_t *result) {
+    nk_sme_start_streaming_();
     nk_maxsim_packed_bf16_streaming_(query_packed, document_packed, query_count, document_count, depth, result);
+    nk_sme_stop_streaming_();
 }
 NK_PUBLIC nk_size_t nk_maxsim_packed_size_bf16_sme(nk_size_t columns, nk_size_t depth) { //
@@ -649,7 +650,47 @@ NK_PUBLIC nk_f64_t nk_maxsim_reduce_dot_f32_ssve_(                         //
         svfloat64_t b_odd_f64x = svcvtlt_f64_f32_x(predicate_odd_b64x, b_f32x);
         accumulator_odd_f64x = svmla_f64_m(predicate_odd_b64x, accumulator_odd_f64x, a_odd_f64x, b_odd_f64x);
     }
-    return svaddv_f64(svptrue_b64(), accumulator_even_f64x) + svaddv_f64(svptrue_b64(), accumulator_odd_f64x);
+    return nk_svaddv_f64_(svptrue_b64(), accumulator_even_f64x) + nk_svaddv_f64_(svptrue_b64(), accumulator_odd_f64x);
+}
+/**
+ *  Streaming-compatible angular distance accumulation from pre-reduced dot products
+ *  and contiguous f64 norm arrays.
+ *  Computes rsqrt via Newton-Raphson and accumulates `1 - dot / sqrt(||q||^2 * ||d||^2)`.
+ */
+NK_PUBLIC nk_f64_t nk_maxsim_angular_from_dots_ssve_(                                    //
+    nk_f64_t const *dot_products, nk_size_t count,                                       //
+    nk_f64_t const *query_norms_f64, nk_f64_t const *document_norms_f64) NK_STREAMING_ { //
+    nk_f64_t total_angular_distance_f64 = 0.0;
+    nk_size_t const vector_length = svcntd();
+    for (nk_size_t i = 0; i < count; i += vector_length) {
+        svbool_t predicate_b64x = svwhilelt_b64_u64(i, count);
+        svfloat64_t dot_products_f64x = svld1_f64(predicate_b64x, dot_products + i);
+        svfloat64_t query_norms_f64x = svld1_f64(predicate_b64x, query_norms_f64 + i);
+        svfloat64_t document_norms_f64x = svld1_f64(predicate_b64x, document_norms_f64 + i);
+        // norm_product = query_norm * document_norm
+        svfloat64_t norm_products_f64x = svmul_f64_x(predicate_b64x, query_norms_f64x, document_norms_f64x);
+        // Newton-Raphson rsqrt: estimate then two refinement steps
+        svfloat64_t rsqrt_f64x = svrsqrte_f64(norm_products_f64x);
+        rsqrt_f64x = svmul_f64_x(predicate_b64x, rsqrt_f64x,
+                                 svrsqrts_f64(svmul_f64_x(predicate_b64x, norm_products_f64x, rsqrt_f64x), rsqrt_f64x));
+        rsqrt_f64x = svmul_f64_x(predicate_b64x, rsqrt_f64x,
+                                 svrsqrts_f64(svmul_f64_x(predicate_b64x, norm_products_f64x, rsqrt_f64x), rsqrt_f64x));
+        // cosine = dot_product * rsqrt(norm_product), zeroed where norm <= 0
+        svbool_t positive_b64x = svcmpgt_f64(predicate_b64x, norm_products_f64x, svdup_n_f64(0.0));
+        svfloat64_t cosine_f64x = svmul_f64_z(positive_b64x, dot_products_f64x, rsqrt_f64x);
+        // angular_distance = max(0, 1 - cosine)
+        svfloat64_t angular_distance_f64x = svsub_f64_x(predicate_b64x, svdup_f64(1.0), cosine_f64x);
+        angular_distance_f64x = svmax_f64_x(predicate_b64x, angular_distance_f64x, svdup_f64(0.0));
+        total_angular_distance_f64 += nk_svaddv_f64_(predicate_b64x, angular_distance_f64x);
+    }
+    return total_angular_distance_f64;
 }
 /**
@@ -661,10 +702,9 @@ NK_PUBLIC nk_f64_t nk_maxsim_reduce_dot_f32_ssve_(                         //
  *  Refinement: tile-wide interleaved f64 dot products for the winning (query, document) pairs.
  *  Angular distance: 1 - dot / sqrt(||q||^2 * ||d||^2), accumulated with f64.
  */
-__arm_locally_streaming __arm_new("za") static void nk_maxsim_packed_f32_streaming_( //
-    void const *query_packed, void const *document_packed,                           //
-    nk_size_t query_count, nk_size_t document_count, nk_size_t depth,                //
-    nk_f64_t *result) {
+__arm_new("za") static void nk_maxsim_packed_f32_streaming_( //
+    void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
+    nk_size_t depth, nk_f64_t *result) NK_STREAMING_ {
     nk_maxsim_sme_packed_header_t const *query_header = (nk_maxsim_sme_packed_header_t const *)query_packed;
     nk_maxsim_sme_packed_header_t const *document_header = (nk_maxsim_sme_packed_header_t const *)document_packed;
@@ -895,48 +935,47 @@ __arm_locally_streaming __arm_new("za") static void nk_maxsim_packed_f32_streami
                                                  svcvtlt_f64_f32_x(predicate_odd_b64x, document_values_3_f32x));
             }
-            // Reduce accumulators and compute angular distance per row
-            svfloat64_t *batch_accumulators[] = {&accumulator_0_f64x, &accumulator_1_f64x, &accumulator_2_f64x,
-                                                 &accumulator_3_f64x};
-            for (nk_size_t batch_index = 0; batch_index < 4; batch_index++) {
-                nk_size_t query_index = row_start + row_batch_start + batch_index;
-                nk_u32_t best_document_index = best_document_indices[row_batch_start + batch_index];
-                nk_f64_t dot_product_f64 = svaddv_f64(svptrue_b64(), *batch_accumulators[batch_index]);
-                nk_f64_t norm_product_f64 = (nk_f64_t)query_norms[query_index] *
-                                            (nk_f64_t)document_norms[best_document_index];
-                nk_f64_t cosine_f64 = (norm_product_f64 > 0.0) ? dot_product_f64 * nk_f64_rsqrt_serial(norm_product_f64)
-                                                               : 0.0;
-                nk_f64_t angular_distance_f64 = 1.0 - cosine_f64;
-                if (angular_distance_f64 < 0.0) angular_distance_f64 = 0.0;
-                total_angular_distance_f64 += angular_distance_f64;
+            // Reduce SVE accumulators to scalars and compute angular distances
+            nk_f64_t dot_products_f64[4];
+            dot_products_f64[0] = nk_svaddv_f64_(svptrue_b64(), accumulator_0_f64x);
+            dot_products_f64[1] = nk_svaddv_f64_(svptrue_b64(), accumulator_1_f64x);
+            dot_products_f64[2] = nk_svaddv_f64_(svptrue_b64(), accumulator_2_f64x);
+            dot_products_f64[3] = nk_svaddv_f64_(svptrue_b64(), accumulator_3_f64x);
+            nk_f64_t batch_query_norms_f64[4], batch_document_norms_f64[4];
+            for (nk_size_t i = 0; i < 4; i++) {
+                batch_query_norms_f64[i] = (nk_f64_t)query_norms[row_start + row_batch_start + i];
+                batch_document_norms_f64[i] = (nk_f64_t)document_norms[best_document_indices[row_batch_start + i]];
             }
+            total_angular_distance_f64 += nk_maxsim_angular_from_dots_ssve_(dot_products_f64, 4, batch_query_norms_f64,
+                                                                            batch_document_norms_f64);
         }
-        // Remainder: 1 row at a time
-        for (; row_batch_start < rows_remaining; row_batch_start++) {
-            nk_size_t query_index = row_start + row_batch_start;
-            nk_u32_t best_document_index = best_document_indices[row_batch_start];
-            nk_f64_t dot_product_f64 = nk_maxsim_reduce_dot_f32_ssve_(query_original_ptrs[row_batch_start],
-                                                                      document_original_ptrs[row_batch_start], depth);
-            nk_f64_t norm_product_f64 = (nk_f64_t)query_norms[query_index] *
-                                        (nk_f64_t)document_norms[best_document_index];
-            nk_f64_t cosine_f64 = (norm_product_f64 > 0.0) ? dot_product_f64 * nk_f64_rsqrt_serial(norm_product_f64)
-                                                           : 0.0;
-            nk_f64_t angular_distance_f64 = 1.0 - cosine_f64;
-            if (angular_distance_f64 < 0.0) angular_distance_f64 = 0.0;
-            total_angular_distance_f64 += angular_distance_f64;
+        // Remainder: compute dot products then batch the angular distance
+        nk_size_t remainder_count = rows_remaining - row_batch_start;
+        if (remainder_count > 0) {
+            nk_f64_t remainder_dot_products_f64[3];
+            nk_f64_t remainder_query_norms_f64[3], remainder_document_norms_f64[3];
+            for (nk_size_t i = 0; i < remainder_count; i++) {
+                remainder_dot_products_f64[i] = nk_maxsim_reduce_dot_f32_ssve_(
+                    query_original_ptrs[row_batch_start + i], document_original_ptrs[row_batch_start + i], depth);
+                remainder_query_norms_f64[i] = (nk_f64_t)query_norms[row_start + row_batch_start + i];
+                remainder_document_norms_f64[i] = (nk_f64_t)document_norms[best_document_indices[row_batch_start + i]];
+            }
+            total_angular_distance_f64 += nk_maxsim_angular_from_dots_ssve_(
+                remainder_dot_products_f64, remainder_count, remainder_query_norms_f64, remainder_document_norms_f64);
         }
     }
     *result = total_angular_distance_f64;
 }
-NK_PUBLIC void nk_maxsim_packed_f32_sme(                              //
-    void const *query_packed, void const *document_packed,            //
-    nk_size_t query_count, nk_size_t document_count, nk_size_t depth, //
-    nk_f64_t *result) {                                               //
+NK_PUBLIC void nk_maxsim_packed_f32_sme( //
+    void const *query_packed, void const *document_packed, nk_size_t query_count, nk_size_t document_count,
+    nk_size_t depth, nk_f64_t *result) {
+    nk_sme_start_streaming_();
     nk_maxsim_packed_f32_streaming_(query_packed, document_packed, query_count, document_count, depth, result);
+    nk_sme_stop_streaming_();
 }
 #if defined(__clang__)