npm - numkong - Versions diffs - 7.4.5 → 7.5.0 - Mend

numkong 7.4.5 → 7.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

package/README.md +1 -0
package/binding.gyp +81 -5
package/c/dispatch_f16.c +23 -0
package/c/numkong.c +0 -13
package/include/numkong/attention/sme.h +34 -31
package/include/numkong/capabilities.h +2 -15
package/include/numkong/cast/neon.h +15 -0
package/include/numkong/curved/smef64.h +82 -62
package/include/numkong/dot/rvvbf16.h +1 -1
package/include/numkong/dot/rvvhalf.h +1 -1
package/include/numkong/dot/sve.h +6 -5
package/include/numkong/dot/svebfdot.h +2 -1
package/include/numkong/dot/svehalf.h +6 -5
package/include/numkong/dot/svesdot.h +3 -2
package/include/numkong/dots/graniteamx.h +733 -0
package/include/numkong/dots/serial.h +11 -4
package/include/numkong/dots/sme.h +172 -140
package/include/numkong/dots/smebi32.h +14 -11
package/include/numkong/dots/smef64.h +31 -26
package/include/numkong/dots.h +29 -3
package/include/numkong/each/serial.h +22 -0
package/include/numkong/geospatial/haswell.h +1 -1
package/include/numkong/geospatial/neon.h +1 -1
package/include/numkong/geospatial/serial.h +1 -1
package/include/numkong/geospatial/skylake.h +1 -1
package/include/numkong/maxsim/sme.h +34 -33
package/include/numkong/mesh/serial.h +22 -0
package/include/numkong/reduce/neon.h +29 -0
package/include/numkong/reduce/neonbfdot.h +2 -2
package/include/numkong/reduce/neonfhm.h +4 -4
package/include/numkong/reduce/sve.h +52 -0
package/include/numkong/reduce.h +4 -0
package/include/numkong/set/sve.h +6 -5
package/include/numkong/sets/smebi32.h +35 -30
package/include/numkong/sparse/sve2.h +3 -2
package/include/numkong/spatial/sve.h +7 -6
package/include/numkong/spatial/svebfdot.h +7 -4
package/include/numkong/spatial/svehalf.h +5 -4
package/include/numkong/spatial/svesdot.h +9 -8
package/include/numkong/spatials/graniteamx.h +173 -0
package/include/numkong/spatials/serial.h +22 -0
package/include/numkong/spatials/sme.h +391 -350
package/include/numkong/spatials/smef64.h +79 -70
package/include/numkong/spatials.h +37 -4
package/include/numkong/types.h +59 -0
package/javascript/dist/cjs/numkong.js +13 -0
package/javascript/dist/esm/numkong.js +13 -0
package/javascript/numkong.c +56 -12
package/javascript/numkong.ts +13 -0
package/package.json +7 -7
package/probes/probe.js +2 -2
package/wasm/numkong.wasm +0 -0

package/include/numkong/spatials/smef64.h CHANGED Viewed

@@ -13,6 +13,7 @@
 #if NK_TARGET_SME
 #include "numkong/dots/serial.h"
+#include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
 #include "numkong/dots/smef64.h"
 #if defined(__cplusplus)
@@ -44,7 +45,7 @@ NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f32_ssve_(nk_f32_t const *data, nk_size_
         svfloat64_t values_odd_f64x = svcvtlt_f64_f32_x(predicate_odd_b64x, values_f32x);
         accumulator_odd_f64x = svmla_f64_m(predicate_odd_b64x, accumulator_odd_f64x, values_odd_f64x, values_odd_f64x);
     }
-    return svaddv_f64(svptrue_b64(), accumulator_even_f64x) + svaddv_f64(svptrue_b64(), accumulator_odd_f64x);
+    return nk_svaddv_f64_(svptrue_b64(), accumulator_even_f64x) + nk_svaddv_f64_(svptrue_b64(), accumulator_odd_f64x);
 }
 NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f64_ssve_(nk_f64_t const *data, nk_size_t count) NK_STREAMING_ {
@@ -55,7 +56,7 @@ NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f64_ssve_(nk_f64_t const *data, nk_size_
         svfloat64_t values_f64x = svld1_f64(predicate_b64x, data + i);
         accumulator_f64x = svmla_f64_m(predicate_b64x, accumulator_f64x, values_f64x, values_f64x);
     }
-    return svaddv_f64(svptrue_b64(), accumulator_f64x);
+    return nk_svaddv_f64_(svptrue_b64(), accumulator_f64x);
 }
 NK_PUBLIC svfloat64_t nk_angulars_from_dot_f64x_ssvef64_(svbool_t predicate_b64x, svfloat64_t dots_f64x,
@@ -85,10 +86,9 @@ NK_PUBLIC svfloat64_t nk_euclideans_from_dot_f64x_ssvef64_(svbool_t predicate_b6
 #pragma region F32 Packed Angular
-__arm_locally_streaming static void nk_angulars_packed_f32_smef64_finalize_streaming_( //
-    nk_f32_t const *a, void const *b_packed, nk_f64_t *c,                              //
-    nk_size_t rows, nk_size_t columns, nk_size_t depth,                                //
-    nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
+static void nk_angulars_packed_f32_smef64_finalize_ssve_( //
+    nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
+    nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
     nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
     nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
@@ -110,26 +110,26 @@ __arm_locally_streaming static void nk_angulars_packed_f32_smef64_finalize_strea
     }
 }
-NK_PUBLIC void nk_angulars_packed_f32_smef64(             //
-    nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
-    nk_size_t rows, nk_size_t columns, nk_size_t depth,   //
+NK_PUBLIC void nk_angulars_packed_f32_smef64( //
+    nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
     nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
     nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f32_t);
     nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_packed_f32_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
-    nk_angulars_packed_f32_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
-                                                      c_stride_elements);
+    nk_angulars_packed_f32_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
+                                                 c_stride_elements);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F32 Packed Angular
 #pragma region F32 Packed Euclidean
-__arm_locally_streaming static void nk_euclideans_packed_f32_smef64_finalize_streaming_( //
-    nk_f32_t const *a, void const *b_packed, nk_f64_t *c,                                //
-    nk_size_t rows, nk_size_t columns, nk_size_t depth,                                  //
-    nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
+static void nk_euclideans_packed_f32_smef64_finalize_ssve_( //
+    nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
+    nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
     nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
     nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
@@ -151,25 +151,26 @@ __arm_locally_streaming static void nk_euclideans_packed_f32_smef64_finalize_str
     }
 }
-NK_PUBLIC void nk_euclideans_packed_f32_smef64(           //
-    nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
-    nk_size_t rows, nk_size_t columns, nk_size_t depth,   //
+NK_PUBLIC void nk_euclideans_packed_f32_smef64( //
+    nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
     nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
     nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f32_t);
     nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_packed_f32_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
-    nk_euclideans_packed_f32_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
-                                                        c_stride_elements);
+    nk_euclideans_packed_f32_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
+                                                   c_stride_elements);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F32 Packed Euclidean
 #pragma region F32 Symmetric Angular
-__arm_locally_streaming static void nk_angulars_symmetric_f32_smef64_finalize_streaming_(         //
-    nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, //
-    nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
+static void nk_angulars_symmetric_f32_smef64_finalize_ssve_( //
+    nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
+    nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
     // Phase 1: cache row norms on diagonal
     for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
         nk_f32_t const *row_vector = vectors + row_index * stride_elements;
@@ -204,25 +205,27 @@ __arm_locally_streaming static void nk_angulars_symmetric_f32_smef64_finalize_st
         result[row_index * result_stride_elements + row_index] = 0;
 }
-NK_PUBLIC void nk_angulars_symmetric_f32_smef64(                                                  //
-    nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, //
-    nk_f64_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
+NK_PUBLIC void nk_angulars_symmetric_f32_smef64( //
+    nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
+    nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
     nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f32_t);
     nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_symmetric_f32_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
                                             result_stride_elements, row_start, row_count);
-    nk_angulars_symmetric_f32_smef64_finalize_streaming_(vectors, vectors_count, depth, stride_elements, result,
-                                                         result_stride_elements, row_start, row_count);
+    nk_angulars_symmetric_f32_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
+                                                    result_stride_elements, row_start, row_count);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F32 Symmetric Angular
 #pragma region F32 Symmetric Euclidean
-__arm_locally_streaming static void nk_euclideans_symmetric_f32_smef64_finalize_streaming_(       //
-    nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, //
-    nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
+static void nk_euclideans_symmetric_f32_smef64_finalize_ssve_( //
+    nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
+    nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
     // Phase 1: cache row norms on diagonal
     for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
         nk_f32_t const *row_vector = vectors + row_index * stride_elements;
@@ -257,26 +260,27 @@ __arm_locally_streaming static void nk_euclideans_symmetric_f32_smef64_finalize_
         result[row_index * result_stride_elements + row_index] = 0;
 }
-NK_PUBLIC void nk_euclideans_symmetric_f32_smef64(                                                //
-    nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, //
-    nk_f64_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
+NK_PUBLIC void nk_euclideans_symmetric_f32_smef64( //
+    nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
+    nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
     nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f32_t);
     nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_symmetric_f32_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
                                             result_stride_elements, row_start, row_count);
-    nk_euclideans_symmetric_f32_smef64_finalize_streaming_(vectors, vectors_count, depth, stride_elements, result,
-                                                           result_stride_elements, row_start, row_count);
+    nk_euclideans_symmetric_f32_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
+                                                      result_stride_elements, row_start, row_count);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F32 Symmetric Euclidean
 #pragma region F64 Packed Angular
-__arm_locally_streaming static void nk_angulars_packed_f64_smef64_finalize_streaming_( //
-    nk_f64_t const *a, void const *b_packed, nk_f64_t *c,                              //
-    nk_size_t rows, nk_size_t columns, nk_size_t depth,                                //
-    nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
+static void nk_angulars_packed_f64_smef64_finalize_ssve_( //
+    nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
+    nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
     nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
     nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
@@ -298,26 +302,26 @@ __arm_locally_streaming static void nk_angulars_packed_f64_smef64_finalize_strea
     }
 }
-NK_PUBLIC void nk_angulars_packed_f64_smef64(             //
-    nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
-    nk_size_t rows, nk_size_t columns, nk_size_t depth,   //
+NK_PUBLIC void nk_angulars_packed_f64_smef64( //
+    nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
     nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
     nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f64_t);
     nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_packed_f64_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
-    nk_angulars_packed_f64_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
-                                                      c_stride_elements);
+    nk_angulars_packed_f64_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
+                                                 c_stride_elements);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F64 Packed Angular
 #pragma region F64 Packed Euclidean
-__arm_locally_streaming static void nk_euclideans_packed_f64_smef64_finalize_streaming_( //
-    nk_f64_t const *a, void const *b_packed, nk_f64_t *c,                                //
-    nk_size_t rows, nk_size_t columns, nk_size_t depth,                                  //
-    nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
+static void nk_euclideans_packed_f64_smef64_finalize_ssve_( //
+    nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
+    nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
     nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
     nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
@@ -339,25 +343,26 @@ __arm_locally_streaming static void nk_euclideans_packed_f64_smef64_finalize_str
     }
 }
-NK_PUBLIC void nk_euclideans_packed_f64_smef64(           //
-    nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
-    nk_size_t rows, nk_size_t columns, nk_size_t depth,   //
+NK_PUBLIC void nk_euclideans_packed_f64_smef64( //
+    nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
     nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
     nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f64_t);
     nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_packed_f64_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
-    nk_euclideans_packed_f64_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
-                                                        c_stride_elements);
+    nk_euclideans_packed_f64_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
+                                                   c_stride_elements);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F64 Packed Euclidean
 #pragma region F64 Symmetric Angular
-__arm_locally_streaming static void nk_angulars_symmetric_f64_smef64_finalize_streaming_(         //
-    nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, //
-    nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
+static void nk_angulars_symmetric_f64_smef64_finalize_ssve_( //
+    nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
+    nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
     // Phase 1: cache row norms on diagonal
     for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
         nk_f64_t const *row_vector = vectors + row_index * stride_elements;
@@ -392,25 +397,27 @@ __arm_locally_streaming static void nk_angulars_symmetric_f64_smef64_finalize_st
         result[row_index * result_stride_elements + row_index] = 0;
 }
-NK_PUBLIC void nk_angulars_symmetric_f64_smef64(                                                  //
-    nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, //
-    nk_f64_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
+NK_PUBLIC void nk_angulars_symmetric_f64_smef64( //
+    nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
+    nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
     nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f64_t);
     nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_symmetric_f64_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
                                             result_stride_elements, row_start, row_count);
-    nk_angulars_symmetric_f64_smef64_finalize_streaming_(vectors, vectors_count, depth, stride_elements, result,
-                                                         result_stride_elements, row_start, row_count);
+    nk_angulars_symmetric_f64_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
+                                                    result_stride_elements, row_start, row_count);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F64 Symmetric Angular
 #pragma region F64 Symmetric Euclidean
-__arm_locally_streaming static void nk_euclideans_symmetric_f64_smef64_finalize_streaming_(       //
-    nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, //
-    nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
+static void nk_euclideans_symmetric_f64_smef64_finalize_ssve_( //
+    nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
+    nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
     // Phase 1: cache row norms on diagonal
     for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
         nk_f64_t const *row_vector = vectors + row_index * stride_elements;
@@ -445,17 +452,19 @@ __arm_locally_streaming static void nk_euclideans_symmetric_f64_smef64_finalize_
         result[row_index * result_stride_elements + row_index] = 0;
 }
-NK_PUBLIC void nk_euclideans_symmetric_f64_smef64(                                                //
-    nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, //
-    nk_f64_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
+NK_PUBLIC void nk_euclideans_symmetric_f64_smef64( //
+    nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
+    nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
     nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f64_t);
     nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
+    nk_sme_start_streaming_();
     nk_dots_symmetric_f64_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
                                             result_stride_elements, row_start, row_count);
-    nk_euclideans_symmetric_f64_smef64_finalize_streaming_(vectors, vectors_count, depth, stride_elements, result,
-                                                           result_stride_elements, row_start, row_count);
+    nk_euclideans_symmetric_f64_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
+                                                      result_stride_elements, row_start, row_count);
+    nk_sme_stop_streaming_();
 }
 #pragma endregion F64 Symmetric Euclidean

package/include/numkong/spatials.h CHANGED Viewed

@@ -739,6 +739,28 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk
                                                       nk_size_t row_start, nk_size_t row_count);
 #endif // NK_TARGET_SAPPHIREAMX
+/*  Granite Rapids backends using Intel AMX-FP16.
+ *  Native FP16 spatial kernels.
+ */
+#if NK_TARGET_GRANITEAMX
+/** @copydoc nk_angulars_packed_f16 */
+NK_PUBLIC void nk_angulars_packed_f16_graniteamx(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
+                                                 nk_size_t rows, nk_size_t cols, nk_size_t depth,
+                                                 nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
+/** @copydoc nk_angulars_symmetric_f16 */
+NK_PUBLIC void nk_angulars_symmetric_f16_graniteamx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
+                                                    nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
+                                                    nk_size_t row_start, nk_size_t row_count);
+/** @copydoc nk_euclideans_packed_f16 */
+NK_PUBLIC void nk_euclideans_packed_f16_graniteamx(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
+                                                   nk_size_t rows, nk_size_t cols, nk_size_t depth,
+                                                   nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
+/** @copydoc nk_euclideans_symmetric_f16 */
+NK_PUBLIC void nk_euclideans_symmetric_f16_graniteamx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
+                                                      nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
+                                                      nk_size_t row_start, nk_size_t row_count);
+#endif // NK_TARGET_GRANITEAMX
 /*  ARM SME backends using Scalable Matrix Extension.
  *  SME provides ZA tile registers for outer product operations.
  *  F16/BF16/I8/U8/E4M3 use ZA32 tiles, F32/F64 use ZA64 tiles (FEAT_SME_F64F64).
@@ -2078,6 +2100,7 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
 #include "numkong/spatials/alder.h"
 #include "numkong/spatials/sierra.h"
 #include "numkong/spatials/sapphireamx.h"
+#include "numkong/spatials/graniteamx.h"
 #include "numkong/spatials/rvv.h"
 #include "numkong/spatials/v128relaxed.h"
 #include "numkong/spatials/sme.h"
@@ -2290,7 +2313,9 @@ NK_PUBLIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t ve
 NK_PUBLIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
                                       nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
                                       nk_size_t r_stride_in_bytes) {
-#if NK_TARGET_SME
+#if NK_TARGET_GRANITEAMX
+    nk_angulars_packed_f16_graniteamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
+#elif NK_TARGET_SME
     nk_angulars_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
 #elif NK_TARGET_NEONFHM
     nk_angulars_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
@@ -2311,7 +2336,10 @@ NK_PUBLIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed, n
 NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
                                          nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
                                          nk_size_t row_start, nk_size_t row_count) {
-#if NK_TARGET_SME
+#if NK_TARGET_GRANITEAMX
+    nk_angulars_symmetric_f16_graniteamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
+                                         row_count);
+#elif NK_TARGET_SME
     nk_angulars_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
 #elif NK_TARGET_NEONFHM
     nk_angulars_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
@@ -2337,7 +2365,9 @@ NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vect
 NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
                                         nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
                                         nk_size_t r_stride_in_bytes) {
-#if NK_TARGET_SME
+#if NK_TARGET_GRANITEAMX
+    nk_euclideans_packed_f16_graniteamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
+#elif NK_TARGET_SME
     nk_euclideans_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
 #elif NK_TARGET_NEONFHM
     nk_euclideans_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
@@ -2358,7 +2388,10 @@ NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed,
 NK_PUBLIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
                                            nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
                                            nk_size_t row_start, nk_size_t row_count) {
-#if NK_TARGET_SME
+#if NK_TARGET_GRANITEAMX
+    nk_euclideans_symmetric_f16_graniteamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
+                                           row_count);
+#elif NK_TARGET_SME
     nk_euclideans_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
 #elif NK_TARGET_NEONFHM
     nk_euclideans_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,

package/include/numkong/types.h CHANGED Viewed

@@ -69,6 +69,20 @@
 #define _GNU_SOURCE
 #endif
+// MSan (MemorySanitizer) cannot track data flow through SVE horizontal reductions
+// like `svaddv`, which move data from vector registers to scalar registers via
+// architecture-specific paths invisible to the compiler. `nk_unpoison_` marks the
+// resulting scalar as initialized so MSan does not report false positives.
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#include <sanitizer/msan_interface.h>
+#define nk_unpoison_(ptr, size) __msan_unpoison((ptr), (size))
+#endif
+#endif
+#ifndef nk_unpoison_
+#define nk_unpoison_(ptr, size) (void)(ptr), (void)(size)
+#endif
 // Inferring target OS: Windows, macOS, Linux, or FreeBSD
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
 #define NK_DEFINED_WINDOWS_ 1
@@ -1627,6 +1641,51 @@ NK_INTERNAL nk_size_t nk_sme_cntd_(void) {
     __asm__ __volatile__("smstart sm\n\t" "cntd %0\n\t" "smstop sm" : "=r"(r));
     return (nk_size_t)r;
 }
+/** @brief Enter streaming SVE mode (PSTATE.SM = 1). Caller is responsible for smstop. */
+NK_INTERNAL void nk_sme_start_streaming_(void) { __asm__ __volatile__("smstart sm" ::: "memory"); }
+/** @brief Exit streaming SVE mode (PSTATE.SM = 0). Must pair with nk_sme_start_streaming_. */
+NK_INTERNAL void nk_sme_stop_streaming_(void) { __asm__ __volatile__("smstop sm" ::: "memory"); }
+/**
+ *  SME runtime stubs — weak definitions for symbols the compiler may reference
+ *  from __arm_streaming or __arm_new("za") functions. Every TU that includes
+ *  this header emits a weak copy; the linker deduplicates to one.
+ *
+ *  - __arm_tpidr2_save / __arm_tpidr2_restore: lazy ZA save/restore protocol
+ *    used in __arm_new("za") prologues. Always no-ops in NumKong because no
+ *    NK_PUBLIC function carries ZA state (TPIDR2_EL0 is always null at entry).
+ *
+ *  - __arm_sc_memset / __arm_sc_memcpy / __arm_sc_memmove: streaming-compatible
+ *    memory routines the compiler may emit inside __arm_streaming functions.
+ *    Apple Clang provides these in its runtime; upstream LLVM does not.
+ */
+__attribute__((weak)) void __arm_tpidr2_save(void) {}
+__attribute__((weak)) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
+__attribute__((weak, target("+sme"))) void *__arm_sc_memset(void *d, int c, __SIZE_TYPE__ n) __arm_streaming_compatible {
+    unsigned char *p = (unsigned char *)d;
+    for (__SIZE_TYPE__ i = 0; i < n; i++) p[i] = (unsigned char)c;
+    return d;
+}
+__attribute__((weak, target("+sme"))) void *__arm_sc_memcpy(void *d, void const *s,
+                                                           __SIZE_TYPE__ n) __arm_streaming_compatible {
+    unsigned char *dp = (unsigned char *)d;
+    unsigned char const *sp = (unsigned char const *)s;
+    for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
+    return d;
+}
+__attribute__((weak, target("+sme"))) void *__arm_sc_memmove(void *d, void const *s,
+                                                            __SIZE_TYPE__ n) __arm_streaming_compatible {
+    unsigned char *dp = (unsigned char *)d;
+    unsigned char const *sp = (unsigned char const *)s;
+    if (dp < sp) {
+        for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
+    }
+    else {
+        for (__SIZE_TYPE__ i = n; i > 0; i--) dp[i - 1] = sp[i - 1];
+    }
+    return d;
+}
 #endif
 #ifdef __cplusplus

package/javascript/dist/cjs/numkong.js CHANGED Viewed

@@ -99,6 +99,19 @@ Object.defineProperty(exports, "PackedMatrix", { enumerable: true, get: function
 Object.defineProperty(exports, "DType", { enumerable: true, get: function () { return types_js_1.DType; } });
 Object.defineProperty(exports, "outputDtype", { enumerable: true, get: function () { return types_js_1.outputDtype; } });
 function loadNativeAddon() {
+    var _a;
+    // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
+    // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
+    // runtime (e.g. one loaded by another native addon) may already be
+    // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
+    // libiomp5 to coexist; it must be in `process.env` before the `require()`
+    // below triggers the addon's `dlopen`, since libomp's constructor reads
+    // the env during dependency resolution and it is too late to influence
+    // it afterwards. Left unguarded because the variable is harmless on
+    // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
+    // who set it to something else is respected by `??=`. See
+    // `python/numkong/__init__.py` for the Python analog.
+    (_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
     // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
     try {
         const req = (0, node_module_1.createRequire)(path.join(getDirName(), "noop.js"));

package/javascript/dist/esm/numkong.js CHANGED Viewed

@@ -31,6 +31,19 @@ import { existsSync } from "node:fs";
 import { getFileName, getRoot } from "bindings";
 import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype } from "./types.js";
 function loadNativeAddon() {
+    var _a;
+    // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
+    // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
+    // runtime (e.g. one loaded by another native addon) may already be
+    // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
+    // libiomp5 to coexist; it must be in `process.env` before the `require()`
+    // below triggers the addon's `dlopen`, since libomp's constructor reads
+    // the env during dependency resolution and it is too late to influence
+    // it afterwards. Left unguarded because the variable is harmless on
+    // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
+    // who set it to something else is respected by `??=`. See
+    // `python/numkong/__init__.py` for the Python analog.
+    (_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
     // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
     try {
         const req = createRequire(path.join(getDirName(), "noop.js"));

package/javascript/numkong.c CHANGED Viewed

@@ -9,10 +9,17 @@
 #include <string.h> // `strcmp` function
+#if defined(NK_USE_OPENMP)
+#include <omp.h>
+#endif
 #include <node_api.h> // `napi_*` functions — N-API v6+ for BigInt (Node ≥ 10.20)
 #include <numkong/numkong.h> // `nk_*` functions — must be first to bring `_GNU_SOURCE`
+#define NK_PARALLEL_PACKED_TILE    64
+#define NK_PARALLEL_SYMMETRIC_TILE 32
 /** @brief Global variable that caches the CPU capabilities, and is computed just once, when the module is loaded. */
 nk_capability_t static_capabilities = nk_cap_serial_k;
@@ -482,11 +489,11 @@ static napi_value api_dots_pack(napi_env env, napi_callback_info info) {
  * dtype
  */
 static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
-    size_t argc = 9;
-    napi_value args[9];
+    size_t argc = 10;
+    napi_value args[10];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
-    if (argc != 9) {
-        napi_throw_error(env, NULL, "Packed operation requires 9 arguments");
+    if (argc < 9 || argc > 10) {
+        napi_throw_error(env, NULL, "Packed operation requires 9-10 arguments (last is optional threads)");
         return NULL;
     }
@@ -533,8 +540,26 @@ static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_ke
         return NULL;
     }
-    kernel(a_data, packed_data, result_data, (nk_size_t)height, (nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride,
-           (nk_size_t)result_stride);
+    uint32_t threads = 1;
+    if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
+#if defined(NK_USE_OPENMP)
+    if (threads == 0) threads = (uint32_t)omp_get_max_threads();
+    omp_set_num_threads((int)threads);
+#endif
+    // `int` loop counter pre-declared: MSVC's OpenMP stays at 2.0 canonical
+    // form, which forbids in-init declarations and rejects 64-bit iterators
+    // — either would trip C3015.
+    int const tile_count = (int)nk_size_divide_round_up_(height, NK_PARALLEL_PACKED_TILE);
+    int tile_idx;
+#pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
+    for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+        nk_size_t row = (nk_size_t)tile_idx * NK_PARALLEL_PACKED_TILE;
+        nk_size_t chunk = (row + NK_PARALLEL_PACKED_TILE <= height) ? NK_PARALLEL_PACKED_TILE : (height - row);
+        kernel((char const *)a_data + row * a_stride, packed_data, (char *)result_data + row * result_stride, chunk,
+               (nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride, (nk_size_t)result_stride);
+    }
     return NULL;
 }
@@ -554,11 +579,11 @@ static napi_value api_euclideans_packed(napi_env env, napi_callback_info info) {
  * string dtype
  */
 static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
-    size_t argc = 9;
-    napi_value args[9];
+    size_t argc = 10;
+    napi_value args[10];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
-    if (argc != 9) {
-        napi_throw_error(env, NULL, "Symmetric operation requires 9 arguments");
+    if (argc < 9 || argc > 10) {
+        napi_throw_error(env, NULL, "Symmetric operation requires 9-10 arguments (last is optional threads)");
         return NULL;
     }
@@ -601,8 +626,27 @@ static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk
         return NULL;
     }
-    kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
-           (nk_size_t)result_stride, (nk_size_t)row_start, (nk_size_t)row_count);
+    uint32_t threads = 1;
+    if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
+#if defined(NK_USE_OPENMP)
+    if (threads == 0) threads = (uint32_t)omp_get_max_threads();
+    omp_set_num_threads((int)threads);
+#endif
+    // `int` loop counter pre-declared: see note at `api_packed_common`.
+    int const tile_count = (int)nk_size_divide_round_up_(row_count, NK_PARALLEL_SYMMETRIC_TILE);
+    int tile_idx;
+#pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
+    for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+        nk_size_t tile_start = (nk_size_t)row_start + (nk_size_t)tile_idx * NK_PARALLEL_SYMMETRIC_TILE;
+        nk_size_t tile_rows = (tile_start + NK_PARALLEL_SYMMETRIC_TILE <= (nk_size_t)row_start + row_count)
+                                  ? NK_PARALLEL_SYMMETRIC_TILE
+                                  : ((nk_size_t)row_start + row_count - tile_start);
+        kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
+               (nk_size_t)result_stride, tile_start, tile_rows);
+    }
     return NULL;
 }

package/javascript/numkong.ts CHANGED Viewed

@@ -33,6 +33,19 @@ import { getFileName, getRoot } from "bindings";
 import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype, KernelFamily } from "./types.js";
 function loadNativeAddon(): any {
+  // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
+  // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
+  // runtime (e.g. one loaded by another native addon) may already be
+  // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
+  // libiomp5 to coexist; it must be in `process.env` before the `require()`
+  // below triggers the addon's `dlopen`, since libomp's constructor reads
+  // the env during dependency resolution and it is too late to influence
+  // it afterwards. Left unguarded because the variable is harmless on
+  // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
+  // who set it to something else is respected by `??=`. See
+  // `python/numkong/__init__.py` for the Python analog.
+  process.env.KMP_DUPLICATE_LIB_OK ??= "TRUE";
   // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
   try {
     const req = createRequire(path.join(getDirName(), "noop.js"));