npm - numkong - Versions diffs - 7.4.5 → 7.6.0 - Mend

numkong 7.4.5 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

package/README.md +1 -0
package/binding.gyp +99 -5
package/c/dispatch_e5m2.c +23 -3
package/c/dispatch_f16.c +23 -0
package/c/numkong.c +0 -13
package/include/numkong/attention/sme.h +34 -31
package/include/numkong/capabilities.h +2 -15
package/include/numkong/cast/README.md +3 -0
package/include/numkong/cast/haswell.h +28 -64
package/include/numkong/cast/neon.h +15 -0
package/include/numkong/cast/serial.h +17 -0
package/include/numkong/cast/skylake.h +67 -52
package/include/numkong/cast.h +1 -0
package/include/numkong/curved/smef64.h +82 -62
package/include/numkong/dot/README.md +1 -0
package/include/numkong/dot/haswell.h +92 -13
package/include/numkong/dot/rvvbf16.h +1 -1
package/include/numkong/dot/rvvhalf.h +1 -1
package/include/numkong/dot/serial.h +15 -0
package/include/numkong/dot/skylake.h +61 -14
package/include/numkong/dot/sve.h +6 -5
package/include/numkong/dot/svebfdot.h +2 -1
package/include/numkong/dot/svehalf.h +6 -5
package/include/numkong/dot/svesdot.h +3 -2
package/include/numkong/dots/README.md +2 -0
package/include/numkong/dots/graniteamx.h +1167 -0
package/include/numkong/dots/haswell.h +28 -28
package/include/numkong/dots/sapphireamx.h +1 -1
package/include/numkong/dots/serial.h +33 -11
package/include/numkong/dots/skylake.h +28 -23
package/include/numkong/dots/sme.h +172 -140
package/include/numkong/dots/smebi32.h +14 -11
package/include/numkong/dots/smef64.h +31 -26
package/include/numkong/dots.h +41 -3
package/include/numkong/each/serial.h +39 -0
package/include/numkong/geospatial/haswell.h +1 -1
package/include/numkong/geospatial/neon.h +1 -1
package/include/numkong/geospatial/serial.h +15 -4
package/include/numkong/geospatial/skylake.h +1 -1
package/include/numkong/maxsim/serial.h +15 -0
package/include/numkong/maxsim/sme.h +34 -33
package/include/numkong/mesh/README.md +50 -44
package/include/numkong/mesh/genoa.h +462 -0
package/include/numkong/mesh/haswell.h +806 -933
package/include/numkong/mesh/neon.h +871 -943
package/include/numkong/mesh/neonbfdot.h +382 -522
package/include/numkong/mesh/neonfhm.h +676 -0
package/include/numkong/mesh/rvv.h +404 -319
package/include/numkong/mesh/serial.h +225 -161
package/include/numkong/mesh/skylake.h +1029 -1585
package/include/numkong/mesh/v128relaxed.h +403 -377
package/include/numkong/mesh.h +38 -0
package/include/numkong/reduce/neon.h +29 -0
package/include/numkong/reduce/neonbfdot.h +2 -2
package/include/numkong/reduce/neonfhm.h +4 -4
package/include/numkong/reduce/serial.h +15 -1
package/include/numkong/reduce/sve.h +52 -0
package/include/numkong/reduce.h +4 -0
package/include/numkong/set/sve.h +6 -5
package/include/numkong/sets/smebi32.h +35 -30
package/include/numkong/sparse/serial.h +17 -2
package/include/numkong/sparse/sve2.h +3 -2
package/include/numkong/spatial/genoa.h +0 -68
package/include/numkong/spatial/haswell.h +98 -56
package/include/numkong/spatial/serial.h +15 -0
package/include/numkong/spatial/skylake.h +114 -54
package/include/numkong/spatial/sve.h +7 -6
package/include/numkong/spatial/svebfdot.h +7 -4
package/include/numkong/spatial/svehalf.h +5 -4
package/include/numkong/spatial/svesdot.h +9 -8
package/include/numkong/spatial.h +0 -12
package/include/numkong/spatials/graniteamx.h +301 -0
package/include/numkong/spatials/serial.h +39 -0
package/include/numkong/spatials/skylake.h +2 -2
package/include/numkong/spatials/sme.h +391 -350
package/include/numkong/spatials/smef64.h +79 -70
package/include/numkong/spatials.h +54 -4
package/include/numkong/tensor.hpp +107 -23
package/include/numkong/types.h +59 -0
package/javascript/dist/cjs/numkong.js +13 -0
package/javascript/dist/esm/numkong.js +13 -0
package/javascript/numkong.c +59 -14
package/javascript/numkong.ts +13 -0
package/package.json +7 -7
package/probes/probe.js +2 -2
package/wasm/numkong.wasm +0 -0

package/include/numkong/tensor.hpp CHANGED Viewed

@@ -1,5 +1,5 @@
 /**
- *  @brief NumKong Tensor types and tensor-level operations for C++23 and newer.
+ *  @brief NumKong Tensor types and tensor-level operations for C++20 and newer.
  *  @file include/numkong/tensor.hpp
  *  @author Ash Vardanian
  *  @date March 2026
@@ -19,7 +19,8 @@
  *  Features:
  *  - Signed strides (ptrdiff_t) for reversed/transposed views
  *  - Signed indexing (negative = from end)
- *  - C++23 variadic `operator[]` for flat access, exact access, and trailing `slice`
+ *  - Variadic `operator()` for flat/exact access and trailing `slice` (C++20-portable);
+ *    `operator[]` multi-arg sugar provided when the compiler supports P2128 (C++23).
  *  - Axis iteration (rows_views(), rows_spans(), axis_iterator)
  *  - Conversion to vector_view/vector_span for rank-1 tensors
  */
@@ -37,6 +38,14 @@
 #include "vector.hpp" // `aligned_allocator`
+// True when the compiler supports C++23 P2128 multi-arg `operator[]`. Under
+// this gate we expose `t[a, b, c]` as sugar that delegates to `operator()`.
+#if defined(__cpp_multidimensional_subscript) && __cpp_multidimensional_subscript >= 202110L
+#define NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_ 1
+#else
+#define NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_ 0
+#endif
 namespace ashvardanian::numkong {
 template <typename value_type_, std::size_t max_rank_>
@@ -300,26 +309,44 @@ struct tensor_view {
         return tensor_flat_lookup_(*this, idx);
     }
-    /** @brief Exact multi-dimensional scalar lookup. */
+    /** @brief Exact multi-dimensional scalar lookup via call syntax (C++20-portable). */
     template <std::integral... index_types_>
         requires(sizeof...(index_types_) >= 2)
-    decltype(auto) operator[](index_types_... idxs) const noexcept {
+    decltype(auto) operator()(index_types_... idxs) const noexcept {
         nk_assert_(shape_.rank == sizeof...(index_types_));
         auto coords = resolve_tensor_indices_<value_type_>(shape_, std::index_sequence_for<index_types_...> {},
                                                            idxs...);
         return tensor_lookup_resolved_(*this, std::span<std::size_t const, sizeof...(index_types_)>(coords));
     }
+#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
+    /** @brief C++23 sugar: `t[i, j, k]` scalar lookup, delegates to `operator()`. */
+    template <std::integral... index_types_>
+        requires(sizeof...(index_types_) >= 2)
+    decltype(auto) operator[](index_types_... idxs) const noexcept {
+        return (*this)(idxs...);
+    }
+#endif
     /** @brief Trailing `slice` returns the same view. */
     constexpr tensor_view operator[](tensor_slice_t) const noexcept { return *this; }
-    /** @brief Prefix leading-axis slicing with a trailing `slice` marker. */
+    /** @brief Prefix leading-axis slicing with a trailing `slice` marker (call syntax, C++20-portable). */
     template <typename first_type_, typename second_type_, typename... rest_types_>
         requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
-    tensor_view operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
+    tensor_view operator()(first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
         return tensor_slice_suffix_(*this, first, second, rest...);
     }
+#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
+    /** @brief C++23 sugar: `t[i, nk::slice]` slicing, delegates to `operator()`. */
+    template <typename first_type_, typename second_type_, typename... rest_types_>
+        requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
+    tensor_view operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
+        return (*this)(first, second, rest...);
+    }
+#endif
     /** @brief Rank-0 scalar access. */
     decltype(auto) scalar() const noexcept {
         nk_assert_(shape_.rank == 0);
@@ -512,22 +539,36 @@ struct tensor_span {
         return tensor_flat_lookup_(static_cast<tensor_view<value_type_, max_rank_>>(*this), idx);
     }
-    /** @brief Exact multi-dimensional scalar lookup. */
+    /** @brief Exact multi-dimensional scalar lookup via call syntax (C++20-portable). */
     template <std::integral... index_types_>
         requires(sizeof...(index_types_) >= 2)
-    decltype(auto) operator[](index_types_... idxs) noexcept {
+    decltype(auto) operator()(index_types_... idxs) noexcept {
         nk_assert_(shape_.rank == sizeof...(index_types_));
         auto coords = resolve_tensor_indices_<value_type_>(shape_, std::index_sequence_for<index_types_...> {},
                                                            idxs...);
         return tensor_lookup_resolved_(*this, std::span<std::size_t const, sizeof...(index_types_)>(coords));
     }
-    /** @brief Const full-coordinate lookup. */
+    /** @brief Const full-coordinate lookup via call syntax. */
+    template <std::integral... index_types_>
+        requires(sizeof...(index_types_) >= 2)
+    decltype(auto) operator()(index_types_... idxs) const noexcept {
+        return static_cast<tensor_view<value_type_, max_rank_>>(*this)(idxs...);
+    }
+#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
+    /** @brief C++23 sugar: multi-arg `[]` scalar lookup, delegates to `operator()`. */
+    template <std::integral... index_types_>
+        requires(sizeof...(index_types_) >= 2)
+    decltype(auto) operator[](index_types_... idxs) noexcept {
+        return (*this)(idxs...);
+    }
     template <std::integral... index_types_>
         requires(sizeof...(index_types_) >= 2)
     decltype(auto) operator[](index_types_... idxs) const noexcept {
-        return static_cast<tensor_view<value_type_, max_rank_>>(*this)[idxs...];
+        return (*this)(idxs...);
     }
+#endif
     /** @brief Trailing `slice` returns the same span. */
     constexpr tensor_span operator[](tensor_slice_t) noexcept { return *this; }
@@ -535,21 +576,36 @@ struct tensor_span {
         return static_cast<tensor_view<value_type_, max_rank_>>(*this);
     }
-    /** @brief Prefix leading-axis slicing with a trailing `slice` marker. */
+    /** @brief Prefix leading-axis slicing via call syntax (C++20-portable). */
     template <typename first_type_, typename second_type_, typename... rest_types_>
         requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
-    tensor_span operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
+    tensor_span operator()(first_type_ first, second_type_ second, rest_types_... rest) noexcept {
         return tensor_slice_suffix_(*this, first, second, rest...);
     }
-    /** @brief Const prefix leading-axis slicing with a trailing `slice` marker. */
+    /** @brief Const prefix leading-axis slicing via call syntax. */
     template <typename first_type_, typename second_type_, typename... rest_types_>
         requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
-    tensor_view<value_type_, max_rank_> operator[](first_type_ first, second_type_ second,
+    tensor_view<value_type_, max_rank_> operator()(first_type_ first, second_type_ second,
                                                    rest_types_... rest) const noexcept {
         return tensor_slice_suffix_(static_cast<tensor_view<value_type_, max_rank_>>(*this), first, second, rest...);
     }
+#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
+    /** @brief C++23 sugar: multi-arg `[]` slicing, delegates to `operator()`. */
+    template <typename first_type_, typename second_type_, typename... rest_types_>
+        requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
+    tensor_span operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
+        return (*this)(first, second, rest...);
+    }
+    template <typename first_type_, typename second_type_, typename... rest_types_>
+        requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
+    tensor_view<value_type_, max_rank_> operator[](first_type_ first, second_type_ second,
+                                                   rest_types_... rest) const noexcept {
+        return (*this)(first, second, rest...);
+    }
+#endif
     /** @brief Rank-0 mutable scalar access. */
     decltype(auto) scalar_ref() noexcept {
         nk_assert_(shape_.rank == 0);
@@ -1546,38 +1602,66 @@ struct tensor {
         return view()[idx];
     }
-    /** @brief Exact multi-dimensional scalar lookup. */
+    /** @brief Exact multi-dimensional scalar lookup via call syntax (C++20-portable). */
     template <std::integral... index_types_>
         requires(sizeof...(index_types_) >= 2)
-    decltype(auto) operator[](index_types_... idxs) noexcept {
-        return span()[idxs...];
+    decltype(auto) operator()(index_types_... idxs) noexcept {
+        return span()(idxs...);
+    }
+    /** @brief Const multidimensional lookup via call syntax. */
+    template <std::integral... index_types_>
+        requires(sizeof...(index_types_) >= 2)
+    decltype(auto) operator()(index_types_... idxs) const noexcept {
+        return view()(idxs...);
     }
-    /** @brief Const multidimensional lookup. */
+#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
+    /** @brief C++23 sugar: multi-arg `[]` scalar lookup, delegates to `operator()`. */
+    template <std::integral... index_types_>
+        requires(sizeof...(index_types_) >= 2)
+    decltype(auto) operator[](index_types_... idxs) noexcept {
+        return (*this)(idxs...);
+    }
     template <std::integral... index_types_>
         requires(sizeof...(index_types_) >= 2)
     decltype(auto) operator[](index_types_... idxs) const noexcept {
-        return view()[idxs...];
+        return (*this)(idxs...);
     }
+#endif
     /** @brief Trailing `slice` returns the same tensor view/span category. */
     span_type operator[](tensor_slice_t) noexcept { return span(); }
     view_type operator[](tensor_slice_t) const noexcept { return view(); }
-    /** @brief Prefix leading-axis slicing with a trailing `slice` marker. */
+    /** @brief Prefix leading-axis slicing via call syntax (C++20-portable). */
     template <typename first_type_, typename second_type_, typename... rest_types_>
         requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
-    span_type operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
+    span_type operator()(first_type_ first, second_type_ second, rest_types_... rest) noexcept {
         return tensor_slice_suffix_(span(), first, second, rest...);
     }
-    /** @brief Const prefix leading-axis slicing with a trailing `slice` marker. */
+    /** @brief Const prefix leading-axis slicing via call syntax. */
     template <typename first_type_, typename second_type_, typename... rest_types_>
         requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
-    view_type operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
+    view_type operator()(first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
         return tensor_slice_suffix_(view(), first, second, rest...);
     }
+#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
+    /** @brief C++23 sugar: multi-arg `[]` slicing, delegates to `operator()`. */
+    template <typename first_type_, typename second_type_, typename... rest_types_>
+        requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
+    span_type operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
+        return (*this)(first, second, rest...);
+    }
+    template <typename first_type_, typename second_type_, typename... rest_types_>
+        requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
+    view_type operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
+        return (*this)(first, second, rest...);
+    }
+#endif
     /** @brief Rank-0 mutable scalar access. */
     decltype(auto) scalar_ref() noexcept { return span().scalar_ref(); }

package/include/numkong/types.h CHANGED Viewed

@@ -69,6 +69,20 @@
 #define _GNU_SOURCE
 #endif
+// MSan (MemorySanitizer) cannot track data flow through SVE horizontal reductions
+// like `svaddv`, which move data from vector registers to scalar registers via
+// architecture-specific paths invisible to the compiler. `nk_unpoison_` marks the
+// resulting scalar as initialized so MSan does not report false positives.
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#include <sanitizer/msan_interface.h>
+#define nk_unpoison_(ptr, size) __msan_unpoison((ptr), (size))
+#endif
+#endif
+#ifndef nk_unpoison_
+#define nk_unpoison_(ptr, size) (void)(ptr), (void)(size)
+#endif
 // Inferring target OS: Windows, macOS, Linux, or FreeBSD
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
 #define NK_DEFINED_WINDOWS_ 1
@@ -1627,6 +1641,51 @@ NK_INTERNAL nk_size_t nk_sme_cntd_(void) {
     __asm__ __volatile__("smstart sm\n\t" "cntd %0\n\t" "smstop sm" : "=r"(r));
     return (nk_size_t)r;
 }
+/** @brief Enter streaming SVE mode (PSTATE.SM = 1). Caller is responsible for smstop. */
+NK_INTERNAL void nk_sme_start_streaming_(void) { __asm__ __volatile__("smstart sm" ::: "memory"); }
+/** @brief Exit streaming SVE mode (PSTATE.SM = 0). Must pair with nk_sme_start_streaming_. */
+NK_INTERNAL void nk_sme_stop_streaming_(void) { __asm__ __volatile__("smstop sm" ::: "memory"); }
+/**
+ *  SME runtime stubs — weak definitions for symbols the compiler may reference
+ *  from __arm_streaming or __arm_new("za") functions. Every TU that includes
+ *  this header emits a weak copy; the linker deduplicates to one.
+ *
+ *  - __arm_tpidr2_save / __arm_tpidr2_restore: lazy ZA save/restore protocol
+ *    used in __arm_new("za") prologues. Always no-ops in NumKong because no
+ *    NK_PUBLIC function carries ZA state (TPIDR2_EL0 is always null at entry).
+ *
+ *  - __arm_sc_memset / __arm_sc_memcpy / __arm_sc_memmove: streaming-compatible
+ *    memory routines the compiler may emit inside __arm_streaming functions.
+ *    Apple Clang provides these in its runtime; upstream LLVM does not.
+ */
+__attribute__((weak)) void __arm_tpidr2_save(void) {}
+__attribute__((weak)) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
+__attribute__((weak, target("+sme"))) void *__arm_sc_memset(void *d, int c, __SIZE_TYPE__ n) __arm_streaming_compatible {
+    unsigned char *p = (unsigned char *)d;
+    for (__SIZE_TYPE__ i = 0; i < n; i++) p[i] = (unsigned char)c;
+    return d;
+}
+__attribute__((weak, target("+sme"))) void *__arm_sc_memcpy(void *d, void const *s,
+                                                           __SIZE_TYPE__ n) __arm_streaming_compatible {
+    unsigned char *dp = (unsigned char *)d;
+    unsigned char const *sp = (unsigned char const *)s;
+    for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
+    return d;
+}
+__attribute__((weak, target("+sme"))) void *__arm_sc_memmove(void *d, void const *s,
+                                                            __SIZE_TYPE__ n) __arm_streaming_compatible {
+    unsigned char *dp = (unsigned char *)d;
+    unsigned char const *sp = (unsigned char const *)s;
+    if (dp < sp) {
+        for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
+    }
+    else {
+        for (__SIZE_TYPE__ i = n; i > 0; i--) dp[i - 1] = sp[i - 1];
+    }
+    return d;
+}
 #endif
 #ifdef __cplusplus

package/javascript/dist/cjs/numkong.js CHANGED Viewed

@@ -99,6 +99,19 @@ Object.defineProperty(exports, "PackedMatrix", { enumerable: true, get: function
 Object.defineProperty(exports, "DType", { enumerable: true, get: function () { return types_js_1.DType; } });
 Object.defineProperty(exports, "outputDtype", { enumerable: true, get: function () { return types_js_1.outputDtype; } });
 function loadNativeAddon() {
+    var _a;
+    // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
+    // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
+    // runtime (e.g. one loaded by another native addon) may already be
+    // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
+    // libiomp5 to coexist; it must be in `process.env` before the `require()`
+    // below triggers the addon's `dlopen`, since libomp's constructor reads
+    // the env during dependency resolution, and it is too late to influence
+    // that afterwards. Left unguarded because the variable is harmless on
+    // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
+    // who set it to something else is respected by `??=`. See
+    // `python/numkong/__init__.py` for the Python analog.
+    (_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
     // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
     try {
         const req = (0, node_module_1.createRequire)(path.join(getDirName(), "noop.js"));

package/javascript/dist/esm/numkong.js CHANGED Viewed

@@ -31,6 +31,19 @@ import { existsSync } from "node:fs";
 import { getFileName, getRoot } from "bindings";
 import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype } from "./types.js";
 function loadNativeAddon() {
+    var _a;
+    // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
+    // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
+    // runtime (e.g. one loaded by another native addon) may already be
+    // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
+    // libiomp5 to coexist; it must be in `process.env` before the `require()`
+    // below triggers the addon's `dlopen`, since libomp's constructor reads
+    // the env during dependency resolution, and it is too late to influence
+    // that afterwards. Left unguarded because the variable is harmless on
+    // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
+    // who set it to something else is respected by `??=`. See
+    // `python/numkong/__init__.py` for the Python analog.
+    (_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
     // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
     try {
         const req = createRequire(path.join(getDirName(), "noop.js"));

package/javascript/numkong.c CHANGED Viewed

@@ -9,10 +9,17 @@
 #include <string.h> // `strcmp` function
+#if defined(NK_USE_OPENMP)
+#include <omp.h>
+#endif
 #include <node_api.h> // `napi_*` functions — N-API v6+ for BigInt (Node ≥ 10.20)
 #include <numkong/numkong.h> // `nk_*` functions — must be first to bring `_GNU_SOURCE`
+#define NK_PARALLEL_PACKED_TILE    64
+#define NK_PARALLEL_SYMMETRIC_TILE 32
 /** @brief Global variable that caches the CPU capabilities, and is computed just once, when the module is loaded. */
 nk_capability_t static_capabilities = nk_cap_serial_k;
@@ -152,9 +159,10 @@ static napi_value dense(napi_env env, napi_callback_info info, nk_kernel_kind_t
             // Auto-detect from N-API TypedArray type (backward-compatible 4-type whitelist)
             if (type_a != napi_float64_array && type_a != napi_float32_array && type_a != napi_int8_array &&
                 type_a != napi_uint8_array) {
-                napi_throw_error(
+                napi_throw_error( //
                     env, NULL,
-                    "Only f64, f32, i8, u8 arrays are auto-detected; pass dtype string as 3rd argument " "for other " "types");
+                    "Only f64, f32, i8, u8 arrays are auto-detected; " //
+                    "pass dtype string as 3rd argument for other types");
                 return NULL;
             }
             switch (type_a) {
@@ -482,11 +490,11 @@ static napi_value api_dots_pack(napi_env env, napi_callback_info info) {
  * dtype
  */
 static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
-    size_t argc = 9;
-    napi_value args[9];
+    size_t argc = 10;
+    napi_value args[10];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
-    if (argc != 9) {
-        napi_throw_error(env, NULL, "Packed operation requires 9 arguments");
+    if (argc < 9 || argc > 10) {
+        napi_throw_error(env, NULL, "Packed operation requires 9-10 arguments (last is optional threads)");
         return NULL;
     }
@@ -533,8 +541,26 @@ static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_ke
         return NULL;
     }
-    kernel(a_data, packed_data, result_data, (nk_size_t)height, (nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride,
-           (nk_size_t)result_stride);
+    uint32_t threads = 1;
+    if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
+#if defined(NK_USE_OPENMP)
+    if (threads == 0) threads = (uint32_t)omp_get_max_threads();
+    omp_set_num_threads((int)threads);
+#endif
+    // `int` loop counter pre-declared: MSVC's OpenMP stays at 2.0 canonical
+    // form, which forbids in-init declarations and rejects 64-bit iterators
+    // — either would trip C3015.
+    int const tile_count = (int)nk_size_divide_round_up_(height, NK_PARALLEL_PACKED_TILE);
+    int tile_idx;
+#pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
+    for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+        nk_size_t row = (nk_size_t)tile_idx * NK_PARALLEL_PACKED_TILE;
+        nk_size_t chunk = (row + NK_PARALLEL_PACKED_TILE <= height) ? NK_PARALLEL_PACKED_TILE : (height - row);
+        kernel((char const *)a_data + row * a_stride, packed_data, (char *)result_data + row * result_stride, chunk,
+               (nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride, (nk_size_t)result_stride);
+    }
     return NULL;
 }
@@ -554,11 +580,11 @@ static napi_value api_euclideans_packed(napi_env env, napi_callback_info info) {
  * string dtype
  */
 static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
-    size_t argc = 9;
-    napi_value args[9];
+    size_t argc = 10;
+    napi_value args[10];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
-    if (argc != 9) {
-        napi_throw_error(env, NULL, "Symmetric operation requires 9 arguments");
+    if (argc < 9 || argc > 10) {
+        napi_throw_error(env, NULL, "Symmetric operation requires 9-10 arguments (last is optional threads)");
         return NULL;
     }
@@ -601,8 +627,27 @@ static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk
         return NULL;
     }
-    kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
-           (nk_size_t)result_stride, (nk_size_t)row_start, (nk_size_t)row_count);
+    uint32_t threads = 1;
+    if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
+#if defined(NK_USE_OPENMP)
+    if (threads == 0) threads = (uint32_t)omp_get_max_threads();
+    omp_set_num_threads((int)threads);
+#endif
+    // `int` loop counter pre-declared: see note at `api_packed_common`.
+    int const tile_count = (int)nk_size_divide_round_up_(row_count, NK_PARALLEL_SYMMETRIC_TILE);
+    int tile_idx;
+#pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
+    for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+        nk_size_t tile_start = (nk_size_t)row_start + (nk_size_t)tile_idx * NK_PARALLEL_SYMMETRIC_TILE;
+        nk_size_t tile_rows = (tile_start + NK_PARALLEL_SYMMETRIC_TILE <= (nk_size_t)row_start + row_count)
+                                  ? NK_PARALLEL_SYMMETRIC_TILE
+                                  : ((nk_size_t)row_start + row_count - tile_start);
+        kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
+               (nk_size_t)result_stride, tile_start, tile_rows);
+    }
     return NULL;
 }

package/javascript/numkong.ts CHANGED Viewed

@@ -33,6 +33,19 @@ import { getFileName, getRoot } from "bindings";
 import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype, KernelFamily } from "./types.js";
 function loadNativeAddon(): any {
+  // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
+  // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
+  // runtime (e.g. one loaded by another native addon) may already be
+  // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
+  // libiomp5 to coexist; it must be in `process.env` before the `require()`
+  // below triggers the addon's `dlopen`, since libomp's constructor reads
+  // the env during dependency resolution, and it is too late to influence
+  // that afterwards. Left unguarded because the variable is harmless on
+  // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
+  // who set it to something else is respected by `??=`. See
+  // `python/numkong/__init__.py` for the Python analog.
+  process.env.KMP_DUPLICATE_LIB_OK ??= "TRUE";
   // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
   try {
     const req = createRequire(path.join(getDirName(), "noop.js"));

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "numkong",
-  "version": "7.4.5",
+  "version": "7.6.0",
   "description": "Portable mixed-precision math, linear-algebra, & retrieval library with 2000+ SIMD kernels for x86, Arm, RISC-V, LoongArch, Power, & WebAssembly",
   "homepage": "https://github.com/ashvardanian/NumKong",
   "author": "Ash Vardanian",
@@ -98,11 +98,11 @@
     "printWidth": 120
   },
   "optionalDependencies": {
-    "@numkong/darwin-arm64": "7.4.5",
-    "@numkong/darwin-x64": "7.4.5",
-    "@numkong/linux-arm64": "7.4.5",
-    "@numkong/linux-x64": "7.4.5",
-    "@numkong/win32-arm64": "7.4.5",
-    "@numkong/win32-x64": "7.4.5"
+    "@numkong/darwin-arm64": "7.6.0",
+    "@numkong/darwin-x64": "7.6.0",
+    "@numkong/linux-arm64": "7.6.0",
+    "@numkong/linux-x64": "7.6.0",
+    "@numkong/win32-arm64": "7.6.0",
+    "@numkong/win32-x64": "7.6.0"
   }
 }

package/probes/probe.js CHANGED Viewed

@@ -76,8 +76,8 @@ const PROBES = [
     ["NK_TARGET_SME2P1", "probes/arm_sme2p1.c", ["-march=armv8-a+sme2p1"], []],
     ["NK_TARGET_SMEF64", "probes/arm_sme_f64.c", ["-march=armv8-a+sme+sme-f64f64"], []],
     ["NK_TARGET_SMEHALF", "probes/arm_sme_half.c", ["-march=armv8-a+sme+sme-f16f16"], []],
-    ["NK_TARGET_SMEBF16", "probes/arm_sme_bf16.c", ["-march=armv8-a+sme2+b16b16"], []],
-    ["NK_TARGET_SMEBI32", "probes/arm_sme_bi32.c", ["-march=armv8-a+sme2+sme-i16i32"], []],
+    ["NK_TARGET_SMEBF16", "probes/arm_sme_bf16.c", ["-march=armv8-a+sme2+sme-b16b16"], []],
+    ["NK_TARGET_SMEBI32", "probes/arm_sme_bi32.c", ["-march=armv8-a+sme2"], []],
     ["NK_TARGET_SMELUT2", "probes/arm_sme_lut2.c", ["-march=armv8-a+sme2+lut"], []],
     ["NK_TARGET_SMEFA64", "probes/arm_sme_fa64.c", ["-march=armv8-a+sme+sme-fa64"], []],
     // RISC-V

package/wasm/numkong.wasm CHANGED Viewed

Binary file