numkong 7.4.5 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/README.md +1 -0
  2. package/binding.gyp +99 -5
  3. package/c/dispatch_e5m2.c +23 -3
  4. package/c/dispatch_f16.c +23 -0
  5. package/c/numkong.c +0 -13
  6. package/include/numkong/attention/sme.h +34 -31
  7. package/include/numkong/capabilities.h +2 -15
  8. package/include/numkong/cast/README.md +3 -0
  9. package/include/numkong/cast/haswell.h +28 -64
  10. package/include/numkong/cast/neon.h +15 -0
  11. package/include/numkong/cast/serial.h +17 -0
  12. package/include/numkong/cast/skylake.h +67 -52
  13. package/include/numkong/cast.h +1 -0
  14. package/include/numkong/curved/smef64.h +82 -62
  15. package/include/numkong/dot/README.md +1 -0
  16. package/include/numkong/dot/haswell.h +92 -13
  17. package/include/numkong/dot/rvvbf16.h +1 -1
  18. package/include/numkong/dot/rvvhalf.h +1 -1
  19. package/include/numkong/dot/serial.h +15 -0
  20. package/include/numkong/dot/skylake.h +61 -14
  21. package/include/numkong/dot/sve.h +6 -5
  22. package/include/numkong/dot/svebfdot.h +2 -1
  23. package/include/numkong/dot/svehalf.h +6 -5
  24. package/include/numkong/dot/svesdot.h +3 -2
  25. package/include/numkong/dots/README.md +2 -0
  26. package/include/numkong/dots/graniteamx.h +1167 -0
  27. package/include/numkong/dots/haswell.h +28 -28
  28. package/include/numkong/dots/sapphireamx.h +1 -1
  29. package/include/numkong/dots/serial.h +33 -11
  30. package/include/numkong/dots/skylake.h +28 -23
  31. package/include/numkong/dots/sme.h +172 -140
  32. package/include/numkong/dots/smebi32.h +14 -11
  33. package/include/numkong/dots/smef64.h +31 -26
  34. package/include/numkong/dots.h +41 -3
  35. package/include/numkong/each/serial.h +39 -0
  36. package/include/numkong/geospatial/haswell.h +1 -1
  37. package/include/numkong/geospatial/neon.h +1 -1
  38. package/include/numkong/geospatial/serial.h +15 -4
  39. package/include/numkong/geospatial/skylake.h +1 -1
  40. package/include/numkong/maxsim/serial.h +15 -0
  41. package/include/numkong/maxsim/sme.h +34 -33
  42. package/include/numkong/mesh/README.md +50 -44
  43. package/include/numkong/mesh/genoa.h +462 -0
  44. package/include/numkong/mesh/haswell.h +806 -933
  45. package/include/numkong/mesh/neon.h +871 -943
  46. package/include/numkong/mesh/neonbfdot.h +382 -522
  47. package/include/numkong/mesh/neonfhm.h +676 -0
  48. package/include/numkong/mesh/rvv.h +404 -319
  49. package/include/numkong/mesh/serial.h +225 -161
  50. package/include/numkong/mesh/skylake.h +1029 -1585
  51. package/include/numkong/mesh/v128relaxed.h +403 -377
  52. package/include/numkong/mesh.h +38 -0
  53. package/include/numkong/reduce/neon.h +29 -0
  54. package/include/numkong/reduce/neonbfdot.h +2 -2
  55. package/include/numkong/reduce/neonfhm.h +4 -4
  56. package/include/numkong/reduce/serial.h +15 -1
  57. package/include/numkong/reduce/sve.h +52 -0
  58. package/include/numkong/reduce.h +4 -0
  59. package/include/numkong/set/sve.h +6 -5
  60. package/include/numkong/sets/smebi32.h +35 -30
  61. package/include/numkong/sparse/serial.h +17 -2
  62. package/include/numkong/sparse/sve2.h +3 -2
  63. package/include/numkong/spatial/genoa.h +0 -68
  64. package/include/numkong/spatial/haswell.h +98 -56
  65. package/include/numkong/spatial/serial.h +15 -0
  66. package/include/numkong/spatial/skylake.h +114 -54
  67. package/include/numkong/spatial/sve.h +7 -6
  68. package/include/numkong/spatial/svebfdot.h +7 -4
  69. package/include/numkong/spatial/svehalf.h +5 -4
  70. package/include/numkong/spatial/svesdot.h +9 -8
  71. package/include/numkong/spatial.h +0 -12
  72. package/include/numkong/spatials/graniteamx.h +301 -0
  73. package/include/numkong/spatials/serial.h +39 -0
  74. package/include/numkong/spatials/skylake.h +2 -2
  75. package/include/numkong/spatials/sme.h +391 -350
  76. package/include/numkong/spatials/smef64.h +79 -70
  77. package/include/numkong/spatials.h +54 -4
  78. package/include/numkong/tensor.hpp +107 -23
  79. package/include/numkong/types.h +59 -0
  80. package/javascript/dist/cjs/numkong.js +13 -0
  81. package/javascript/dist/esm/numkong.js +13 -0
  82. package/javascript/numkong.c +59 -14
  83. package/javascript/numkong.ts +13 -0
  84. package/package.json +7 -7
  85. package/probes/probe.js +2 -2
  86. package/wasm/numkong.wasm +0 -0
@@ -1,5 +1,5 @@
1
1
  /**
2
- * @brief NumKong Tensor types and tensor-level operations for C++23 and newer.
2
+ * @brief NumKong Tensor types and tensor-level operations for C++20 and newer.
3
3
  * @file include/numkong/tensor.hpp
4
4
  * @author Ash Vardanian
5
5
  * @date March 2026
@@ -19,7 +19,8 @@
19
19
  * Features:
20
20
  * - Signed strides (ptrdiff_t) for reversed/transposed views
21
21
  * - Signed indexing (negative = from end)
22
- * - C++23 variadic `operator[]` for flat access, exact access, and trailing `slice`
22
+ * - Variadic `operator()` for flat/exact access and trailing `slice` (C++20-portable);
23
+ * `operator[]` multi-arg sugar provided when the compiler supports P2128 (C++23).
23
24
  * - Axis iteration (rows_views(), rows_spans(), axis_iterator)
24
25
  * - Conversion to vector_view/vector_span for rank-1 tensors
25
26
  */
@@ -37,6 +38,14 @@
37
38
 
38
39
  #include "vector.hpp" // `aligned_allocator`
39
40
 
41
+ // True when the compiler supports C++23 P2128 multi-arg `operator[]`. Under
42
+ // this gate we expose `t[a, b, c]` as sugar that delegates to `operator()`.
43
+ #if defined(__cpp_multidimensional_subscript) && __cpp_multidimensional_subscript >= 202110L
44
+ #define NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_ 1
45
+ #else
46
+ #define NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_ 0
47
+ #endif
48
+
40
49
  namespace ashvardanian::numkong {
41
50
 
42
51
  template <typename value_type_, std::size_t max_rank_>
@@ -300,26 +309,44 @@ struct tensor_view {
300
309
  return tensor_flat_lookup_(*this, idx);
301
310
  }
302
311
 
303
- /** @brief Exact multi-dimensional scalar lookup. */
312
+ /** @brief Exact multi-dimensional scalar lookup via call syntax (C++20-portable). */
304
313
  template <std::integral... index_types_>
305
314
  requires(sizeof...(index_types_) >= 2)
306
- decltype(auto) operator[](index_types_... idxs) const noexcept {
315
+ decltype(auto) operator()(index_types_... idxs) const noexcept {
307
316
  nk_assert_(shape_.rank == sizeof...(index_types_));
308
317
  auto coords = resolve_tensor_indices_<value_type_>(shape_, std::index_sequence_for<index_types_...> {},
309
318
  idxs...);
310
319
  return tensor_lookup_resolved_(*this, std::span<std::size_t const, sizeof...(index_types_)>(coords));
311
320
  }
312
321
 
322
+ #if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
323
+ /** @brief C++23 sugar: `t[i, j, k]` scalar lookup, delegates to `operator()`. */
324
+ template <std::integral... index_types_>
325
+ requires(sizeof...(index_types_) >= 2)
326
+ decltype(auto) operator[](index_types_... idxs) const noexcept {
327
+ return (*this)(idxs...);
328
+ }
329
+ #endif
330
+
313
331
  /** @brief Trailing `slice` returns the same view. */
314
332
  constexpr tensor_view operator[](tensor_slice_t) const noexcept { return *this; }
315
333
 
316
- /** @brief Prefix leading-axis slicing with a trailing `slice` marker. */
334
+ /** @brief Prefix leading-axis slicing with a trailing `slice` marker (call syntax, C++20-portable). */
317
335
  template <typename first_type_, typename second_type_, typename... rest_types_>
318
336
  requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
319
- tensor_view operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
337
+ tensor_view operator()(first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
320
338
  return tensor_slice_suffix_(*this, first, second, rest...);
321
339
  }
322
340
 
341
+ #if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
342
+ /** @brief C++23 sugar: `t[i, nk::slice]` slicing, delegates to `operator()`. */
343
+ template <typename first_type_, typename second_type_, typename... rest_types_>
344
+ requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
345
+ tensor_view operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
346
+ return (*this)(first, second, rest...);
347
+ }
348
+ #endif
349
+
323
350
  /** @brief Rank-0 scalar access. */
324
351
  decltype(auto) scalar() const noexcept {
325
352
  nk_assert_(shape_.rank == 0);
@@ -512,22 +539,36 @@ struct tensor_span {
512
539
  return tensor_flat_lookup_(static_cast<tensor_view<value_type_, max_rank_>>(*this), idx);
513
540
  }
514
541
 
515
- /** @brief Exact multi-dimensional scalar lookup. */
542
+ /** @brief Exact multi-dimensional scalar lookup via call syntax (C++20-portable). */
516
543
  template <std::integral... index_types_>
517
544
  requires(sizeof...(index_types_) >= 2)
518
- decltype(auto) operator[](index_types_... idxs) noexcept {
545
+ decltype(auto) operator()(index_types_... idxs) noexcept {
519
546
  nk_assert_(shape_.rank == sizeof...(index_types_));
520
547
  auto coords = resolve_tensor_indices_<value_type_>(shape_, std::index_sequence_for<index_types_...> {},
521
548
  idxs...);
522
549
  return tensor_lookup_resolved_(*this, std::span<std::size_t const, sizeof...(index_types_)>(coords));
523
550
  }
524
551
 
525
- /** @brief Const full-coordinate lookup. */
552
+ /** @brief Const full-coordinate lookup via call syntax. */
553
+ template <std::integral... index_types_>
554
+ requires(sizeof...(index_types_) >= 2)
555
+ decltype(auto) operator()(index_types_... idxs) const noexcept {
556
+ return static_cast<tensor_view<value_type_, max_rank_>>(*this)(idxs...);
557
+ }
558
+
559
+ #if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
560
+ /** @brief C++23 sugar: multi-arg `[]` scalar lookup, delegates to `operator()`. */
561
+ template <std::integral... index_types_>
562
+ requires(sizeof...(index_types_) >= 2)
563
+ decltype(auto) operator[](index_types_... idxs) noexcept {
564
+ return (*this)(idxs...);
565
+ }
526
566
  template <std::integral... index_types_>
527
567
  requires(sizeof...(index_types_) >= 2)
528
568
  decltype(auto) operator[](index_types_... idxs) const noexcept {
529
- return static_cast<tensor_view<value_type_, max_rank_>>(*this)[idxs...];
569
+ return (*this)(idxs...);
530
570
  }
571
+ #endif
531
572
 
532
573
  /** @brief Trailing `slice` returns the same span. */
533
574
  constexpr tensor_span operator[](tensor_slice_t) noexcept { return *this; }
@@ -535,21 +576,36 @@ struct tensor_span {
535
576
  return static_cast<tensor_view<value_type_, max_rank_>>(*this);
536
577
  }
537
578
 
538
- /** @brief Prefix leading-axis slicing with a trailing `slice` marker. */
579
+ /** @brief Prefix leading-axis slicing via call syntax (C++20-portable). */
539
580
  template <typename first_type_, typename second_type_, typename... rest_types_>
540
581
  requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
541
- tensor_span operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
582
+ tensor_span operator()(first_type_ first, second_type_ second, rest_types_... rest) noexcept {
542
583
  return tensor_slice_suffix_(*this, first, second, rest...);
543
584
  }
544
585
 
545
- /** @brief Const prefix leading-axis slicing with a trailing `slice` marker. */
586
+ /** @brief Const prefix leading-axis slicing via call syntax. */
546
587
  template <typename first_type_, typename second_type_, typename... rest_types_>
547
588
  requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
548
- tensor_view<value_type_, max_rank_> operator[](first_type_ first, second_type_ second,
589
+ tensor_view<value_type_, max_rank_> operator()(first_type_ first, second_type_ second,
549
590
  rest_types_... rest) const noexcept {
550
591
  return tensor_slice_suffix_(static_cast<tensor_view<value_type_, max_rank_>>(*this), first, second, rest...);
551
592
  }
552
593
 
594
+ #if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
595
+ /** @brief C++23 sugar: multi-arg `[]` slicing, delegates to `operator()`. */
596
+ template <typename first_type_, typename second_type_, typename... rest_types_>
597
+ requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
598
+ tensor_span operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
599
+ return (*this)(first, second, rest...);
600
+ }
601
+ template <typename first_type_, typename second_type_, typename... rest_types_>
602
+ requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
603
+ tensor_view<value_type_, max_rank_> operator[](first_type_ first, second_type_ second,
604
+ rest_types_... rest) const noexcept {
605
+ return (*this)(first, second, rest...);
606
+ }
607
+ #endif
608
+
553
609
  /** @brief Rank-0 mutable scalar access. */
554
610
  decltype(auto) scalar_ref() noexcept {
555
611
  nk_assert_(shape_.rank == 0);
@@ -1546,38 +1602,66 @@ struct tensor {
1546
1602
  return view()[idx];
1547
1603
  }
1548
1604
 
1549
- /** @brief Exact multi-dimensional scalar lookup. */
1605
+ /** @brief Exact multi-dimensional scalar lookup via call syntax (C++20-portable). */
1550
1606
  template <std::integral... index_types_>
1551
1607
  requires(sizeof...(index_types_) >= 2)
1552
- decltype(auto) operator[](index_types_... idxs) noexcept {
1553
- return span()[idxs...];
1608
+ decltype(auto) operator()(index_types_... idxs) noexcept {
1609
+ return span()(idxs...);
1610
+ }
1611
+
1612
+ /** @brief Const multidimensional lookup via call syntax. */
1613
+ template <std::integral... index_types_>
1614
+ requires(sizeof...(index_types_) >= 2)
1615
+ decltype(auto) operator()(index_types_... idxs) const noexcept {
1616
+ return view()(idxs...);
1554
1617
  }
1555
1618
 
1556
- /** @brief Const multidimensional lookup. */
1619
+ #if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
1620
+ /** @brief C++23 sugar: multi-arg `[]` scalar lookup, delegates to `operator()`. */
1621
+ template <std::integral... index_types_>
1622
+ requires(sizeof...(index_types_) >= 2)
1623
+ decltype(auto) operator[](index_types_... idxs) noexcept {
1624
+ return (*this)(idxs...);
1625
+ }
1557
1626
  template <std::integral... index_types_>
1558
1627
  requires(sizeof...(index_types_) >= 2)
1559
1628
  decltype(auto) operator[](index_types_... idxs) const noexcept {
1560
- return view()[idxs...];
1629
+ return (*this)(idxs...);
1561
1630
  }
1631
+ #endif
1562
1632
 
1563
1633
  /** @brief Trailing `slice` returns the same tensor view/span category. */
1564
1634
  span_type operator[](tensor_slice_t) noexcept { return span(); }
1565
1635
  view_type operator[](tensor_slice_t) const noexcept { return view(); }
1566
1636
 
1567
- /** @brief Prefix leading-axis slicing with a trailing `slice` marker. */
1637
+ /** @brief Prefix leading-axis slicing via call syntax (C++20-portable). */
1568
1638
  template <typename first_type_, typename second_type_, typename... rest_types_>
1569
1639
  requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
1570
- span_type operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
1640
+ span_type operator()(first_type_ first, second_type_ second, rest_types_... rest) noexcept {
1571
1641
  return tensor_slice_suffix_(span(), first, second, rest...);
1572
1642
  }
1573
1643
 
1574
- /** @brief Const prefix leading-axis slicing with a trailing `slice` marker. */
1644
+ /** @brief Const prefix leading-axis slicing via call syntax. */
1575
1645
  template <typename first_type_, typename second_type_, typename... rest_types_>
1576
1646
  requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
1577
- view_type operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
1647
+ view_type operator()(first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
1578
1648
  return tensor_slice_suffix_(view(), first, second, rest...);
1579
1649
  }
1580
1650
 
1651
+ #if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
1652
+ /** @brief C++23 sugar: multi-arg `[]` slicing, delegates to `operator()`. */
1653
+ template <typename first_type_, typename second_type_, typename... rest_types_>
1654
+ requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
1655
+ span_type operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
1656
+ return (*this)(first, second, rest...);
1657
+ }
1658
+ template <typename first_type_, typename second_type_, typename... rest_types_>
1659
+ requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
1660
+ view_type operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
1661
+ return (*this)(first, second, rest...);
1662
+ }
1663
+ #endif
1664
+
1581
1665
  /** @brief Rank-0 mutable scalar access. */
1582
1666
  decltype(auto) scalar_ref() noexcept { return span().scalar_ref(); }
1583
1667
 
@@ -69,6 +69,20 @@
69
69
  #define _GNU_SOURCE
70
70
  #endif
71
71
 
72
+ // MSan (MemorySanitizer) cannot track data flow through SVE horizontal reductions
73
+ // like `svaddv`, which move data from vector registers to scalar registers via
74
+ // architecture-specific paths invisible to the compiler. `nk_unpoison_` marks the
75
+ // resulting scalar as initialized so MSan does not report false positives.
76
+ #if defined(__has_feature)
77
+ #if __has_feature(memory_sanitizer)
78
+ #include <sanitizer/msan_interface.h>
79
+ #define nk_unpoison_(ptr, size) __msan_unpoison((ptr), (size))
80
+ #endif
81
+ #endif
82
+ #ifndef nk_unpoison_
83
+ #define nk_unpoison_(ptr, size) ((void)(ptr), (void)(size)) // No-op without MSan; parenthesized to stay one expression
84
+ #endif
85
+
72
86
  // Inferring target OS: Windows, macOS, Linux, or FreeBSD
73
87
  #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
74
88
  #define NK_DEFINED_WINDOWS_ 1
@@ -1627,6 +1641,51 @@ NK_INTERNAL nk_size_t nk_sme_cntd_(void) {
1627
1641
  __asm__ __volatile__("smstart sm\n\t" "cntd %0\n\t" "smstop sm" : "=r"(r));
1628
1642
  return (nk_size_t)r;
1629
1643
  }
1644
+
1645
+ /** @brief Enter streaming SVE mode (PSTATE.SM = 1). Caller is responsible for smstop. */
1646
+ NK_INTERNAL void nk_sme_start_streaming_(void) { __asm__ __volatile__("smstart sm" ::: "memory"); }
1647
+ /** @brief Exit streaming SVE mode (PSTATE.SM = 0). Must pair with nk_sme_start_streaming_. */
1648
+ NK_INTERNAL void nk_sme_stop_streaming_(void) { __asm__ __volatile__("smstop sm" ::: "memory"); }
1649
+
1650
+ /**
1651
+ * SME runtime stubs — weak definitions for symbols the compiler may reference
1652
+ * from __arm_streaming or __arm_new("za") functions. Every TU that includes
1653
+ * this header emits a weak copy; the linker deduplicates to one.
1654
+ *
1655
+ * - __arm_tpidr2_save / __arm_tpidr2_restore: lazy ZA save/restore protocol
1656
+ * used in __arm_new("za") prologues. Always no-ops in NumKong because no
1657
+ * NK_PUBLIC function carries ZA state (TPIDR2_EL0 is always null at entry).
1658
+ *
1659
+ * - __arm_sc_memset / __arm_sc_memcpy / __arm_sc_memmove: streaming-compatible
1660
+ * memory routines the compiler may emit inside __arm_streaming functions.
1661
+ * Apple Clang provides these in its runtime; upstream LLVM does not.
1662
+ */
1663
+ __attribute__((weak)) void __arm_tpidr2_save(void) {}
1664
+ __attribute__((weak)) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
1665
+ __attribute__((weak, target("+sme"))) void *__arm_sc_memset(void *d, int c, __SIZE_TYPE__ n) __arm_streaming_compatible {
1666
+ unsigned char *p = (unsigned char *)d;
1667
+ for (__SIZE_TYPE__ i = 0; i < n; i++) p[i] = (unsigned char)c;
1668
+ return d;
1669
+ }
1670
+ __attribute__((weak, target("+sme"))) void *__arm_sc_memcpy(void *d, void const *s,
1671
+ __SIZE_TYPE__ n) __arm_streaming_compatible {
1672
+ unsigned char *dp = (unsigned char *)d;
1673
+ unsigned char const *sp = (unsigned char const *)s;
1674
+ for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
1675
+ return d;
1676
+ }
1677
+ __attribute__((weak, target("+sme"))) void *__arm_sc_memmove(void *d, void const *s,
1678
+ __SIZE_TYPE__ n) __arm_streaming_compatible {
1679
+ unsigned char *dp = (unsigned char *)d;
1680
+ unsigned char const *sp = (unsigned char const *)s;
1681
+ if (dp < sp) {
1682
+ for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
1683
+ }
1684
+ else {
1685
+ for (__SIZE_TYPE__ i = n; i > 0; i--) dp[i - 1] = sp[i - 1];
1686
+ }
1687
+ return d;
1688
+ }
1630
1689
  #endif
1631
1690
 
1632
1691
  #ifdef __cplusplus
@@ -99,6 +99,19 @@ Object.defineProperty(exports, "PackedMatrix", { enumerable: true, get: function
99
99
  Object.defineProperty(exports, "DType", { enumerable: true, get: function () { return types_js_1.DType; } });
100
100
  Object.defineProperty(exports, "outputDtype", { enumerable: true, get: function () { return types_js_1.outputDtype; } });
101
101
  function loadNativeAddon() {
102
+ var _a;
103
+ // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
104
+ // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
105
+ // runtime (e.g. one loaded by another native addon) may already be
106
+ // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
107
+ // libiomp5 to coexist; it must be in `process.env` before the `require()`
108
+ // below triggers the addon's `dlopen`, since libomp's constructor reads
109
+ // the env during dependency resolution and cannot be influenced
110
+ // afterwards. Left unguarded because the variable is harmless on
111
+ // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
112
+ // who set it to something else is respected by `??=`. See
113
+ // `python/numkong/__init__.py` for the Python analog.
114
+ (_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
102
115
  // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
103
116
  try {
104
117
  const req = (0, node_module_1.createRequire)(path.join(getDirName(), "noop.js"));
@@ -31,6 +31,19 @@ import { existsSync } from "node:fs";
31
31
  import { getFileName, getRoot } from "bindings";
32
32
  import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype } from "./types.js";
33
33
  function loadNativeAddon() {
34
+ var _a;
35
+ // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
36
+ // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
37
+ // runtime (e.g. one loaded by another native addon) may already be
38
+ // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
39
+ // libiomp5 to coexist; it must be in `process.env` before the `require()`
40
+ // below triggers the addon's `dlopen`, since libomp's constructor reads
41
+ // the env during dependency resolution and cannot be influenced
42
+ // afterwards. Left unguarded because the variable is harmless on
43
+ // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
44
+ // who set it to something else is respected by `??=`. See
45
+ // `python/numkong/__init__.py` for the Python analog.
46
+ (_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
34
47
  // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
35
48
  try {
36
49
  const req = createRequire(path.join(getDirName(), "noop.js"));
@@ -9,10 +9,17 @@
9
9
 
10
10
  #include <string.h> // `strcmp` function
11
11
 
12
+ #if defined(NK_USE_OPENMP)
13
+ #include <omp.h>
14
+ #endif
15
+
12
16
  #include <node_api.h> // `napi_*` functions — N-API v6+ for BigInt (Node ≥ 10.20)
13
17
 
14
18
  #include <numkong/numkong.h> // `nk_*` functions — must be first to bring `_GNU_SOURCE`
15
19
 
20
+ #define NK_PARALLEL_PACKED_TILE 64
21
+ #define NK_PARALLEL_SYMMETRIC_TILE 32
22
+
16
23
  /** @brief Global variable that caches the CPU capabilities, and is computed just once, when the module is loaded. */
17
24
  nk_capability_t static_capabilities = nk_cap_serial_k;
18
25
 
@@ -152,9 +159,10 @@ static napi_value dense(napi_env env, napi_callback_info info, nk_kernel_kind_t
152
159
  // Auto-detect from N-API TypedArray type (backward-compatible 4-type whitelist)
153
160
  if (type_a != napi_float64_array && type_a != napi_float32_array && type_a != napi_int8_array &&
154
161
  type_a != napi_uint8_array) {
155
- napi_throw_error(
162
+ napi_throw_error( //
156
163
  env, NULL,
157
- "Only f64, f32, i8, u8 arrays are auto-detected; pass dtype string as 3rd argument " "for other " "types");
164
+ "Only f64, f32, i8, u8 arrays are auto-detected; " //
165
+ "pass dtype string as 3rd argument for other types");
158
166
  return NULL;
159
167
  }
160
168
  switch (type_a) {
@@ -482,11 +490,11 @@ static napi_value api_dots_pack(napi_env env, napi_callback_info info) {
482
490
  * dtype
483
491
  */
484
492
  static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
485
- size_t argc = 9;
486
- napi_value args[9];
493
+ size_t argc = 10;
494
+ napi_value args[10];
487
495
  napi_get_cb_info(env, info, &argc, args, NULL, NULL);
488
- if (argc != 9) {
489
- napi_throw_error(env, NULL, "Packed operation requires 9 arguments");
496
+ if (argc < 9 || argc > 10) {
497
+ napi_throw_error(env, NULL, "Packed operation requires 9-10 arguments (last is optional threads)");
490
498
  return NULL;
491
499
  }
492
500
 
@@ -533,8 +541,26 @@ static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_ke
533
541
  return NULL;
534
542
  }
535
543
 
536
- kernel(a_data, packed_data, result_data, (nk_size_t)height, (nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride,
537
- (nk_size_t)result_stride);
544
+ uint32_t threads = 1;
545
+ if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
546
+
547
+ #if defined(NK_USE_OPENMP)
548
+ if (threads == 0) threads = (uint32_t)omp_get_max_threads();
549
+ omp_set_num_threads((int)threads);
550
+ #endif
551
+
552
+ // `int` loop counter pre-declared: MSVC's OpenMP stays at 2.0 canonical
553
+ // form, which forbids in-init declarations and rejects 64-bit iterators
554
+ // — either would trip C3015.
555
+ int const tile_count = (int)nk_size_divide_round_up_(height, NK_PARALLEL_PACKED_TILE);
556
+ int tile_idx;
557
+ #pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
558
+ for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
559
+ nk_size_t row = (nk_size_t)tile_idx * NK_PARALLEL_PACKED_TILE;
560
+ nk_size_t chunk = (row + NK_PARALLEL_PACKED_TILE <= height) ? NK_PARALLEL_PACKED_TILE : (height - row);
561
+ kernel((char const *)a_data + row * a_stride, packed_data, (char *)result_data + row * result_stride, chunk,
562
+ (nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride, (nk_size_t)result_stride);
563
+ }
538
564
  return NULL;
539
565
  }
540
566
 
@@ -554,11 +580,11 @@ static napi_value api_euclideans_packed(napi_env env, napi_callback_info info) {
554
580
  * string dtype
555
581
  */
556
582
  static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
557
- size_t argc = 9;
558
- napi_value args[9];
583
+ size_t argc = 10;
584
+ napi_value args[10];
559
585
  napi_get_cb_info(env, info, &argc, args, NULL, NULL);
560
- if (argc != 9) {
561
- napi_throw_error(env, NULL, "Symmetric operation requires 9 arguments");
586
+ if (argc < 9 || argc > 10) {
587
+ napi_throw_error(env, NULL, "Symmetric operation requires 9-10 arguments (last is optional threads)");
562
588
  return NULL;
563
589
  }
564
590
 
@@ -601,8 +627,27 @@ static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk
601
627
  return NULL;
602
628
  }
603
629
 
604
- kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
605
- (nk_size_t)result_stride, (nk_size_t)row_start, (nk_size_t)row_count);
630
+ uint32_t threads = 1;
631
+ if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
632
+
633
+ #if defined(NK_USE_OPENMP)
634
+ if (threads == 0) threads = (uint32_t)omp_get_max_threads();
635
+ omp_set_num_threads((int)threads);
636
+ #endif
637
+
638
+ // `int` loop counter pre-declared: see note at `api_packed_common`.
639
+ int const tile_count = (int)nk_size_divide_round_up_(row_count, NK_PARALLEL_SYMMETRIC_TILE);
640
+ int tile_idx;
641
+ #pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
642
+ for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
643
+ nk_size_t tile_start = (nk_size_t)row_start + (nk_size_t)tile_idx * NK_PARALLEL_SYMMETRIC_TILE;
644
+ nk_size_t tile_rows = (tile_start + NK_PARALLEL_SYMMETRIC_TILE <= (nk_size_t)row_start + row_count)
645
+ ? NK_PARALLEL_SYMMETRIC_TILE
646
+ : ((nk_size_t)row_start + row_count - tile_start);
647
+ kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
648
+ (nk_size_t)result_stride, tile_start, tile_rows);
649
+ }
650
+
606
651
  return NULL;
607
652
  }
608
653
 
@@ -33,6 +33,19 @@ import { getFileName, getRoot } from "bindings";
33
33
  import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype, KernelFamily } from "./types.js";
34
34
 
35
35
  function loadNativeAddon(): any {
36
+ // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
37
+ // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
38
+ // runtime (e.g. one loaded by another native addon) may already be
39
+ // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
40
+ // libiomp5 to coexist; it must be in `process.env` before the `require()`
41
+ // below triggers the addon's `dlopen`, since libomp's constructor reads
42
+ // the env during dependency resolution and cannot be influenced
43
+ // afterwards. Left unguarded because the variable is harmless on
44
+ // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
45
+ // who set it to something else is respected by `??=`. See
46
+ // `python/numkong/__init__.py` for the Python analog.
47
+ process.env.KMP_DUPLICATE_LIB_OK ??= "TRUE";
48
+
36
49
  // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
37
50
  try {
38
51
  const req = createRequire(path.join(getDirName(), "noop.js"));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "numkong",
3
- "version": "7.4.5",
3
+ "version": "7.6.0",
4
4
  "description": "Portable mixed-precision math, linear-algebra, & retrieval library with 2000+ SIMD kernels for x86, Arm, RISC-V, LoongArch, Power, & WebAssembly",
5
5
  "homepage": "https://github.com/ashvardanian/NumKong",
6
6
  "author": "Ash Vardanian",
@@ -98,11 +98,11 @@
98
98
  "printWidth": 120
99
99
  },
100
100
  "optionalDependencies": {
101
- "@numkong/darwin-arm64": "7.4.5",
102
- "@numkong/darwin-x64": "7.4.5",
103
- "@numkong/linux-arm64": "7.4.5",
104
- "@numkong/linux-x64": "7.4.5",
105
- "@numkong/win32-arm64": "7.4.5",
106
- "@numkong/win32-x64": "7.4.5"
101
+ "@numkong/darwin-arm64": "7.6.0",
102
+ "@numkong/darwin-x64": "7.6.0",
103
+ "@numkong/linux-arm64": "7.6.0",
104
+ "@numkong/linux-x64": "7.6.0",
105
+ "@numkong/win32-arm64": "7.6.0",
106
+ "@numkong/win32-x64": "7.6.0"
107
107
  }
108
108
  }
package/probes/probe.js CHANGED
@@ -76,8 +76,8 @@ const PROBES = [
76
76
  ["NK_TARGET_SME2P1", "probes/arm_sme2p1.c", ["-march=armv8-a+sme2p1"], []],
77
77
  ["NK_TARGET_SMEF64", "probes/arm_sme_f64.c", ["-march=armv8-a+sme+sme-f64f64"], []],
78
78
  ["NK_TARGET_SMEHALF", "probes/arm_sme_half.c", ["-march=armv8-a+sme+sme-f16f16"], []],
79
- ["NK_TARGET_SMEBF16", "probes/arm_sme_bf16.c", ["-march=armv8-a+sme2+b16b16"], []],
80
- ["NK_TARGET_SMEBI32", "probes/arm_sme_bi32.c", ["-march=armv8-a+sme2+sme-i16i32"], []],
79
+ ["NK_TARGET_SMEBF16", "probes/arm_sme_bf16.c", ["-march=armv8-a+sme2+sme-b16b16"], []],
80
+ ["NK_TARGET_SMEBI32", "probes/arm_sme_bi32.c", ["-march=armv8-a+sme2"], []],
81
81
  ["NK_TARGET_SMELUT2", "probes/arm_sme_lut2.c", ["-march=armv8-a+sme2+lut"], []],
82
82
  ["NK_TARGET_SMEFA64", "probes/arm_sme_fa64.c", ["-march=armv8-a+sme+sme-fa64"], []],
83
83
  // RISC-V
package/wasm/numkong.wasm CHANGED
Binary file