numkong 7.4.5 → 7.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/binding.gyp +99 -5
- package/c/dispatch_e5m2.c +23 -3
- package/c/dispatch_f16.c +23 -0
- package/c/numkong.c +0 -13
- package/include/numkong/attention/sme.h +34 -31
- package/include/numkong/capabilities.h +2 -15
- package/include/numkong/cast/README.md +3 -0
- package/include/numkong/cast/haswell.h +28 -64
- package/include/numkong/cast/neon.h +15 -0
- package/include/numkong/cast/serial.h +17 -0
- package/include/numkong/cast/skylake.h +67 -52
- package/include/numkong/cast.h +1 -0
- package/include/numkong/curved/smef64.h +82 -62
- package/include/numkong/dot/README.md +1 -0
- package/include/numkong/dot/haswell.h +92 -13
- package/include/numkong/dot/rvvbf16.h +1 -1
- package/include/numkong/dot/rvvhalf.h +1 -1
- package/include/numkong/dot/serial.h +15 -0
- package/include/numkong/dot/skylake.h +61 -14
- package/include/numkong/dot/sve.h +6 -5
- package/include/numkong/dot/svebfdot.h +2 -1
- package/include/numkong/dot/svehalf.h +6 -5
- package/include/numkong/dot/svesdot.h +3 -2
- package/include/numkong/dots/README.md +2 -0
- package/include/numkong/dots/graniteamx.h +1167 -0
- package/include/numkong/dots/haswell.h +28 -28
- package/include/numkong/dots/sapphireamx.h +1 -1
- package/include/numkong/dots/serial.h +33 -11
- package/include/numkong/dots/skylake.h +28 -23
- package/include/numkong/dots/sme.h +172 -140
- package/include/numkong/dots/smebi32.h +14 -11
- package/include/numkong/dots/smef64.h +31 -26
- package/include/numkong/dots.h +41 -3
- package/include/numkong/each/serial.h +39 -0
- package/include/numkong/geospatial/haswell.h +1 -1
- package/include/numkong/geospatial/neon.h +1 -1
- package/include/numkong/geospatial/serial.h +15 -4
- package/include/numkong/geospatial/skylake.h +1 -1
- package/include/numkong/maxsim/serial.h +15 -0
- package/include/numkong/maxsim/sme.h +34 -33
- package/include/numkong/mesh/README.md +50 -44
- package/include/numkong/mesh/genoa.h +462 -0
- package/include/numkong/mesh/haswell.h +806 -933
- package/include/numkong/mesh/neon.h +871 -943
- package/include/numkong/mesh/neonbfdot.h +382 -522
- package/include/numkong/mesh/neonfhm.h +676 -0
- package/include/numkong/mesh/rvv.h +404 -319
- package/include/numkong/mesh/serial.h +225 -161
- package/include/numkong/mesh/skylake.h +1029 -1585
- package/include/numkong/mesh/v128relaxed.h +403 -377
- package/include/numkong/mesh.h +38 -0
- package/include/numkong/reduce/neon.h +29 -0
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +4 -4
- package/include/numkong/reduce/serial.h +15 -1
- package/include/numkong/reduce/sve.h +52 -0
- package/include/numkong/reduce.h +4 -0
- package/include/numkong/set/sve.h +6 -5
- package/include/numkong/sets/smebi32.h +35 -30
- package/include/numkong/sparse/serial.h +17 -2
- package/include/numkong/sparse/sve2.h +3 -2
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +98 -56
- package/include/numkong/spatial/serial.h +15 -0
- package/include/numkong/spatial/skylake.h +114 -54
- package/include/numkong/spatial/sve.h +7 -6
- package/include/numkong/spatial/svebfdot.h +7 -4
- package/include/numkong/spatial/svehalf.h +5 -4
- package/include/numkong/spatial/svesdot.h +9 -8
- package/include/numkong/spatial.h +0 -12
- package/include/numkong/spatials/graniteamx.h +301 -0
- package/include/numkong/spatials/serial.h +39 -0
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +391 -350
- package/include/numkong/spatials/smef64.h +79 -70
- package/include/numkong/spatials.h +54 -4
- package/include/numkong/tensor.hpp +107 -23
- package/include/numkong/types.h +59 -0
- package/javascript/dist/cjs/numkong.js +13 -0
- package/javascript/dist/esm/numkong.js +13 -0
- package/javascript/numkong.c +59 -14
- package/javascript/numkong.ts +13 -0
- package/package.json +7 -7
- package/probes/probe.js +2 -2
- package/wasm/numkong.wasm +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* @brief NumKong Tensor types and tensor-level operations for C++
|
|
2
|
+
* @brief NumKong Tensor types and tensor-level operations for C++20 and newer.
|
|
3
3
|
* @file include/numkong/tensor.hpp
|
|
4
4
|
* @author Ash Vardanian
|
|
5
5
|
* @date March 2026
|
|
@@ -19,7 +19,8 @@
|
|
|
19
19
|
* Features:
|
|
20
20
|
* - Signed strides (ptrdiff_t) for reversed/transposed views
|
|
21
21
|
* - Signed indexing (negative = from end)
|
|
22
|
-
* -
|
|
22
|
+
* - Variadic `operator()` for flat/exact access and trailing `slice` (C++20-portable);
|
|
23
|
+
* `operator[]` multi-arg sugar provided when the compiler supports P2128 (C++23).
|
|
23
24
|
* - Axis iteration (rows_views(), rows_spans(), axis_iterator)
|
|
24
25
|
* - Conversion to vector_view/vector_span for rank-1 tensors
|
|
25
26
|
*/
|
|
@@ -37,6 +38,14 @@
|
|
|
37
38
|
|
|
38
39
|
#include "vector.hpp" // `aligned_allocator`
|
|
39
40
|
|
|
41
|
+
// True when the compiler supports C++23 P2128 multi-arg `operator[]`. Under
|
|
42
|
+
// this gate we expose `t[a, b, c]` as sugar that delegates to `operator()`.
|
|
43
|
+
#if defined(__cpp_multidimensional_subscript) && __cpp_multidimensional_subscript >= 202110L
|
|
44
|
+
#define NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_ 1
|
|
45
|
+
#else
|
|
46
|
+
#define NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_ 0
|
|
47
|
+
#endif
|
|
48
|
+
|
|
40
49
|
namespace ashvardanian::numkong {
|
|
41
50
|
|
|
42
51
|
template <typename value_type_, std::size_t max_rank_>
|
|
@@ -300,26 +309,44 @@ struct tensor_view {
|
|
|
300
309
|
return tensor_flat_lookup_(*this, idx);
|
|
301
310
|
}
|
|
302
311
|
|
|
303
|
-
/** @brief Exact multi-dimensional scalar lookup. */
|
|
312
|
+
/** @brief Exact multi-dimensional scalar lookup via call syntax (C++20-portable). */
|
|
304
313
|
template <std::integral... index_types_>
|
|
305
314
|
requires(sizeof...(index_types_) >= 2)
|
|
306
|
-
decltype(auto) operator
|
|
315
|
+
decltype(auto) operator()(index_types_... idxs) const noexcept {
|
|
307
316
|
nk_assert_(shape_.rank == sizeof...(index_types_));
|
|
308
317
|
auto coords = resolve_tensor_indices_<value_type_>(shape_, std::index_sequence_for<index_types_...> {},
|
|
309
318
|
idxs...);
|
|
310
319
|
return tensor_lookup_resolved_(*this, std::span<std::size_t const, sizeof...(index_types_)>(coords));
|
|
311
320
|
}
|
|
312
321
|
|
|
322
|
+
#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
|
|
323
|
+
/** @brief C++23 sugar: `t[i, j, k]` scalar lookup, delegates to `operator()`. */
|
|
324
|
+
template <std::integral... index_types_>
|
|
325
|
+
requires(sizeof...(index_types_) >= 2)
|
|
326
|
+
decltype(auto) operator[](index_types_... idxs) const noexcept {
|
|
327
|
+
return (*this)(idxs...);
|
|
328
|
+
}
|
|
329
|
+
#endif
|
|
330
|
+
|
|
313
331
|
/** @brief Trailing `slice` returns the same view. */
|
|
314
332
|
constexpr tensor_view operator[](tensor_slice_t) const noexcept { return *this; }
|
|
315
333
|
|
|
316
|
-
/** @brief Prefix leading-axis slicing with a trailing `slice` marker. */
|
|
334
|
+
/** @brief Prefix leading-axis slicing with a trailing `slice` marker (call syntax, C++20-portable). */
|
|
317
335
|
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
318
336
|
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
319
|
-
tensor_view operator
|
|
337
|
+
tensor_view operator()(first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
|
|
320
338
|
return tensor_slice_suffix_(*this, first, second, rest...);
|
|
321
339
|
}
|
|
322
340
|
|
|
341
|
+
#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
|
|
342
|
+
/** @brief C++23 sugar: `t[i, nk::slice]` slicing, delegates to `operator()`. */
|
|
343
|
+
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
344
|
+
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
345
|
+
tensor_view operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
|
|
346
|
+
return (*this)(first, second, rest...);
|
|
347
|
+
}
|
|
348
|
+
#endif
|
|
349
|
+
|
|
323
350
|
/** @brief Rank-0 scalar access. */
|
|
324
351
|
decltype(auto) scalar() const noexcept {
|
|
325
352
|
nk_assert_(shape_.rank == 0);
|
|
@@ -512,22 +539,36 @@ struct tensor_span {
|
|
|
512
539
|
return tensor_flat_lookup_(static_cast<tensor_view<value_type_, max_rank_>>(*this), idx);
|
|
513
540
|
}
|
|
514
541
|
|
|
515
|
-
/** @brief Exact multi-dimensional scalar lookup. */
|
|
542
|
+
/** @brief Exact multi-dimensional scalar lookup via call syntax (C++20-portable). */
|
|
516
543
|
template <std::integral... index_types_>
|
|
517
544
|
requires(sizeof...(index_types_) >= 2)
|
|
518
|
-
decltype(auto) operator
|
|
545
|
+
decltype(auto) operator()(index_types_... idxs) noexcept {
|
|
519
546
|
nk_assert_(shape_.rank == sizeof...(index_types_));
|
|
520
547
|
auto coords = resolve_tensor_indices_<value_type_>(shape_, std::index_sequence_for<index_types_...> {},
|
|
521
548
|
idxs...);
|
|
522
549
|
return tensor_lookup_resolved_(*this, std::span<std::size_t const, sizeof...(index_types_)>(coords));
|
|
523
550
|
}
|
|
524
551
|
|
|
525
|
-
/** @brief Const full-coordinate lookup. */
|
|
552
|
+
/** @brief Const full-coordinate lookup via call syntax. */
|
|
553
|
+
template <std::integral... index_types_>
|
|
554
|
+
requires(sizeof...(index_types_) >= 2)
|
|
555
|
+
decltype(auto) operator()(index_types_... idxs) const noexcept {
|
|
556
|
+
return static_cast<tensor_view<value_type_, max_rank_>>(*this)(idxs...);
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
|
|
560
|
+
/** @brief C++23 sugar: multi-arg `[]` scalar lookup, delegates to `operator()`. */
|
|
561
|
+
template <std::integral... index_types_>
|
|
562
|
+
requires(sizeof...(index_types_) >= 2)
|
|
563
|
+
decltype(auto) operator[](index_types_... idxs) noexcept {
|
|
564
|
+
return (*this)(idxs...);
|
|
565
|
+
}
|
|
526
566
|
template <std::integral... index_types_>
|
|
527
567
|
requires(sizeof...(index_types_) >= 2)
|
|
528
568
|
decltype(auto) operator[](index_types_... idxs) const noexcept {
|
|
529
|
-
return
|
|
569
|
+
return (*this)(idxs...);
|
|
530
570
|
}
|
|
571
|
+
#endif
|
|
531
572
|
|
|
532
573
|
/** @brief Trailing `slice` returns the same span. */
|
|
533
574
|
constexpr tensor_span operator[](tensor_slice_t) noexcept { return *this; }
|
|
@@ -535,21 +576,36 @@ struct tensor_span {
|
|
|
535
576
|
return static_cast<tensor_view<value_type_, max_rank_>>(*this);
|
|
536
577
|
}
|
|
537
578
|
|
|
538
|
-
/** @brief Prefix leading-axis slicing
|
|
579
|
+
/** @brief Prefix leading-axis slicing via call syntax (C++20-portable). */
|
|
539
580
|
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
540
581
|
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
541
|
-
tensor_span operator
|
|
582
|
+
tensor_span operator()(first_type_ first, second_type_ second, rest_types_... rest) noexcept {
|
|
542
583
|
return tensor_slice_suffix_(*this, first, second, rest...);
|
|
543
584
|
}
|
|
544
585
|
|
|
545
|
-
/** @brief Const prefix leading-axis slicing
|
|
586
|
+
/** @brief Const prefix leading-axis slicing via call syntax. */
|
|
546
587
|
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
547
588
|
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
548
|
-
tensor_view<value_type_, max_rank_> operator
|
|
589
|
+
tensor_view<value_type_, max_rank_> operator()(first_type_ first, second_type_ second,
|
|
549
590
|
rest_types_... rest) const noexcept {
|
|
550
591
|
return tensor_slice_suffix_(static_cast<tensor_view<value_type_, max_rank_>>(*this), first, second, rest...);
|
|
551
592
|
}
|
|
552
593
|
|
|
594
|
+
#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
|
|
595
|
+
/** @brief C++23 sugar: multi-arg `[]` slicing, delegates to `operator()`. */
|
|
596
|
+
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
597
|
+
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
598
|
+
tensor_span operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
|
|
599
|
+
return (*this)(first, second, rest...);
|
|
600
|
+
}
|
|
601
|
+
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
602
|
+
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
603
|
+
tensor_view<value_type_, max_rank_> operator[](first_type_ first, second_type_ second,
|
|
604
|
+
rest_types_... rest) const noexcept {
|
|
605
|
+
return (*this)(first, second, rest...);
|
|
606
|
+
}
|
|
607
|
+
#endif
|
|
608
|
+
|
|
553
609
|
/** @brief Rank-0 mutable scalar access. */
|
|
554
610
|
decltype(auto) scalar_ref() noexcept {
|
|
555
611
|
nk_assert_(shape_.rank == 0);
|
|
@@ -1546,38 +1602,66 @@ struct tensor {
|
|
|
1546
1602
|
return view()[idx];
|
|
1547
1603
|
}
|
|
1548
1604
|
|
|
1549
|
-
/** @brief Exact multi-dimensional scalar lookup. */
|
|
1605
|
+
/** @brief Exact multi-dimensional scalar lookup via call syntax (C++20-portable). */
|
|
1550
1606
|
template <std::integral... index_types_>
|
|
1551
1607
|
requires(sizeof...(index_types_) >= 2)
|
|
1552
|
-
decltype(auto) operator
|
|
1553
|
-
return span()
|
|
1608
|
+
decltype(auto) operator()(index_types_... idxs) noexcept {
|
|
1609
|
+
return span()(idxs...);
|
|
1610
|
+
}
|
|
1611
|
+
|
|
1612
|
+
/** @brief Const multidimensional lookup via call syntax. */
|
|
1613
|
+
template <std::integral... index_types_>
|
|
1614
|
+
requires(sizeof...(index_types_) >= 2)
|
|
1615
|
+
decltype(auto) operator()(index_types_... idxs) const noexcept {
|
|
1616
|
+
return view()(idxs...);
|
|
1554
1617
|
}
|
|
1555
1618
|
|
|
1556
|
-
|
|
1619
|
+
#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
|
|
1620
|
+
/** @brief C++23 sugar: multi-arg `[]` scalar lookup, delegates to `operator()`. */
|
|
1621
|
+
template <std::integral... index_types_>
|
|
1622
|
+
requires(sizeof...(index_types_) >= 2)
|
|
1623
|
+
decltype(auto) operator[](index_types_... idxs) noexcept {
|
|
1624
|
+
return (*this)(idxs...);
|
|
1625
|
+
}
|
|
1557
1626
|
template <std::integral... index_types_>
|
|
1558
1627
|
requires(sizeof...(index_types_) >= 2)
|
|
1559
1628
|
decltype(auto) operator[](index_types_... idxs) const noexcept {
|
|
1560
|
-
return
|
|
1629
|
+
return (*this)(idxs...);
|
|
1561
1630
|
}
|
|
1631
|
+
#endif
|
|
1562
1632
|
|
|
1563
1633
|
/** @brief Trailing `slice` returns the same tensor view/span category. */
|
|
1564
1634
|
span_type operator[](tensor_slice_t) noexcept { return span(); }
|
|
1565
1635
|
view_type operator[](tensor_slice_t) const noexcept { return view(); }
|
|
1566
1636
|
|
|
1567
|
-
/** @brief Prefix leading-axis slicing
|
|
1637
|
+
/** @brief Prefix leading-axis slicing via call syntax (C++20-portable). */
|
|
1568
1638
|
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
1569
1639
|
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
1570
|
-
span_type operator
|
|
1640
|
+
span_type operator()(first_type_ first, second_type_ second, rest_types_... rest) noexcept {
|
|
1571
1641
|
return tensor_slice_suffix_(span(), first, second, rest...);
|
|
1572
1642
|
}
|
|
1573
1643
|
|
|
1574
|
-
/** @brief Const prefix leading-axis slicing
|
|
1644
|
+
/** @brief Const prefix leading-axis slicing via call syntax. */
|
|
1575
1645
|
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
1576
1646
|
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
1577
|
-
view_type operator
|
|
1647
|
+
view_type operator()(first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
|
|
1578
1648
|
return tensor_slice_suffix_(view(), first, second, rest...);
|
|
1579
1649
|
}
|
|
1580
1650
|
|
|
1651
|
+
#if NK_HAS_MULTIDIMENSIONAL_SUBSCRIPT_
|
|
1652
|
+
/** @brief C++23 sugar: multi-arg `[]` slicing, delegates to `operator()`. */
|
|
1653
|
+
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
1654
|
+
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
1655
|
+
span_type operator[](first_type_ first, second_type_ second, rest_types_... rest) noexcept {
|
|
1656
|
+
return (*this)(first, second, rest...);
|
|
1657
|
+
}
|
|
1658
|
+
template <typename first_type_, typename second_type_, typename... rest_types_>
|
|
1659
|
+
requires(trailing_tensor_slice_args_v<first_type_, second_type_, rest_types_...>)
|
|
1660
|
+
view_type operator[](first_type_ first, second_type_ second, rest_types_... rest) const noexcept {
|
|
1661
|
+
return (*this)(first, second, rest...);
|
|
1662
|
+
}
|
|
1663
|
+
#endif
|
|
1664
|
+
|
|
1581
1665
|
/** @brief Rank-0 mutable scalar access. */
|
|
1582
1666
|
decltype(auto) scalar_ref() noexcept { return span().scalar_ref(); }
|
|
1583
1667
|
|
package/include/numkong/types.h
CHANGED
|
@@ -69,6 +69,20 @@
|
|
|
69
69
|
#define _GNU_SOURCE
|
|
70
70
|
#endif
|
|
71
71
|
|
|
72
|
+
// MSan (MemorySanitizer) cannot track data flow through SVE horizontal reductions
|
|
73
|
+
// like `svaddv`, which move data from vector registers to scalar registers via
|
|
74
|
+
// architecture-specific paths invisible to the compiler. `nk_unpoison_` marks the
|
|
75
|
+
// resulting scalar as initialized so MSan does not report false positives.
|
|
76
|
+
#if defined(__has_feature)
|
|
77
|
+
#if __has_feature(memory_sanitizer)
|
|
78
|
+
#include <sanitizer/msan_interface.h>
|
|
79
|
+
#define nk_unpoison_(ptr, size) __msan_unpoison((ptr), (size))
|
|
80
|
+
#endif
|
|
81
|
+
#endif
|
|
82
|
+
#ifndef nk_unpoison_
|
|
83
|
+
#define nk_unpoison_(ptr, size) (void)(ptr), (void)(size)
|
|
84
|
+
#endif
|
|
85
|
+
|
|
72
86
|
// Inferring target OS: Windows, macOS, Linux, or FreeBSD
|
|
73
87
|
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
|
|
74
88
|
#define NK_DEFINED_WINDOWS_ 1
|
|
@@ -1627,6 +1641,51 @@ NK_INTERNAL nk_size_t nk_sme_cntd_(void) {
|
|
|
1627
1641
|
__asm__ __volatile__("smstart sm\n\t" "cntd %0\n\t" "smstop sm" : "=r"(r));
|
|
1628
1642
|
return (nk_size_t)r;
|
|
1629
1643
|
}
|
|
1644
|
+
|
|
1645
|
+
/** @brief Enter streaming SVE mode (PSTATE.SM = 1). Caller is responsible for smstop. */
|
|
1646
|
+
NK_INTERNAL void nk_sme_start_streaming_(void) { __asm__ __volatile__("smstart sm" ::: "memory"); }
|
|
1647
|
+
/** @brief Exit streaming SVE mode (PSTATE.SM = 0). Must pair with nk_sme_start_streaming_. */
|
|
1648
|
+
NK_INTERNAL void nk_sme_stop_streaming_(void) { __asm__ __volatile__("smstop sm" ::: "memory"); }
|
|
1649
|
+
|
|
1650
|
+
/**
|
|
1651
|
+
* SME runtime stubs — weak definitions for symbols the compiler may reference
|
|
1652
|
+
* from __arm_streaming or __arm_new("za") functions. Every TU that includes
|
|
1653
|
+
* this header emits a weak copy; the linker deduplicates to one.
|
|
1654
|
+
*
|
|
1655
|
+
* - __arm_tpidr2_save / __arm_tpidr2_restore: lazy ZA save/restore protocol
|
|
1656
|
+
* used in __arm_new("za") prologues. Always no-ops in NumKong because no
|
|
1657
|
+
* NK_PUBLIC function carries ZA state (TPIDR2_EL0 is always null at entry).
|
|
1658
|
+
*
|
|
1659
|
+
* - __arm_sc_memset / __arm_sc_memcpy / __arm_sc_memmove: streaming-compatible
|
|
1660
|
+
* memory routines the compiler may emit inside __arm_streaming functions.
|
|
1661
|
+
* Apple Clang provides these in its runtime; upstream LLVM does not.
|
|
1662
|
+
*/
|
|
1663
|
+
__attribute__((weak)) void __arm_tpidr2_save(void) {}
|
|
1664
|
+
__attribute__((weak)) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
|
|
1665
|
+
__attribute__((weak, target("+sme"))) void *__arm_sc_memset(void *d, int c, __SIZE_TYPE__ n) __arm_streaming_compatible {
|
|
1666
|
+
unsigned char *p = (unsigned char *)d;
|
|
1667
|
+
for (__SIZE_TYPE__ i = 0; i < n; i++) p[i] = (unsigned char)c;
|
|
1668
|
+
return d;
|
|
1669
|
+
}
|
|
1670
|
+
__attribute__((weak, target("+sme"))) void *__arm_sc_memcpy(void *d, void const *s,
|
|
1671
|
+
__SIZE_TYPE__ n) __arm_streaming_compatible {
|
|
1672
|
+
unsigned char *dp = (unsigned char *)d;
|
|
1673
|
+
unsigned char const *sp = (unsigned char const *)s;
|
|
1674
|
+
for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
|
|
1675
|
+
return d;
|
|
1676
|
+
}
|
|
1677
|
+
__attribute__((weak, target("+sme"))) void *__arm_sc_memmove(void *d, void const *s,
|
|
1678
|
+
__SIZE_TYPE__ n) __arm_streaming_compatible {
|
|
1679
|
+
unsigned char *dp = (unsigned char *)d;
|
|
1680
|
+
unsigned char const *sp = (unsigned char const *)s;
|
|
1681
|
+
if (dp < sp) {
|
|
1682
|
+
for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
|
|
1683
|
+
}
|
|
1684
|
+
else {
|
|
1685
|
+
for (__SIZE_TYPE__ i = n; i > 0; i--) dp[i - 1] = sp[i - 1];
|
|
1686
|
+
}
|
|
1687
|
+
return d;
|
|
1688
|
+
}
|
|
1630
1689
|
#endif
|
|
1631
1690
|
|
|
1632
1691
|
#ifdef __cplusplus
|
|
@@ -99,6 +99,19 @@ Object.defineProperty(exports, "PackedMatrix", { enumerable: true, get: function
|
|
|
99
99
|
Object.defineProperty(exports, "DType", { enumerable: true, get: function () { return types_js_1.DType; } });
|
|
100
100
|
Object.defineProperty(exports, "outputDtype", { enumerable: true, get: function () { return types_js_1.outputDtype; } });
|
|
101
101
|
function loadNativeAddon() {
|
|
102
|
+
var _a;
|
|
103
|
+
// Duplicate-libomp guard. We ship our own `libomp.dylib` next to
|
|
104
|
+
// `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
|
|
105
|
+
// runtime (e.g. one loaded by another native addon) may already be
|
|
106
|
+
// resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
|
|
107
|
+
// libiomp5 to coexist; it must be in `process.env` before the `require()`
|
|
108
|
+
// below triggers the addon's `dlopen`, since libomp's constructor reads
|
|
109
|
+
// the env during dependency resolution and is too late to influence
|
|
110
|
+
// afterwards. Left unguarded because the variable is harmless on
|
|
111
|
+
// platforms / runtimes (GCC libgomp) that don't recognize it, and a user
|
|
112
|
+
// who set it to something else is respected by `??=`. See
|
|
113
|
+
// `python/numkong/__init__.py` for the Python analog.
|
|
114
|
+
(_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
|
|
102
115
|
// Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
|
|
103
116
|
try {
|
|
104
117
|
const req = (0, node_module_1.createRequire)(path.join(getDirName(), "noop.js"));
|
|
@@ -31,6 +31,19 @@ import { existsSync } from "node:fs";
|
|
|
31
31
|
import { getFileName, getRoot } from "bindings";
|
|
32
32
|
import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype } from "./types.js";
|
|
33
33
|
function loadNativeAddon() {
|
|
34
|
+
var _a;
|
|
35
|
+
// Duplicate-libomp guard. We ship our own `libomp.dylib` next to
|
|
36
|
+
// `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
|
|
37
|
+
// runtime (e.g. one loaded by another native addon) may already be
|
|
38
|
+
// resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
|
|
39
|
+
// libiomp5 to coexist; it must be in `process.env` before the `require()`
|
|
40
|
+
// below triggers the addon's `dlopen`, since libomp's constructor reads
|
|
41
|
+
// the env during dependency resolution and is too late to influence
|
|
42
|
+
// afterwards. Left unguarded because the variable is harmless on
|
|
43
|
+
// platforms / runtimes (GCC libgomp) that don't recognize it, and a user
|
|
44
|
+
// who set it to something else is respected by `??=`. See
|
|
45
|
+
// `python/numkong/__init__.py` for the Python analog.
|
|
46
|
+
(_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
|
|
34
47
|
// Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
|
|
35
48
|
try {
|
|
36
49
|
const req = createRequire(path.join(getDirName(), "noop.js"));
|
package/javascript/numkong.c
CHANGED
|
@@ -9,10 +9,17 @@
|
|
|
9
9
|
|
|
10
10
|
#include <string.h> // `strcmp` function
|
|
11
11
|
|
|
12
|
+
#if defined(NK_USE_OPENMP)
|
|
13
|
+
#include <omp.h>
|
|
14
|
+
#endif
|
|
15
|
+
|
|
12
16
|
#include <node_api.h> // `napi_*` functions — N-API v6+ for BigInt (Node ≥ 10.20)
|
|
13
17
|
|
|
14
18
|
#include <numkong/numkong.h> // `nk_*` functions — must be first to bring `_GNU_SOURCE`
|
|
15
19
|
|
|
20
|
+
#define NK_PARALLEL_PACKED_TILE 64
|
|
21
|
+
#define NK_PARALLEL_SYMMETRIC_TILE 32
|
|
22
|
+
|
|
16
23
|
/** @brief Global variable that caches the CPU capabilities, and is computed just once, when the module is loaded. */
|
|
17
24
|
nk_capability_t static_capabilities = nk_cap_serial_k;
|
|
18
25
|
|
|
@@ -152,9 +159,10 @@ static napi_value dense(napi_env env, napi_callback_info info, nk_kernel_kind_t
|
|
|
152
159
|
// Auto-detect from N-API TypedArray type (backward-compatible 4-type whitelist)
|
|
153
160
|
if (type_a != napi_float64_array && type_a != napi_float32_array && type_a != napi_int8_array &&
|
|
154
161
|
type_a != napi_uint8_array) {
|
|
155
|
-
napi_throw_error(
|
|
162
|
+
napi_throw_error( //
|
|
156
163
|
env, NULL,
|
|
157
|
-
"Only f64, f32, i8, u8 arrays are auto-detected;
|
|
164
|
+
"Only f64, f32, i8, u8 arrays are auto-detected; " //
|
|
165
|
+
"pass dtype string as 3rd argument for other types");
|
|
158
166
|
return NULL;
|
|
159
167
|
}
|
|
160
168
|
switch (type_a) {
|
|
@@ -482,11 +490,11 @@ static napi_value api_dots_pack(napi_env env, napi_callback_info info) {
|
|
|
482
490
|
* dtype
|
|
483
491
|
*/
|
|
484
492
|
static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
|
|
485
|
-
size_t argc =
|
|
486
|
-
napi_value args[
|
|
493
|
+
size_t argc = 10;
|
|
494
|
+
napi_value args[10];
|
|
487
495
|
napi_get_cb_info(env, info, &argc, args, NULL, NULL);
|
|
488
|
-
if (argc
|
|
489
|
-
napi_throw_error(env, NULL, "Packed operation requires 9 arguments");
|
|
496
|
+
if (argc < 9 || argc > 10) {
|
|
497
|
+
napi_throw_error(env, NULL, "Packed operation requires 9-10 arguments (last is optional threads)");
|
|
490
498
|
return NULL;
|
|
491
499
|
}
|
|
492
500
|
|
|
@@ -533,8 +541,26 @@ static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_ke
|
|
|
533
541
|
return NULL;
|
|
534
542
|
}
|
|
535
543
|
|
|
536
|
-
|
|
537
|
-
|
|
544
|
+
uint32_t threads = 1;
|
|
545
|
+
if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
|
|
546
|
+
|
|
547
|
+
#if defined(NK_USE_OPENMP)
|
|
548
|
+
if (threads == 0) threads = (uint32_t)omp_get_max_threads();
|
|
549
|
+
omp_set_num_threads((int)threads);
|
|
550
|
+
#endif
|
|
551
|
+
|
|
552
|
+
// `int` loop counter pre-declared: MSVC's OpenMP stays at 2.0 canonical
|
|
553
|
+
// form, which forbids in-init declarations and rejects 64-bit iterators
|
|
554
|
+
// — either would trip C3015.
|
|
555
|
+
int const tile_count = (int)nk_size_divide_round_up_(height, NK_PARALLEL_PACKED_TILE);
|
|
556
|
+
int tile_idx;
|
|
557
|
+
#pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
|
|
558
|
+
for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
|
|
559
|
+
nk_size_t row = (nk_size_t)tile_idx * NK_PARALLEL_PACKED_TILE;
|
|
560
|
+
nk_size_t chunk = (row + NK_PARALLEL_PACKED_TILE <= height) ? NK_PARALLEL_PACKED_TILE : (height - row);
|
|
561
|
+
kernel((char const *)a_data + row * a_stride, packed_data, (char *)result_data + row * result_stride, chunk,
|
|
562
|
+
(nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride, (nk_size_t)result_stride);
|
|
563
|
+
}
|
|
538
564
|
return NULL;
|
|
539
565
|
}
|
|
540
566
|
|
|
@@ -554,11 +580,11 @@ static napi_value api_euclideans_packed(napi_env env, napi_callback_info info) {
|
|
|
554
580
|
* string dtype
|
|
555
581
|
*/
|
|
556
582
|
static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
|
|
557
|
-
size_t argc =
|
|
558
|
-
napi_value args[
|
|
583
|
+
size_t argc = 10;
|
|
584
|
+
napi_value args[10];
|
|
559
585
|
napi_get_cb_info(env, info, &argc, args, NULL, NULL);
|
|
560
|
-
if (argc
|
|
561
|
-
napi_throw_error(env, NULL, "Symmetric operation requires 9 arguments");
|
|
586
|
+
if (argc < 9 || argc > 10) {
|
|
587
|
+
napi_throw_error(env, NULL, "Symmetric operation requires 9-10 arguments (last is optional threads)");
|
|
562
588
|
return NULL;
|
|
563
589
|
}
|
|
564
590
|
|
|
@@ -601,8 +627,27 @@ static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk
|
|
|
601
627
|
return NULL;
|
|
602
628
|
}
|
|
603
629
|
|
|
604
|
-
|
|
605
|
-
|
|
630
|
+
uint32_t threads = 1;
|
|
631
|
+
if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
|
|
632
|
+
|
|
633
|
+
#if defined(NK_USE_OPENMP)
|
|
634
|
+
if (threads == 0) threads = (uint32_t)omp_get_max_threads();
|
|
635
|
+
omp_set_num_threads((int)threads);
|
|
636
|
+
#endif
|
|
637
|
+
|
|
638
|
+
// `int` loop counter pre-declared: see note at `api_packed_common`.
|
|
639
|
+
int const tile_count = (int)nk_size_divide_round_up_(row_count, NK_PARALLEL_SYMMETRIC_TILE);
|
|
640
|
+
int tile_idx;
|
|
641
|
+
#pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
|
|
642
|
+
for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
|
|
643
|
+
nk_size_t tile_start = (nk_size_t)row_start + (nk_size_t)tile_idx * NK_PARALLEL_SYMMETRIC_TILE;
|
|
644
|
+
nk_size_t tile_rows = (tile_start + NK_PARALLEL_SYMMETRIC_TILE <= (nk_size_t)row_start + row_count)
|
|
645
|
+
? NK_PARALLEL_SYMMETRIC_TILE
|
|
646
|
+
: ((nk_size_t)row_start + row_count - tile_start);
|
|
647
|
+
kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
|
|
648
|
+
(nk_size_t)result_stride, tile_start, tile_rows);
|
|
649
|
+
}
|
|
650
|
+
|
|
606
651
|
return NULL;
|
|
607
652
|
}
|
|
608
653
|
|
package/javascript/numkong.ts
CHANGED
|
@@ -33,6 +33,19 @@ import { getFileName, getRoot } from "bindings";
|
|
|
33
33
|
import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype, KernelFamily } from "./types.js";
|
|
34
34
|
|
|
35
35
|
function loadNativeAddon(): any {
|
|
36
|
+
// Duplicate-libomp guard. We ship our own `libomp.dylib` next to
|
|
37
|
+
// `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
|
|
38
|
+
// runtime (e.g. one loaded by another native addon) may already be
|
|
39
|
+
// resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
|
|
40
|
+
// libiomp5 to coexist; it must be in `process.env` before the `require()`
|
|
41
|
+
// below triggers the addon's `dlopen`, since libomp's constructor reads
|
|
42
|
+
// the env during dependency resolution and is too late to influence
|
|
43
|
+
// afterwards. Left unguarded because the variable is harmless on
|
|
44
|
+
// platforms / runtimes (GCC libgomp) that don't recognize it, and a user
|
|
45
|
+
// who set it to something else is respected by `??=`. See
|
|
46
|
+
// `python/numkong/__init__.py` for the Python analog.
|
|
47
|
+
process.env.KMP_DUPLICATE_LIB_OK ??= "TRUE";
|
|
48
|
+
|
|
36
49
|
// Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
|
|
37
50
|
try {
|
|
38
51
|
const req = createRequire(path.join(getDirName(), "noop.js"));
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "numkong",
|
|
3
|
-
"version": "7.
|
|
3
|
+
"version": "7.6.0",
|
|
4
4
|
"description": "Portable mixed-precision math, linear-algebra, & retrieval library with 2000+ SIMD kernels for x86, Arm, RISC-V, LoongArch, Power, & WebAssembly",
|
|
5
5
|
"homepage": "https://github.com/ashvardanian/NumKong",
|
|
6
6
|
"author": "Ash Vardanian",
|
|
@@ -98,11 +98,11 @@
|
|
|
98
98
|
"printWidth": 120
|
|
99
99
|
},
|
|
100
100
|
"optionalDependencies": {
|
|
101
|
-
"@numkong/darwin-arm64": "7.
|
|
102
|
-
"@numkong/darwin-x64": "7.
|
|
103
|
-
"@numkong/linux-arm64": "7.
|
|
104
|
-
"@numkong/linux-x64": "7.
|
|
105
|
-
"@numkong/win32-arm64": "7.
|
|
106
|
-
"@numkong/win32-x64": "7.
|
|
101
|
+
"@numkong/darwin-arm64": "7.6.0",
|
|
102
|
+
"@numkong/darwin-x64": "7.6.0",
|
|
103
|
+
"@numkong/linux-arm64": "7.6.0",
|
|
104
|
+
"@numkong/linux-x64": "7.6.0",
|
|
105
|
+
"@numkong/win32-arm64": "7.6.0",
|
|
106
|
+
"@numkong/win32-x64": "7.6.0"
|
|
107
107
|
}
|
|
108
108
|
}
|
package/probes/probe.js
CHANGED
|
@@ -76,8 +76,8 @@ const PROBES = [
|
|
|
76
76
|
["NK_TARGET_SME2P1", "probes/arm_sme2p1.c", ["-march=armv8-a+sme2p1"], []],
|
|
77
77
|
["NK_TARGET_SMEF64", "probes/arm_sme_f64.c", ["-march=armv8-a+sme+sme-f64f64"], []],
|
|
78
78
|
["NK_TARGET_SMEHALF", "probes/arm_sme_half.c", ["-march=armv8-a+sme+sme-f16f16"], []],
|
|
79
|
-
["NK_TARGET_SMEBF16", "probes/arm_sme_bf16.c", ["-march=armv8-a+sme2+b16b16"], []],
|
|
80
|
-
["NK_TARGET_SMEBI32", "probes/arm_sme_bi32.c", ["-march=armv8-a+sme2
|
|
79
|
+
["NK_TARGET_SMEBF16", "probes/arm_sme_bf16.c", ["-march=armv8-a+sme2+sme-b16b16"], []],
|
|
80
|
+
["NK_TARGET_SMEBI32", "probes/arm_sme_bi32.c", ["-march=armv8-a+sme2"], []],
|
|
81
81
|
["NK_TARGET_SMELUT2", "probes/arm_sme_lut2.c", ["-march=armv8-a+sme2+lut"], []],
|
|
82
82
|
["NK_TARGET_SMEFA64", "probes/arm_sme_fa64.c", ["-march=armv8-a+sme+sme-fa64"], []],
|
|
83
83
|
// RISC-V
|
package/wasm/numkong.wasm
CHANGED
|
Binary file
|