numkong 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +495 -0
- package/binding.gyp +540 -0
- package/c/dispatch.h +512 -0
- package/c/dispatch_bf16.c +389 -0
- package/c/dispatch_bf16c.c +52 -0
- package/c/dispatch_e2m3.c +263 -0
- package/c/dispatch_e3m2.c +243 -0
- package/c/dispatch_e4m3.c +276 -0
- package/c/dispatch_e5m2.c +272 -0
- package/c/dispatch_f16.c +376 -0
- package/c/dispatch_f16c.c +58 -0
- package/c/dispatch_f32.c +378 -0
- package/c/dispatch_f32c.c +99 -0
- package/c/dispatch_f64.c +296 -0
- package/c/dispatch_f64c.c +98 -0
- package/c/dispatch_i16.c +96 -0
- package/c/dispatch_i32.c +89 -0
- package/c/dispatch_i4.c +150 -0
- package/c/dispatch_i64.c +86 -0
- package/c/dispatch_i8.c +289 -0
- package/c/dispatch_other.c +330 -0
- package/c/dispatch_u1.c +148 -0
- package/c/dispatch_u16.c +124 -0
- package/c/dispatch_u32.c +118 -0
- package/c/dispatch_u4.c +150 -0
- package/c/dispatch_u64.c +102 -0
- package/c/dispatch_u8.c +303 -0
- package/c/numkong.c +950 -0
- package/include/README.md +573 -0
- package/include/module.modulemap +129 -0
- package/include/numkong/attention/sapphireamx.h +1361 -0
- package/include/numkong/attention/sme.h +2066 -0
- package/include/numkong/attention.h +49 -0
- package/include/numkong/capabilities.h +748 -0
- package/include/numkong/cast/README.md +262 -0
- package/include/numkong/cast/haswell.h +975 -0
- package/include/numkong/cast/icelake.h +470 -0
- package/include/numkong/cast/neon.h +1192 -0
- package/include/numkong/cast/rvv.h +1021 -0
- package/include/numkong/cast/sapphire.h +262 -0
- package/include/numkong/cast/serial.h +2262 -0
- package/include/numkong/cast/skylake.h +856 -0
- package/include/numkong/cast/v128relaxed.h +180 -0
- package/include/numkong/cast.h +230 -0
- package/include/numkong/curved/README.md +223 -0
- package/include/numkong/curved/genoa.h +182 -0
- package/include/numkong/curved/haswell.h +276 -0
- package/include/numkong/curved/neon.h +205 -0
- package/include/numkong/curved/neonbfdot.h +212 -0
- package/include/numkong/curved/neonhalf.h +212 -0
- package/include/numkong/curved/rvv.h +305 -0
- package/include/numkong/curved/serial.h +207 -0
- package/include/numkong/curved/skylake.h +457 -0
- package/include/numkong/curved/smef64.h +506 -0
- package/include/numkong/curved.h +517 -0
- package/include/numkong/curved.hpp +144 -0
- package/include/numkong/dot/README.md +425 -0
- package/include/numkong/dot/alder.h +563 -0
- package/include/numkong/dot/genoa.h +315 -0
- package/include/numkong/dot/haswell.h +1688 -0
- package/include/numkong/dot/icelake.h +883 -0
- package/include/numkong/dot/neon.h +818 -0
- package/include/numkong/dot/neonbfdot.h +244 -0
- package/include/numkong/dot/neonfhm.h +360 -0
- package/include/numkong/dot/neonhalf.h +198 -0
- package/include/numkong/dot/neonsdot.h +508 -0
- package/include/numkong/dot/rvv.h +714 -0
- package/include/numkong/dot/rvvbb.h +72 -0
- package/include/numkong/dot/rvvbf16.h +123 -0
- package/include/numkong/dot/rvvhalf.h +129 -0
- package/include/numkong/dot/sapphire.h +141 -0
- package/include/numkong/dot/serial.h +838 -0
- package/include/numkong/dot/sierra.h +405 -0
- package/include/numkong/dot/skylake.h +1084 -0
- package/include/numkong/dot/sve.h +379 -0
- package/include/numkong/dot/svebfdot.h +74 -0
- package/include/numkong/dot/svehalf.h +123 -0
- package/include/numkong/dot/v128relaxed.h +1258 -0
- package/include/numkong/dot.h +1070 -0
- package/include/numkong/dot.hpp +94 -0
- package/include/numkong/dots/README.md +496 -0
- package/include/numkong/dots/alder.h +114 -0
- package/include/numkong/dots/genoa.h +94 -0
- package/include/numkong/dots/haswell.h +295 -0
- package/include/numkong/dots/icelake.h +171 -0
- package/include/numkong/dots/neon.h +120 -0
- package/include/numkong/dots/neonbfdot.h +58 -0
- package/include/numkong/dots/neonfhm.h +94 -0
- package/include/numkong/dots/neonhalf.h +57 -0
- package/include/numkong/dots/neonsdot.h +108 -0
- package/include/numkong/dots/rvv.h +2486 -0
- package/include/numkong/dots/sapphireamx.h +3973 -0
- package/include/numkong/dots/serial.h +2844 -0
- package/include/numkong/dots/sierra.h +97 -0
- package/include/numkong/dots/skylake.h +196 -0
- package/include/numkong/dots/sme.h +5372 -0
- package/include/numkong/dots/smebi32.h +461 -0
- package/include/numkong/dots/smef64.h +1318 -0
- package/include/numkong/dots/smehalf.h +47 -0
- package/include/numkong/dots/v128relaxed.h +294 -0
- package/include/numkong/dots.h +2804 -0
- package/include/numkong/dots.hpp +639 -0
- package/include/numkong/each/README.md +469 -0
- package/include/numkong/each/haswell.h +1658 -0
- package/include/numkong/each/icelake.h +272 -0
- package/include/numkong/each/neon.h +1104 -0
- package/include/numkong/each/neonbfdot.h +212 -0
- package/include/numkong/each/neonhalf.h +410 -0
- package/include/numkong/each/rvv.h +1121 -0
- package/include/numkong/each/sapphire.h +477 -0
- package/include/numkong/each/serial.h +260 -0
- package/include/numkong/each/skylake.h +1562 -0
- package/include/numkong/each.h +2146 -0
- package/include/numkong/each.hpp +434 -0
- package/include/numkong/geospatial/README.md +147 -0
- package/include/numkong/geospatial/haswell.h +593 -0
- package/include/numkong/geospatial/neon.h +571 -0
- package/include/numkong/geospatial/rvv.h +701 -0
- package/include/numkong/geospatial/serial.h +309 -0
- package/include/numkong/geospatial/skylake.h +577 -0
- package/include/numkong/geospatial/v128relaxed.h +613 -0
- package/include/numkong/geospatial.h +453 -0
- package/include/numkong/geospatial.hpp +235 -0
- package/include/numkong/matrix.hpp +336 -0
- package/include/numkong/maxsim/README.md +187 -0
- package/include/numkong/maxsim/alder.h +511 -0
- package/include/numkong/maxsim/genoa.h +115 -0
- package/include/numkong/maxsim/haswell.h +553 -0
- package/include/numkong/maxsim/icelake.h +480 -0
- package/include/numkong/maxsim/neonsdot.h +394 -0
- package/include/numkong/maxsim/sapphireamx.h +877 -0
- package/include/numkong/maxsim/serial.h +490 -0
- package/include/numkong/maxsim/sme.h +929 -0
- package/include/numkong/maxsim/v128relaxed.h +280 -0
- package/include/numkong/maxsim.h +571 -0
- package/include/numkong/maxsim.hpp +133 -0
- package/include/numkong/mesh/README.md +227 -0
- package/include/numkong/mesh/haswell.h +2235 -0
- package/include/numkong/mesh/neon.h +1329 -0
- package/include/numkong/mesh/neonbfdot.h +842 -0
- package/include/numkong/mesh/neonhalf.h +616 -0
- package/include/numkong/mesh/rvv.h +916 -0
- package/include/numkong/mesh/serial.h +742 -0
- package/include/numkong/mesh/skylake.h +1135 -0
- package/include/numkong/mesh/v128relaxed.h +1052 -0
- package/include/numkong/mesh.h +652 -0
- package/include/numkong/mesh.hpp +762 -0
- package/include/numkong/numkong.h +78 -0
- package/include/numkong/numkong.hpp +57 -0
- package/include/numkong/probability/README.md +173 -0
- package/include/numkong/probability/haswell.h +267 -0
- package/include/numkong/probability/neon.h +225 -0
- package/include/numkong/probability/rvv.h +409 -0
- package/include/numkong/probability/serial.h +169 -0
- package/include/numkong/probability/skylake.h +324 -0
- package/include/numkong/probability.h +383 -0
- package/include/numkong/probability.hpp +120 -0
- package/include/numkong/random.h +50 -0
- package/include/numkong/random.hpp +285 -0
- package/include/numkong/reduce/README.md +547 -0
- package/include/numkong/reduce/alder.h +632 -0
- package/include/numkong/reduce/genoa.h +201 -0
- package/include/numkong/reduce/haswell.h +3783 -0
- package/include/numkong/reduce/icelake.h +549 -0
- package/include/numkong/reduce/neon.h +3841 -0
- package/include/numkong/reduce/neonbfdot.h +353 -0
- package/include/numkong/reduce/neonfhm.h +665 -0
- package/include/numkong/reduce/neonhalf.h +157 -0
- package/include/numkong/reduce/neonsdot.h +357 -0
- package/include/numkong/reduce/rvv.h +3407 -0
- package/include/numkong/reduce/serial.h +757 -0
- package/include/numkong/reduce/sierra.h +338 -0
- package/include/numkong/reduce/skylake.h +3792 -0
- package/include/numkong/reduce/v128relaxed.h +2302 -0
- package/include/numkong/reduce.h +1597 -0
- package/include/numkong/reduce.hpp +633 -0
- package/include/numkong/scalar/README.md +89 -0
- package/include/numkong/scalar/haswell.h +113 -0
- package/include/numkong/scalar/neon.h +122 -0
- package/include/numkong/scalar/neonhalf.h +70 -0
- package/include/numkong/scalar/rvv.h +211 -0
- package/include/numkong/scalar/sapphire.h +63 -0
- package/include/numkong/scalar/serial.h +332 -0
- package/include/numkong/scalar/v128relaxed.h +56 -0
- package/include/numkong/scalar.h +683 -0
- package/include/numkong/set/README.md +179 -0
- package/include/numkong/set/haswell.h +334 -0
- package/include/numkong/set/icelake.h +485 -0
- package/include/numkong/set/neon.h +364 -0
- package/include/numkong/set/rvv.h +226 -0
- package/include/numkong/set/rvvbb.h +117 -0
- package/include/numkong/set/serial.h +174 -0
- package/include/numkong/set/sve.h +185 -0
- package/include/numkong/set/v128relaxed.h +240 -0
- package/include/numkong/set.h +457 -0
- package/include/numkong/set.hpp +114 -0
- package/include/numkong/sets/README.md +149 -0
- package/include/numkong/sets/haswell.h +63 -0
- package/include/numkong/sets/icelake.h +66 -0
- package/include/numkong/sets/neon.h +61 -0
- package/include/numkong/sets/serial.h +43 -0
- package/include/numkong/sets/smebi32.h +1099 -0
- package/include/numkong/sets/v128relaxed.h +58 -0
- package/include/numkong/sets.h +339 -0
- package/include/numkong/sparse/README.md +156 -0
- package/include/numkong/sparse/icelake.h +463 -0
- package/include/numkong/sparse/neon.h +288 -0
- package/include/numkong/sparse/serial.h +117 -0
- package/include/numkong/sparse/sve2.h +507 -0
- package/include/numkong/sparse/turin.h +322 -0
- package/include/numkong/sparse.h +363 -0
- package/include/numkong/sparse.hpp +113 -0
- package/include/numkong/spatial/README.md +435 -0
- package/include/numkong/spatial/alder.h +607 -0
- package/include/numkong/spatial/genoa.h +290 -0
- package/include/numkong/spatial/haswell.h +960 -0
- package/include/numkong/spatial/icelake.h +586 -0
- package/include/numkong/spatial/neon.h +773 -0
- package/include/numkong/spatial/neonbfdot.h +165 -0
- package/include/numkong/spatial/neonhalf.h +118 -0
- package/include/numkong/spatial/neonsdot.h +261 -0
- package/include/numkong/spatial/rvv.h +984 -0
- package/include/numkong/spatial/rvvbf16.h +123 -0
- package/include/numkong/spatial/rvvhalf.h +117 -0
- package/include/numkong/spatial/sapphire.h +343 -0
- package/include/numkong/spatial/serial.h +346 -0
- package/include/numkong/spatial/sierra.h +323 -0
- package/include/numkong/spatial/skylake.h +606 -0
- package/include/numkong/spatial/sve.h +224 -0
- package/include/numkong/spatial/svebfdot.h +122 -0
- package/include/numkong/spatial/svehalf.h +109 -0
- package/include/numkong/spatial/v128relaxed.h +717 -0
- package/include/numkong/spatial.h +1425 -0
- package/include/numkong/spatial.hpp +183 -0
- package/include/numkong/spatials/README.md +580 -0
- package/include/numkong/spatials/alder.h +94 -0
- package/include/numkong/spatials/genoa.h +94 -0
- package/include/numkong/spatials/haswell.h +219 -0
- package/include/numkong/spatials/icelake.h +113 -0
- package/include/numkong/spatials/neon.h +109 -0
- package/include/numkong/spatials/neonbfdot.h +60 -0
- package/include/numkong/spatials/neonfhm.h +92 -0
- package/include/numkong/spatials/neonhalf.h +58 -0
- package/include/numkong/spatials/neonsdot.h +109 -0
- package/include/numkong/spatials/rvv.h +1960 -0
- package/include/numkong/spatials/sapphireamx.h +1149 -0
- package/include/numkong/spatials/serial.h +226 -0
- package/include/numkong/spatials/sierra.h +96 -0
- package/include/numkong/spatials/skylake.h +184 -0
- package/include/numkong/spatials/sme.h +1901 -0
- package/include/numkong/spatials/smef64.h +465 -0
- package/include/numkong/spatials/v128relaxed.h +240 -0
- package/include/numkong/spatials.h +3021 -0
- package/include/numkong/spatials.hpp +508 -0
- package/include/numkong/tensor.hpp +1592 -0
- package/include/numkong/trigonometry/README.md +184 -0
- package/include/numkong/trigonometry/haswell.h +652 -0
- package/include/numkong/trigonometry/neon.h +639 -0
- package/include/numkong/trigonometry/rvv.h +699 -0
- package/include/numkong/trigonometry/serial.h +703 -0
- package/include/numkong/trigonometry/skylake.h +721 -0
- package/include/numkong/trigonometry/v128relaxed.h +666 -0
- package/include/numkong/trigonometry.h +467 -0
- package/include/numkong/trigonometry.hpp +166 -0
- package/include/numkong/types.h +1384 -0
- package/include/numkong/types.hpp +5603 -0
- package/include/numkong/vector.hpp +698 -0
- package/javascript/README.md +246 -0
- package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
- package/javascript/dist/cjs/numkong-wasm.js +617 -0
- package/javascript/dist/cjs/numkong.d.ts +343 -0
- package/javascript/dist/cjs/numkong.js +523 -0
- package/javascript/dist/cjs/package.json +3 -0
- package/javascript/dist/cjs/types.d.ts +284 -0
- package/javascript/dist/cjs/types.js +653 -0
- package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
- package/javascript/dist/esm/numkong-wasm.js +595 -0
- package/javascript/dist/esm/numkong.d.ts +343 -0
- package/javascript/dist/esm/numkong.js +452 -0
- package/javascript/dist/esm/package.json +3 -0
- package/javascript/dist/esm/types.d.ts +284 -0
- package/javascript/dist/esm/types.js +630 -0
- package/javascript/dist-package-cjs.json +3 -0
- package/javascript/dist-package-esm.json +3 -0
- package/javascript/node-gyp-build.d.ts +1 -0
- package/javascript/numkong-wasm.ts +756 -0
- package/javascript/numkong.c +689 -0
- package/javascript/numkong.ts +575 -0
- package/javascript/tsconfig-base.json +39 -0
- package/javascript/tsconfig-cjs.json +8 -0
- package/javascript/tsconfig-esm.json +8 -0
- package/javascript/types.ts +674 -0
- package/package.json +87 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief SIMD-accelerated Similarity Measures and Distance Functions.
|
|
3
|
+
* @file include/numkong.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date March 14, 2023
|
|
6
|
+
*
|
|
7
|
+
* Umbrella header that includes all domain-specific kernel headers
|
|
8
|
+
* and the runtime capability detection infrastructure.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
#ifndef NK_NUMKONG_H
|
|
12
|
+
#define NK_NUMKONG_H
|
|
13
|
+
|
|
14
|
+
#include "numkong/capabilities.h" // Runtime detection, like `nk_capabilities_x86_`
|
|
15
|
+
#include "numkong/scalar.h" // Scalar math: sqrt, rsqrt, fma, saturating, order, like `nk_f32_sqrt`
|
|
16
|
+
#include "numkong/cast.h" // Type conversions, like `nk_cast`
|
|
17
|
+
#include "numkong/set.h" // Hamming, Jaccard, like `nk_hamming_u1`
|
|
18
|
+
#include "numkong/curved.h" // Mahalanobis, Bilinear Forms, like `nk_bilinear_f64`
|
|
19
|
+
#include "numkong/dot.h" // Inner (dot) product and its conjugate, like `nk_dot_f32`
|
|
20
|
+
#include "numkong/dots.h" // GEMM-style MxN batched dot-products, like `nk_dots_packed_size_bf16`
|
|
21
|
+
#include "numkong/each.h" // Weighted Sum, Fused-Multiply-Add, like `nk_each_scale_f64`
|
|
22
|
+
#include "numkong/geospatial.h" // Haversine and Vincenty, like `nk_haversine_f64`
|
|
23
|
+
#include "numkong/mesh.h" // RMSD, Kabsch, Umeyama, like `nk_rmsd_f64`
|
|
24
|
+
#include "numkong/probability.h" // Kullback-Leibler, Jensen-Shannon, like `nk_kld_f16`
|
|
25
|
+
#include "numkong/reduce.h" // Horizontal MinMax & Moments reductions, like `nk_reduce_moments_f64`
|
|
26
|
+
#include "numkong/sets.h" // Hamming & Jaccard for binary sets, like `nk_hammings_packed_u1`
|
|
27
|
+
#include "numkong/sparse.h" // Set Intersections and Sparse Dot Products, like `nk_sparse_intersect_u16`
|
|
28
|
+
#include "numkong/spatial.h" // Euclidean, Angular, like `nk_euclidean_f64`
|
|
29
|
+
#include "numkong/spatials.h" // Batched Angular & Euclidean distances, like `nk_angulars_packed_f32`
|
|
30
|
+
#include "numkong/maxsim.h" // MaxSim: Multi-Vector Maximum Similarity, like `nk_maxsim_packed_f32`
|
|
31
|
+
#include "numkong/trigonometry.h" // Sin, Cos, Atan, like `nk_each_sin_f64`
|
|
32
|
+
|
|
33
|
+
#if defined(__cplusplus)
|
|
34
|
+
extern "C" {
|
|
35
|
+
#endif
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* @brief Returns the output dtype for a given metric kind and input dtype.
|
|
39
|
+
*/
|
|
40
|
+
NK_PUBLIC nk_dtype_t nk_kernel_output_dtype(nk_kernel_kind_t kind, nk_dtype_t input) {
|
|
41
|
+
switch (kind) {
|
|
42
|
+
case nk_kernel_dot_k:
|
|
43
|
+
case nk_kernel_vdot_k:
|
|
44
|
+
case nk_kernel_dots_packed_k:
|
|
45
|
+
case nk_kernel_dots_symmetric_k: return nk_dot_output_dtype(input);
|
|
46
|
+
case nk_kernel_angular_k:
|
|
47
|
+
case nk_kernel_angulars_packed_k:
|
|
48
|
+
case nk_kernel_angulars_symmetric_k: return nk_angular_output_dtype(input);
|
|
49
|
+
case nk_kernel_euclidean_k:
|
|
50
|
+
case nk_kernel_euclideans_packed_k:
|
|
51
|
+
case nk_kernel_euclideans_symmetric_k: return nk_euclidean_output_dtype(input);
|
|
52
|
+
case nk_kernel_sqeuclidean_k: return nk_sqeuclidean_output_dtype(input);
|
|
53
|
+
case nk_kernel_bilinear_k: return nk_bilinear_output_dtype(input);
|
|
54
|
+
case nk_kernel_mahalanobis_k: return nk_mahalanobis_output_dtype(input);
|
|
55
|
+
case nk_kernel_hamming_k:
|
|
56
|
+
case nk_kernel_hammings_packed_k:
|
|
57
|
+
case nk_kernel_hammings_symmetric_k: return nk_hamming_output_dtype(input);
|
|
58
|
+
case nk_kernel_jaccard_k:
|
|
59
|
+
case nk_kernel_jaccards_packed_k:
|
|
60
|
+
case nk_kernel_jaccards_symmetric_k: return nk_jaccard_output_dtype(input);
|
|
61
|
+
case nk_kernel_haversine_k: return nk_haversine_output_dtype(input);
|
|
62
|
+
case nk_kernel_vincenty_k: return nk_vincenty_output_dtype(input);
|
|
63
|
+
case nk_kernel_kld_k:
|
|
64
|
+
case nk_kernel_jsd_k: return nk_probability_output_dtype(input);
|
|
65
|
+
case nk_kernel_rmsd_k: return nk_rmsd_output_dtype(input);
|
|
66
|
+
case nk_kernel_kabsch_k: return nk_kabsch_output_dtype(input);
|
|
67
|
+
case nk_kernel_umeyama_k: return nk_umeyama_output_dtype(input);
|
|
68
|
+
case nk_kernel_sparse_dot_k: return nk_sparse_dot_output_dtype(input);
|
|
69
|
+
case nk_kernel_maxsim_packed_k: return nk_maxsim_output_dtype(input);
|
|
70
|
+
default: return nk_dtype_unknown_k;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
#if defined(__cplusplus)
|
|
75
|
+
} // extern "C"
|
|
76
|
+
#endif
|
|
77
|
+
|
|
78
|
+
#endif // NK_NUMKONG_H
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief NumKong SDK for C++23 and newer.
|
|
3
|
+
* @file include/numkong.hpp
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date January 7, 2026
|
|
6
|
+
*
|
|
7
|
+
* C doesn't have a strong type system or composable infrastructure for complex kernels
|
|
8
|
+
* and datastructures like the C++ templates and Rust traits. Unlike C++, C also lacks
|
|
9
|
+
* function overloading, namespaces and templates, thus requiring verbose signatures and
|
|
10
|
+
* naming conventions, like:
|
|
11
|
+
*
|
|
12
|
+
* @code{c}
|
|
13
|
+
* void nk_dot_f64(nk_f64_t const*, nk_f64_t const*, nk_size_t, nk_f64_t *);
|
|
14
|
+
* void nk_dot_f32(nk_f32_t const*, nk_f32_t const*, nk_size_t, nk_f64_t *);
|
|
15
|
+
* void nk_dot_f16(nk_f16_t const*, nk_f16_t const*, nk_size_t, nk_f32_t *);
|
|
16
|
+
* void nk_dot_bf16(nk_bf16_t const*, nk_bf16_t const*, nk_size_t, nk_f32_t *);
|
|
17
|
+
* void nk_dot_e4m3(nk_e4m3_t const*, nk_e4m3_t const*, nk_size_t, nk_f32_t *);
|
|
18
|
+
* void nk_dot_e5m2(nk_e5m2_t const*, nk_e5m2_t const*, nk_size_t, nk_f32_t *);
|
|
19
|
+
* @endcode
|
|
20
|
+
*
|
|
21
|
+
* As opposed to C++:
|
|
22
|
+
*
|
|
23
|
+
* @code{cpp}
|
|
24
|
+
* namespace ashvardanian::numkong {
|
|
25
|
+
* template <typename input_type_, typename result_type_>
|
|
26
|
+
* void dot(input_type_ const*, input_type_ const*, size_t, result_type_ *);
|
|
27
|
+
* }
|
|
28
|
+
*
|
|
29
|
+
* In HPC implementations, where pretty much every kernel and every datatype uses different
|
|
30
|
+
* Assembly instructions on different CPU generations/models, those higher-level abstractions
|
|
31
|
+
* aren't always productive for the primary implementation, but they can still be handy as
|
|
32
|
+
* a higher-level API for NumKong. They are also used for algorithm verification in no-SIMD
|
|
33
|
+
* mode, upcasting to much larger number types like `f118_t`.
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
#ifndef NK_NUMKONG_HPP
|
|
37
|
+
#define NK_NUMKONG_HPP
|
|
38
|
+
|
|
39
|
+
#include "numkong/random.hpp"
|
|
40
|
+
#include "numkong/dot.hpp"
|
|
41
|
+
#include "numkong/spatial.hpp"
|
|
42
|
+
#include "numkong/spatials.hpp"
|
|
43
|
+
#include "numkong/probability.hpp"
|
|
44
|
+
#include "numkong/each.hpp"
|
|
45
|
+
#include "numkong/reduce.hpp"
|
|
46
|
+
#include "numkong/curved.hpp"
|
|
47
|
+
#include "numkong/geospatial.hpp"
|
|
48
|
+
#include "numkong/sparse.hpp"
|
|
49
|
+
#include "numkong/set.hpp"
|
|
50
|
+
#include "numkong/mesh.hpp"
|
|
51
|
+
#include "numkong/trigonometry.hpp"
|
|
52
|
+
#include "numkong/dots.hpp"
|
|
53
|
+
#include "numkong/matrix.hpp"
|
|
54
|
+
#include "numkong/maxsim.hpp"
|
|
55
|
+
#include "numkong/tensor.hpp"
|
|
56
|
+
|
|
57
|
+
#endif // NK_NUMKONG_HPP
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Divergence Measures for Probability Distributions in NumKong
|
|
2
|
+
|
|
3
|
+
NumKong implements divergence functions between discrete probability distributions: Kullback-Leibler divergence measures the information lost when one distribution approximates another, while Jensen-Shannon distance provides a symmetric and bounded alternative.
|
|
4
|
+
These are used in variational inference, topic modeling, and distribution comparison tasks.
|
|
5
|
+
|
|
6
|
+
Kullback-Leibler divergence from $P$ to $Q$:
|
|
7
|
+
|
|
8
|
+
```math
|
|
9
|
+
\text{KLD}(P \| Q) = \sum_{i=0}^{n-1} P(i) \log_2 \frac{P(i)}{Q(i)}
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
Jensen-Shannon distance is the square root of the symmetrized KLD through a mixture:
|
|
13
|
+
|
|
14
|
+
$$\text{JSD}(P, Q) = \frac{1}{2} \text{KLD}(P \| M) + \frac{1}{2} \text{KLD}(Q \| M)$$
|
|
15
|
+
|
|
16
|
+
where $M = \frac{P + Q}{2}$, yielding the distance:
|
|
17
|
+
|
|
18
|
+
$$d_{JS}(P, Q) = \sqrt{\text{JSD}(P, Q)}$$
|
|
19
|
+
|
|
20
|
+
Unlike the raw divergence, $d_{JS}$ is a true metric satisfying the triangle inequality.
|
|
21
|
+
|
|
22
|
+
Reformulating as Python pseudocode:
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
import numpy as np
|
|
26
|
+
|
|
27
|
+
def kld(p: np.ndarray, q: np.ndarray) -> float:
|
|
28
|
+
mask = p > 0
|
|
29
|
+
return np.sum(p[mask] * np.log2(p[mask] / q[mask]))
|
|
30
|
+
|
|
31
|
+
def jsd(p: np.ndarray, q: np.ndarray) -> float:
|
|
32
|
+
m = (p + q) / 2
|
|
33
|
+
return np.sqrt((kld(p, m) + kld(q, m)) / 2)
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Use Cases
|
|
37
|
+
|
|
38
|
+
__Kullback-Leibler divergence__ is the workhorse of variational inference (ELBO objective), knowledge distillation between neural networks, information gain in decision trees, and measuring fit between a model and observed data.
|
|
39
|
+
|
|
40
|
+
__Jensen-Shannon distance__ sees primary use in microbiome community comparison (enterotyping), where its metric property enables clustering with standard algorithms. It also appears in distribution drift detection, topic model evaluation, and as the theoretical foundation of the original GAN objective — though in practice GAN training uses proxy losses rather than computing JSD directly.
|
|
41
|
+
|
|
42
|
+
## Input & Output Types
|
|
43
|
+
|
|
44
|
+
| Input Type | Output Type | Description |
|
|
45
|
+
| ---------- | ----------- | ---------------------------------------------- |
|
|
46
|
+
| `f64` | `f64` | 64-bit IEEE 754 double precision |
|
|
47
|
+
| `f32` | `f32` | 32-bit IEEE 754 single precision |
|
|
48
|
+
| `f16` | `f32` | 16-bit IEEE 754 half precision, widened output |
|
|
49
|
+
| `bf16` | `f32` | 16-bit brain float, widened output |
|
|
50
|
+
|
|
51
|
+
## Optimizations
|
|
52
|
+
|
|
53
|
+
### SIMD Log2 Approximation
|
|
54
|
+
|
|
55
|
+
`nk_kld_f32_skylake`, `nk_jsd_f32_skylake` use `VGETEXP` and `VGETMANT` to decompose floating-point values into exponent and mantissa components, then apply a polynomial approximation to the mantissa to compute $\log_2$.
|
|
56
|
+
The pipeline on Skylake is:
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
exponent = VGETEXPPS(x)
|
|
60
|
+
mantissa = VGETMANTPS(x, normalize_to_[1,2)) - 1
|
|
61
|
+
log2(x) ≈ exponent + polynomial(mantissa)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
`VGETEXP` extracts the unbiased exponent as a float, while `VGETMANT` normalizes the mantissa to $[1, 2)$.
|
|
65
|
+
A degree-4 minimax polynomial over the normalized mantissa completes the approximation.
|
|
66
|
+
These instructions handle subnormals correctly without extra integer bit manipulation.
|
|
67
|
+
|
|
68
|
+
`nk_kld_f32_neon`, `nk_jsd_f32_neon`, `nk_kld_f16_haswell`, `nk_jsd_f16_haswell` use integer bit extraction instead:
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
exponent = (reinterpret_as_int(x) >> 23) - 127
|
|
72
|
+
mantissa = reinterpret_as_float((reinterpret_as_int(x) & 0x7FFFFF) | 0x3F800000) - 1
|
|
73
|
+
log2(x) ≈ exponent + c₁·m + c₂·m² + c₃·m³ + c₄·m⁴ + c₅·m⁵
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
This approach reinterprets the float as an integer, shifts out the mantissa bits to obtain the exponent, then masks and recombines to produce a normalized mantissa in $[1, 2)$.
|
|
77
|
+
It works on any ISA with integer-float reinterpretation and avoids the need for specialized exponent/mantissa instructions.
|
|
78
|
+
|
|
79
|
+
### Kahan Compensated Summation for Float64
|
|
80
|
+
|
|
81
|
+
`nk_kld_f64_haswell`, `nk_jsd_f64_haswell` use Kahan compensated summation to maintain a running correction term alongside the accumulator.
|
|
82
|
+
The Kahan update for each divergence term is:
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
compensated_term = divergence_term - correction
|
|
86
|
+
tentative_sum = accumulator + compensated_term
|
|
87
|
+
correction = (tentative_sum - accumulator) - compensated_term
|
|
88
|
+
accumulator = tentative_sum
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
After each $P(i) \log_2(P(i) / Q(i))$ term is computed, `correction` captures the low-order bits lost in the addition, and the next iteration subtracts this correction from the new term before adding it to the accumulator.
|
|
92
|
+
This keeps the accumulated error bounded by $O(1)$ ULP regardless of vector length, rather than the $O(n)$ ULP growth of naive summation.
|
|
93
|
+
|
|
94
|
+
## Performance
|
|
95
|
+
|
|
96
|
+
The following performance tables are produced by manually re-running the `nk_test` and `nk_bench` internal tools included with the package, which measure both accuracy and throughput at different input shapes.
|
|
97
|
+
The input size is controlled by the `NK_DENSE_DIMENSIONS` environment variable and set to 256, 1024, and 4096 elements.
|
|
98
|
+
The throughput is measured in GB/s as the number of input bytes per second.
|
|
99
|
+
The published tables below summarize mean ULP (units in last place) across all test pairs — the average number of representable floating-point values between the computed result and the exact answer. The current `nk_test` family also reports max/mean absolute and relative divergence error for detailed inspection.
|
|
100
|
+
Each kernel runs for at least 20 seconds per configuration.
|
|
101
|
+
Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
|
|
102
|
+
Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
|
|
103
|
+
|
|
104
|
+
### Intel Sapphire Rapids
|
|
105
|
+
|
|
106
|
+
#### Native
|
|
107
|
+
|
|
108
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
109
|
+
| :------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
110
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
111
|
+
| `nk_kld_f64_serial` | 0.693 gb/s, 5.65K ulp | 0.699 gb/s, 24.5K ulp | 0.753 gb/s, 98.9K ulp |
|
|
112
|
+
| `nk_jsd_f64_serial` | 0.324 gb/s, 0.5 ulp | 0.349 gb/s, 0.3 ulp | 0.391 gb/s, 0.6 ulp |
|
|
113
|
+
| `nk_kld_f64_haswell` | 5.34 gb/s, 5.64K ulp | 5.59 gb/s, 24.6K ulp | 5.76 gb/s, 99.1K ulp |
|
|
114
|
+
| `nk_jsd_f64_haswell` | 3.03 gb/s, 1.7 ulp | 3.05 gb/s, 1.4 ulp | 3.25 gb/s, 1.2 ulp |
|
|
115
|
+
| `nk_kld_f64_skylake` | 7.01 gb/s, 5.64K ulp | 6.85 gb/s, 24.4K ulp | 6.86 gb/s, 98.9K ulp |
|
|
116
|
+
| `nk_jsd_f64_skylake` | 3.66 gb/s, 1.6 ulp | 3.85 gb/s, 1.4 ulp | 4.30 gb/s, 1.2 ulp |
|
|
117
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
118
|
+
| `nk_kld_f32_serial` | 0.528 gb/s, 1.04K ulp | 0.516 gb/s, 4.54K ulp | 0.527 gb/s, 18.2K ulp |
|
|
119
|
+
| `nk_jsd_f32_serial` | 0.273 gb/s, 0.4 ulp | 0.272 gb/s, 0.4 ulp | 0.268 gb/s, 4.5 ulp |
|
|
120
|
+
| `nk_kld_f32_skylake` | 11.8 gb/s, 1.04K ulp | 10.4 gb/s, 4.55K ulp | 8.73 gb/s, 18.3K ulp |
|
|
121
|
+
| `nk_jsd_f32_skylake` | 6.25 gb/s, 6.6 ulp | 5.96 gb/s, 7.0 ulp | 6.05 gb/s, 11.1 ulp |
|
|
122
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
123
|
+
| `nk_kld_bf16_serial` | 0.138 gb/s, 1.04K ulp | 0.142 gb/s, 4.53K ulp | 0.136 gb/s, 18.3K ulp |
|
|
124
|
+
| `nk_jsd_bf16_serial` | 0.0857 gb/s, 1.5 ulp | 0.0842 gb/s, 3.4 ulp | 0.0841 gb/s, 10.7 ulp |
|
|
125
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
126
|
+
| `nk_kld_f16_serial` | 0.166 gb/s, 1.05K ulp | 0.163 gb/s, 4.53K ulp | 0.163 gb/s, 18.2K ulp |
|
|
127
|
+
| `nk_jsd_f16_serial` | 0.151 gb/s, 1.5 ulp | 0.148 gb/s, 2.3 ulp | 0.152 gb/s, 9.4 ulp |
|
|
128
|
+
| `nk_kld_f16_haswell` | 6.99 gb/s, 1.05K ulp | 6.09 gb/s, 4.54K ulp | 6.97 gb/s, 18.2K ulp |
|
|
129
|
+
| `nk_jsd_f16_haswell` | 2.81 gb/s, 6.4 ulp | 2.79 gb/s, 6.8 ulp | 2.72 gb/s, 11.5 ulp |
|
|
130
|
+
| `nk_kld_f16_skylake` | 6.16 gb/s, 1.05K ulp | 5.65 gb/s, 4.54K ulp | 5.78 gb/s, 18.3K ulp |
|
|
131
|
+
| `nk_jsd_f16_skylake` | 3.51 gb/s, 6.5 ulp | 3.22 gb/s, 6.9 ulp | 3.35 gb/s, 11.4 ulp |
|
|
132
|
+
|
|
133
|
+
#### WASM
|
|
134
|
+
|
|
135
|
+
Measured with Wasmtime v42 (Cranelift backend).
|
|
136
|
+
|
|
137
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
138
|
+
| :------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
139
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
140
|
+
| `nk_kld_f64_serial` | 0.239 gb/s, 5.64K ulp | 0.223 gb/s, 24.6K ulp | 0.13 gb/s, 99.6K ulp |
|
|
141
|
+
| `nk_jsd_f64_serial` | 0.315 gb/s, 0.5 ulp | 0.402 gb/s, 0.3 ulp | 0.29 gb/s, 0.5 ulp |
|
|
142
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
143
|
+
| `nk_kld_f32_serial` | 0.302 gb/s, 1.04K ulp | 0.342 gb/s, 4.52K ulp | 0.277 gb/s, 18.3K ulp |
|
|
144
|
+
| `nk_jsd_f32_serial` | 0.152 gb/s, 0.4 ulp | 0.164 gb/s, 0.4 ulp | 0.160 gb/s, 4.7 ulp |
|
|
145
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
146
|
+
| `nk_kld_bf16_serial` | 0.139 gb/s, 1.05K ulp | 0.143 gb/s, 4.53K ulp | 0.150 gb/s, 18.3K ulp |
|
|
147
|
+
| `nk_jsd_bf16_serial` | 0.0867 gb/s, 1.5 ulp | 0.0775 gb/s, 3.1 ulp | 0.0679 gb/s, 9.8 ulp |
|
|
148
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
149
|
+
| `nk_kld_f16_serial` | 0.118 gb/s, 1.04K ulp | 0.127 gb/s, 4.53K ulp | 0.111 gb/s, 18.3K ulp |
|
|
150
|
+
| `nk_jsd_f16_serial` | 0.0748 gb/s, 1.4 ulp | 0.0681 gb/s, 2.6 ulp | 0.0857 gb/s, 9.7 ulp |
|
|
151
|
+
|
|
152
|
+
### Apple M4
|
|
153
|
+
|
|
154
|
+
#### Native
|
|
155
|
+
|
|
156
|
+
| Kernel | 256 | 1024 | 4096 |
|
|
157
|
+
| :-------------------- | -----------------------: | -----------------------: | -----------------------: |
|
|
158
|
+
| __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
159
|
+
| `nk_kld_f64_serial` | 2.21 gb/s, 5.6K ulp | 2.22 gb/s, 25K ulp | 2.18 gb/s, 99K ulp |
|
|
160
|
+
| `nk_jsd_f64_serial` | 1.40 gb/s, 0.4 ulp | 1.45 gb/s, 0.4 ulp | 1.45 gb/s, 0.5 ulp |
|
|
161
|
+
| __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
162
|
+
| `nk_kld_f32_serial` | 6.29 gb/s, 1.0K ulp | 6.35 gb/s, 4.5K ulp | 6.22 gb/s, 18K ulp |
|
|
163
|
+
| `nk_jsd_f32_serial` | 1.21 gb/s, 0.4 ulp | 1.20 gb/s, 0.4 ulp | 1.20 gb/s, 4.6 ulp |
|
|
164
|
+
| `nk_kld_f32_neon` | 14.5 gb/s, 1.0K ulp | 14.4 gb/s, 4.5K ulp | 12.8 gb/s, 18K ulp |
|
|
165
|
+
| `nk_jsd_f32_neon` | 6.81 gb/s, 15 ulp | 7.04 gb/s, 14 ulp | 6.78 gb/s, 9.9 ulp |
|
|
166
|
+
| __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
167
|
+
| `nk_kld_bf16_serial` | 3.16 gb/s, 1.0K ulp | 2.96 gb/s, 4.5K ulp | 3.16 gb/s, 18K ulp |
|
|
168
|
+
| `nk_jsd_bf16_serial` | 0.611 gb/s, 1.4 ulp | 0.595 gb/s, 2.9 ulp | 0.613 gb/s, 9.7 ulp |
|
|
169
|
+
| __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
|
|
170
|
+
| `nk_kld_f16_serial` | 3.15 gb/s, 1.0K ulp | 3.14 gb/s, 4.5K ulp | 2.81 gb/s, 18K ulp |
|
|
171
|
+
| `nk_jsd_f16_serial` | 0.610 gb/s, 1.4 ulp | 0.611 gb/s, 2.7 ulp | 0.602 gb/s, 8.7 ulp |
|
|
172
|
+
| `nk_kld_f16_neonhalf` | 6.78 gb/s, 1.0K ulp | 6.72 gb/s, 4.5K ulp | 6.09 gb/s, 18K ulp |
|
|
173
|
+
| `nk_jsd_f16_neonhalf` | 3.42 gb/s, 15 ulp | 3.40 gb/s, 14 ulp | 3.14 gb/s, 9.9 ulp |
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @brief Haswell-accelerated Probability Distribution Similarity Measures.
|
|
3
|
+
* @file include/numkong/probability/haswell.h
|
|
4
|
+
* @author Ash Vardanian
|
|
5
|
+
* @date February 6, 2026
|
|
6
|
+
*
|
|
7
|
+
* @sa include/numkong/probability.h
|
|
8
|
+
*/
|
|
9
|
+
#ifndef NK_PROBABILITY_HASWELL_H
|
|
10
|
+
#define NK_PROBABILITY_HASWELL_H
|
|
11
|
+
|
|
12
|
+
#if NK_TARGET_X86_
|
|
13
|
+
#if NK_TARGET_HASWELL
|
|
14
|
+
|
|
15
|
+
#include "numkong/types.h"
|
|
16
|
+
#include "numkong/reduce/haswell.h" // `nk_reduce_add_f32x8_haswell_`, `nk_reduce_add_f64x4_haswell_`
|
|
17
|
+
#include "numkong/spatial/haswell.h" // `nk_f32_sqrt_haswell`, `nk_f64_sqrt_haswell`
|
|
18
|
+
#include "numkong/cast/haswell.h" // `nk_partial_load_f16x8_to_f32x8_haswell_`, `nk_partial_load_b64x4_haswell_`
|
|
19
|
+
|
|
20
|
+
#if defined(__cplusplus)
|
|
21
|
+
extern "C" {
|
|
22
|
+
#endif
|
|
23
|
+
|
|
24
|
+
#if defined(__clang__)
|
|
25
|
+
#pragma clang attribute push(__attribute__((target("avx2,f16c,fma,bmi,bmi2"))), apply_to = function)
|
|
26
|
+
#elif defined(__GNUC__)
|
|
27
|
+
#pragma GCC push_options
|
|
28
|
+
#pragma GCC target("avx2", "f16c", "fma", "bmi", "bmi2")
|
|
29
|
+
#endif
|
|
30
|
+
|
|
31
|
+
NK_INTERNAL __m256 nk_log2_f32x8_haswell_(__m256 x) {
    // Split x into exponent and mantissa: x = 2^e * m with m ∈ [1, 2).
    __m256i raw_i32x8 = _mm256_castps_si256(x);
    __m256i unbiased_i32x8 = _mm256_srli_epi32(_mm256_and_si256(raw_i32x8, _mm256_set1_epi32(0x7F800000)), 23);
    unbiased_i32x8 = _mm256_sub_epi32(unbiased_i32x8, _mm256_set1_epi32(127)); // drop the IEEE-754 bias
    __m256 exp_f32x8 = _mm256_cvtepi32_ps(unbiased_i32x8);

    // Rebuild the mantissa with a forced unit exponent so it lands in [1, 2)
    __m256 m_f32x8 = _mm256_castsi256_ps(
        _mm256_or_si256(_mm256_and_si256(raw_i32x8, _mm256_set1_epi32(0x007FFFFF)), _mm256_set1_epi32(0x3F800000)));

    // atanh-style series with s = (m-1)/(m+1), s ∈ [0, 1/3] for m ∈ [1, 2):
    // log2(m) = (2/ln2) × s × (1 + s²/3 + s⁴/5 + s⁶/7 + s⁸/9)
    __m256 ones_f32x8 = _mm256_set1_ps(1.0f);
    __m256 s_f32x8 = _mm256_div_ps(_mm256_sub_ps(m_f32x8, ones_f32x8), _mm256_add_ps(m_f32x8, ones_f32x8));
    __m256 s_sq_f32x8 = _mm256_mul_ps(s_f32x8, s_f32x8);
    __m256 poly_f32x8 = _mm256_set1_ps(0.111111111f);                                   // 1/9
    poly_f32x8 = _mm256_fmadd_ps(poly_f32x8, s_sq_f32x8, _mm256_set1_ps(0.142857143f)); // 1/7
    poly_f32x8 = _mm256_fmadd_ps(poly_f32x8, s_sq_f32x8, _mm256_set1_ps(0.2f));         // 1/5
    poly_f32x8 = _mm256_fmadd_ps(poly_f32x8, s_sq_f32x8, _mm256_set1_ps(0.333333333f)); // 1/3
    poly_f32x8 = _mm256_fmadd_ps(poly_f32x8, s_sq_f32x8, ones_f32x8);                   // 1
    // 2.885390081777927 = 2 / ln(2)
    __m256 log2_m_f32x8 = _mm256_mul_ps(_mm256_set1_ps(2.885390081777927f), _mm256_mul_ps(s_f32x8, poly_f32x8));
    return _mm256_add_ps(log2_m_f32x8, exp_f32x8);
}
|
|
55
|
+
|
|
56
|
+
NK_INTERNAL __m256d nk_log2_f64x4_haswell_(__m256d x) {
    // Recover the unbiased exponent from the raw bit pattern: (bits >> 52) - 1023.
    __m256i raw_i64x4 = _mm256_castpd_si256(x);
    __m256i biased_i64x4 = _mm256_srli_epi64(raw_i64x4, 52);
    // AVX2 lacks a 256-bit i64→f64 conversion, so pull each lane out and convert in scalar.
    nk_f64_t e0 = (nk_f64_t)((nk_i64_t)_mm256_extract_epi64(biased_i64x4, 0) - 1023);
    nk_f64_t e1 = (nk_f64_t)((nk_i64_t)_mm256_extract_epi64(biased_i64x4, 1) - 1023);
    nk_f64_t e2 = (nk_f64_t)((nk_i64_t)_mm256_extract_epi64(biased_i64x4, 2) - 1023);
    nk_f64_t e3 = (nk_f64_t)((nk_i64_t)_mm256_extract_epi64(biased_i64x4, 3) - 1023);
    __m256d exp_f64x4 = _mm256_set_pd(e3, e2, e1, e0);

    // Isolate the mantissa and stamp in an exponent of 1023, i.e. scale it into [1, 2).
    __m256i fraction_mask_i64x4 = _mm256_set1_epi64x(0x000FFFFFFFFFFFFFLL);
    __m256i unit_exponent_i64x4 = _mm256_set1_epi64x(0x3FF0000000000000LL);
    __m256d m_f64x4 = _mm256_castsi256_pd(
        _mm256_or_si256(_mm256_and_si256(raw_i64x4, fraction_mask_i64x4), unit_exponent_i64x4));

    // atanh-style series: s = (m-1)/(m+1), log2(m) = 2·s·P(s²) · log2(e).
    __m256d ones_f64x4 = _mm256_set1_pd(1.0);
    __m256d s_f64x4 = _mm256_div_pd(_mm256_sub_pd(m_f64x4, ones_f64x4), _mm256_add_pd(m_f64x4, ones_f64x4));
    __m256d s_sq_f64x4 = _mm256_mul_pd(s_f64x4, s_f64x4);

    // 14-term Horner evaluation: P(s²) = 1 + s²/3 + s⁴/5 + ... + s²⁶/27.
    __m256d p_f64x4 = _mm256_set1_pd(1.0 / 27.0);
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 25.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 23.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 21.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 19.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 17.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 15.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 13.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 11.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 9.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 7.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 5.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0 / 3.0));
    p_f64x4 = _mm256_fmadd_pd(s_sq_f64x4, p_f64x4, _mm256_set1_pd(1.0));

    __m256d twos_f64x4 = _mm256_set1_pd(2.0);
    __m256d ln_m_f64x4 = _mm256_mul_pd(_mm256_mul_pd(twos_f64x4, s_f64x4), p_f64x4);
    __m256d log2e_f64x4 = _mm256_set1_pd(1.4426950408889634); // log2(e)
    __m256d log2_m_f64x4 = _mm256_mul_pd(ln_m_f64x4, log2e_f64x4);

    return _mm256_add_pd(exp_f64x4, log2_m_f64x4);
}
|
|
100
|
+
|
|
101
|
+
NK_PUBLIC void nk_kld_f16_haswell(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
    // KL(P‖Q) = Σ pᵢ·ln(pᵢ/qᵢ), accumulated in base-2 and rescaled by ln(2) at the end.
    __m256 acc_f32x8 = _mm256_setzero_ps();
    __m256 eps_f32x8 = _mm256_set1_ps(NK_F32_DIVISION_EPSILON);
    __m256 p_f32x8, q_f32x8;

    do {
        if (n < 8) {
            // Tail of fewer than 8 entries: partial load upcasting `f16` to `f32`,
            // remaining lanes presumably zeroed by the helper — contributing eps/eps → log2(1) = 0.
            nk_b256_vec_t p_vec, q_vec;
            nk_partial_load_f16x8_to_f32x8_haswell_(a, &p_vec, n);
            nk_partial_load_f16x8_to_f32x8_haswell_(b, &q_vec, n);
            p_f32x8 = p_vec.ymm_ps;
            q_f32x8 = q_vec.ymm_ps;
            n = 0;
        }
        else {
            p_f32x8 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const *)a));
            q_f32x8 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const *)b));
            n -= 8, a += 8, b += 8;
        }
        // Epsilons on both sides keep zero probabilities from producing infinities.
        __m256 ratio_f32x8 = _mm256_div_ps(_mm256_add_ps(p_f32x8, eps_f32x8), _mm256_add_ps(q_f32x8, eps_f32x8));
        __m256 log_ratio_f32x8 = nk_log2_f32x8_haswell_(ratio_f32x8);
        __m256 term_f32x8 = _mm256_mul_ps(p_f32x8, log_ratio_f32x8);
        acc_f32x8 = _mm256_add_ps(acc_f32x8, term_f32x8);
    } while (n);

    // Horizontal reduction, then convert base-2 logs to natural: ln(x) = log2(x) · ln(2).
    nk_f32_t ln2 = 0.6931471805599453f;
    nk_f32_t total = nk_reduce_add_f32x8_haswell_(acc_f32x8);
    total *= ln2;
    *result = total;
}
|
|
132
|
+
|
|
133
|
+
NK_PUBLIC void nk_jsd_f16_haswell(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
    // Jensen-Shannon distance: sqrt(½·KL(P‖M) + ½·KL(Q‖M)) with mixture M = (P+Q)/2.
    __m256 eps_f32x8 = _mm256_set1_ps(NK_F32_DIVISION_EPSILON);
    __m256 acc_f32x8 = _mm256_setzero_ps();
    __m256 p_f32x8, q_f32x8;

    do {
        if (n < 8) {
            // Tail of fewer than 8 entries: partial load upcasting `f16` to `f32`.
            nk_b256_vec_t p_vec, q_vec;
            nk_partial_load_f16x8_to_f32x8_haswell_(a, &p_vec, n);
            nk_partial_load_f16x8_to_f32x8_haswell_(b, &q_vec, n);
            p_f32x8 = p_vec.ymm_ps;
            q_f32x8 = q_vec.ymm_ps;
            n = 0;
        }
        else {
            p_f32x8 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const *)a));
            q_f32x8 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const *)b));
            n -= 8, a += 8, b += 8;
        }
        __m256 mid_f32x8 = _mm256_mul_ps(_mm256_add_ps(p_f32x8, q_f32x8), _mm256_set1_ps(0.5f)); // M = (P + Q) / 2
        // Epsilons keep zero probabilities from producing infinities.
        __m256 mid_eps_f32x8 = _mm256_add_ps(mid_f32x8, eps_f32x8);
        __m256 ratio_p_f32x8 = _mm256_div_ps(_mm256_add_ps(p_f32x8, eps_f32x8), mid_eps_f32x8);
        __m256 ratio_q_f32x8 = _mm256_div_ps(_mm256_add_ps(q_f32x8, eps_f32x8), mid_eps_f32x8);
        __m256 log_ratio_p_f32x8 = nk_log2_f32x8_haswell_(ratio_p_f32x8);
        __m256 log_ratio_q_f32x8 = nk_log2_f32x8_haswell_(ratio_q_f32x8);
        __m256 term_p_f32x8 = _mm256_mul_ps(p_f32x8, log_ratio_p_f32x8);
        __m256 term_q_f32x8 = _mm256_mul_ps(q_f32x8, log_ratio_q_f32x8);
        acc_f32x8 = _mm256_add_ps(acc_f32x8, term_p_f32x8);
        acc_f32x8 = _mm256_add_ps(acc_f32x8, term_q_f32x8);
    } while (n);

    // Reduce, convert base-2 logs to natural and halve, then take the square
    // root to obtain the metric (distance) form of the divergence.
    nk_f32_t ln2 = 0.6931471805599453f;
    nk_f32_t divergence = nk_reduce_add_f32x8_haswell_(acc_f32x8);
    divergence *= ln2 / 2;
    *result = divergence > 0 ? nk_f32_sqrt_haswell(divergence) : 0;
}
|
|
171
|
+
|
|
172
|
+
/**
 *  @brief  Kullback-Leibler divergence KL(P‖Q) = Σ pᵢ·ln(pᵢ/qᵢ) over `f64`
 *          distributions, 4 lanes at a time with AVX2 + FMA.
 *
 *  Accumulates in base-2 (cheaper log) and rescales by ln(2) once at the end.
 *  Uses Kahan compensated summation — the statement order of the compensation
 *  updates is load-bearing and must not be reassociated.
 */
NK_PUBLIC void nk_kld_f64_haswell(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    nk_f64_t epsilon = NK_F64_DIVISION_EPSILON; // guards against division by zero probabilities
    __m256d epsilon_f64x4 = _mm256_set1_pd(epsilon);
    __m256d sum_f64x4 = _mm256_setzero_pd();
    __m256d compensation_f64x4 = _mm256_setzero_pd(); // Kahan running error term
    __m256d a_f64x4, b_f64x4;

nk_kld_f64_haswell_cycle:
    if (n < 4) {
        // Tail of fewer than 4 entries: masked partial load; padding lanes
        // presumably read as zero, contributing eps/eps → log2(1) = 0.
        nk_b256_vec_t a_vec, b_vec;
        nk_partial_load_b64x4_haswell_(a, &a_vec, n);
        nk_partial_load_b64x4_haswell_(b, &b_vec, n);
        a_f64x4 = a_vec.ymm_pd;
        b_f64x4 = b_vec.ymm_pd;
        n = 0;
    }
    else {
        a_f64x4 = _mm256_loadu_pd(a);
        b_f64x4 = _mm256_loadu_pd(b);
        n -= 4, a += 4, b += 4;
    }
    __m256d ratio_f64x4 = _mm256_div_pd(_mm256_add_pd(a_f64x4, epsilon_f64x4), _mm256_add_pd(b_f64x4, epsilon_f64x4));
    __m256d log_ratio_f64x4 = nk_log2_f64x4_haswell_(ratio_f64x4);
    __m256d contribution_f64x4 = _mm256_mul_pd(a_f64x4, log_ratio_f64x4);
    // Kahan compensated summation
    __m256d compensated_f64x4 = _mm256_sub_pd(contribution_f64x4, compensation_f64x4);
    __m256d tentative_f64x4 = _mm256_add_pd(sum_f64x4, compensated_f64x4);
    // (tentative - sum) recovers the part of `compensated` that was actually
    // absorbed; subtracting `compensated` leaves the rounding error for next round.
    compensation_f64x4 = _mm256_sub_pd(_mm256_sub_pd(tentative_f64x4, sum_f64x4), compensated_f64x4);
    sum_f64x4 = tentative_f64x4;
    if (n) goto nk_kld_f64_haswell_cycle;

    // Convert base-2 logs to natural: ln(x) = log2(x) · ln(2).
    nk_f64_t log2_normalizer = 0.6931471805599453;
    *result = nk_reduce_add_f64x4_haswell_(sum_f64x4) * log2_normalizer;
}
|
|
206
|
+
|
|
207
|
+
/**
 *  @brief  Jensen-Shannon distance sqrt(½·KL(P‖M) + ½·KL(Q‖M)), M = (P+Q)/2,
 *          over `f64` distributions, 4 lanes at a time with AVX2 + FMA.
 *
 *  Both KL contributions are folded into one accumulator with Kahan compensated
 *  summation; the single `compensation` register carries over between the two
 *  sequential updates, so their order must not be changed.
 */
NK_PUBLIC void nk_jsd_f64_haswell(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
    nk_f64_t epsilon = NK_F64_DIVISION_EPSILON; // guards against division by zero probabilities
    __m256d epsilon_f64x4 = _mm256_set1_pd(epsilon);
    __m256d sum_f64x4 = _mm256_setzero_pd();
    __m256d compensation_f64x4 = _mm256_setzero_pd(); // Kahan running error term
    __m256d a_f64x4, b_f64x4;

nk_jsd_f64_haswell_cycle:
    if (n < 4) {
        // Tail of fewer than 4 entries: masked partial load; padding lanes
        // presumably read as zero and contribute nothing to the sum.
        nk_b256_vec_t a_vec, b_vec;
        nk_partial_load_b64x4_haswell_(a, &a_vec, n);
        nk_partial_load_b64x4_haswell_(b, &b_vec, n);
        a_f64x4 = a_vec.ymm_pd;
        b_f64x4 = b_vec.ymm_pd;
        n = 0;
    }
    else {
        a_f64x4 = _mm256_loadu_pd(a);
        b_f64x4 = _mm256_loadu_pd(b);
        n -= 4, a += 4, b += 4;
    }
    __m256d mean_f64x4 = _mm256_mul_pd(_mm256_add_pd(a_f64x4, b_f64x4), _mm256_set1_pd(0.5)); // M = (P + Q) / 2
    __m256d ratio_a_f64x4 = _mm256_div_pd(_mm256_add_pd(a_f64x4, epsilon_f64x4),
                                          _mm256_add_pd(mean_f64x4, epsilon_f64x4));
    __m256d ratio_b_f64x4 = _mm256_div_pd(_mm256_add_pd(b_f64x4, epsilon_f64x4),
                                          _mm256_add_pd(mean_f64x4, epsilon_f64x4));
    __m256d log_ratio_a_f64x4 = nk_log2_f64x4_haswell_(ratio_a_f64x4);
    __m256d log_ratio_b_f64x4 = nk_log2_f64x4_haswell_(ratio_b_f64x4);
    __m256d contribution_a_f64x4 = _mm256_mul_pd(a_f64x4, log_ratio_a_f64x4);
    __m256d contribution_b_f64x4 = _mm256_mul_pd(b_f64x4, log_ratio_b_f64x4);
    // Kahan compensated summation for contribution a
    __m256d compensated_a_f64x4 = _mm256_sub_pd(contribution_a_f64x4, compensation_f64x4);
    __m256d tentative_a_f64x4 = _mm256_add_pd(sum_f64x4, compensated_a_f64x4);
    compensation_f64x4 = _mm256_sub_pd(_mm256_sub_pd(tentative_a_f64x4, sum_f64x4), compensated_a_f64x4);
    sum_f64x4 = tentative_a_f64x4;
    // Kahan compensated summation for contribution b
    __m256d compensated_b_f64x4 = _mm256_sub_pd(contribution_b_f64x4, compensation_f64x4);
    __m256d tentative_b_f64x4 = _mm256_add_pd(sum_f64x4, compensated_b_f64x4);
    compensation_f64x4 = _mm256_sub_pd(_mm256_sub_pd(tentative_b_f64x4, sum_f64x4), compensated_b_f64x4);
    sum_f64x4 = tentative_b_f64x4;
    if (n) goto nk_jsd_f64_haswell_cycle;

    // Reduce, convert base-2 logs to natural and halve, then take the square
    // root to obtain the metric (distance) form of the divergence.
    nk_f64_t log2_normalizer = 0.6931471805599453;
    nk_f64_t sum = nk_reduce_add_f64x4_haswell_(sum_f64x4);
    sum *= log2_normalizer / 2;
    *result = sum > 0 ? nk_f64_sqrt_haswell(sum) : 0;
}
|
|
254
|
+
|
|
255
|
+
#if defined(__clang__)
|
|
256
|
+
#pragma clang attribute pop
|
|
257
|
+
#elif defined(__GNUC__)
|
|
258
|
+
#pragma GCC pop_options
|
|
259
|
+
#endif
|
|
260
|
+
|
|
261
|
+
#if defined(__cplusplus)
|
|
262
|
+
} // extern "C"
|
|
263
|
+
#endif
|
|
264
|
+
|
|
265
|
+
#endif // NK_TARGET_HASWELL
|
|
266
|
+
#endif // NK_TARGET_X86_
|
|
267
|
+
#endif // NK_PROBABILITY_HASWELL_H
|